From a10b328dbc056aafaa696579f9a6e2b0cb8eb25f Mon Sep 17 00:00:00 2001
From: Sunitha Kambhampati
Date: Tue, 28 Nov 2017 22:01:01 +0100
Subject: [PATCH 001/356] [SPARK-22431][SQL] Ensure that the datatype in the
 schema for the table/view metadata is parseable by Spark before persisting it

## What changes were proposed in this pull request?

* JIRA: [SPARK-22431](https://issues.apache.org/jira/browse/SPARK-22431): Creating a permanent view with an illegal type

**Description:**
- It is possible in Spark SQL to create a permanent view that uses a nested field with an illegal name.
- For example, if we create the following view:
  ```create view x as select struct('a' as `$q`, 1 as b) q```
- A simple select fails with the following exception:
```
select * from x;
org.apache.spark.SparkException: Cannot recognize hive type string: struct<$q:string,b:int>
  at org.apache.spark.sql.hive.client.HiveClientImpl$.fromHiveColumn(HiveClientImpl.scala:812)
  at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getTableOption$1$$anonfun$apply$11$$anonfun$7.apply(HiveClientImpl.scala:378)
  at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$getTableOption$1$$anonfun$apply$11$$anonfun$7.apply(HiveClientImpl.scala:378)
  ...
```

**Issue/Analysis**: Right now, we can create a view with a schema that cannot be read back by Spark from the Hive metastore. For more details, please see the discussion about the analysis and the proposed fix options in comment 1 and comment 2 of [SPARK-22431](https://issues.apache.org/jira/browse/SPARK-22431).

**Proposed changes**:
- Fix the Hive table/view codepath to check whether the schema datatype is parseable by Spark before persisting it in the metastore. The change is localized to HiveClientImpl and performs a check similar to the one in `fromHiveColumn`. This fails fast, so we avoid the scenario where we write something to the metastore that we are unable to read back.
- Added new unit tests.
- Ran the SQL-related unit test suites (hive/test, sql/test, catalyst/test) successfully.

With the fix:
```
create view x as select struct('a' as `$q`, 1 as b) q;
17/11/28 10:44:55 ERROR SparkSQLDriver: Failed in [create view x as select struct('a' as `$q`, 1 as b) q]
org.apache.spark.SparkException: Cannot recognize hive type string: struct<$q:string,b:int>
  at org.apache.spark.sql.hive.client.HiveClientImpl$.org$apache$spark$sql$hive$client$HiveClientImpl$$getSparkSQLDataType(HiveClientImpl.scala:884)
  at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$org$apache$spark$sql$hive$client$HiveClientImpl$$verifyColumnDataType$1.apply(HiveClientImpl.scala:906)
  at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$org$apache$spark$sql$hive$client$HiveClientImpl$$verifyColumnDataType$1.apply(HiveClientImpl.scala:906)
  at scala.collection.Iterator$class.foreach(Iterator.scala:893)
  ...
```

## How was this patch tested?
- New unit tests have been added.

hvanhovell, please review and share your thoughts/comments. Thank you so much.

Author: Sunitha Kambhampati

Closes #19747 from skambha/spark22431.
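For context, the failure that the patch turns into a fail-fast error can be reproduced with the Catalyst parser alone. The following is a minimal sketch, not part of the patch, assuming only the `spark-catalyst` artifact on the classpath and a hypothetical demo object name; it shows that the Hive type string produced for a field named `$q` is rejected by `CatalystSqlParser.parseDataType`, which is the same parse step the new `verifyColumnDataType` check runs before persisting the schema:

```scala
import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}

// Hypothetical demo object, for illustration only.
object HiveTypeStringCheck {
  def main(args: Array[String]): Unit = {
    // Type string the metastore ends up with for: SELECT STRUCT('a' AS `$q`, 1 AS b) q
    val unparseable = "struct<$q:string,b:int>"
    // Type string for a legal field name, which round-trips fine.
    val parseable = "struct<q:string,b:int>"

    Seq(parseable, unparseable).foreach { typeString =>
      try {
        val dataType = CatalystSqlParser.parseDataType(typeString)
        println(s"OK: $typeString -> ${dataType.catalogString}")
      } catch {
        case _: ParseException =>
          // With the patch, this failure surfaces at CREATE/ALTER time rather than on read.
          println(s"Cannot recognize hive type string: $typeString")
      }
    }
  }
}
```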
--- .../sql/execution/command/DDLSuite.scala | 15 ++++ .../sql/hive/client/HiveClientImpl.scala | 17 +++- .../sql/hive/execution/HiveDDLSuite.scala | 82 +++++++++++++++++++ 3 files changed, 111 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 878f435c75cb3..fdb9b2f51f9cb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -117,6 +117,21 @@ class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSQLContext with Befo } } + test("SPARK-22431: table with nested type col with special char") { + withTable("t") { + spark.sql("CREATE TABLE t(q STRUCT<`$a`:INT, col2:STRING>, i1 INT) USING PARQUET") + checkAnswer(spark.table("t"), Nil) + } + } + + test("SPARK-22431: view with nested type") { + withView("t", "v") { + spark.sql("CREATE VIEW t AS SELECT STRUCT('a' AS `$a`, 1 AS b) q") + checkAnswer(spark.table("t"), Row(Row("a", 1)) :: Nil) + spark.sql("CREATE VIEW v AS SELECT STRUCT('a' AS `a`, 1 AS b) q") + checkAnswer(spark.table("t"), Row(Row("a", 1)) :: Nil) + } + } } abstract class DDLSuite extends QueryTest with SQLTestUtils { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index b5a5890d47b03..47ce6ba83866c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -488,6 +488,7 @@ private[hive] class HiveClientImpl( } override def createTable(table: CatalogTable, ignoreIfExists: Boolean): Unit = withHiveState { + verifyColumnDataType(table.dataSchema) client.createTable(toHiveTable(table, Some(userName)), ignoreIfExists) } @@ -507,6 +508,7 @@ private[hive] class HiveClientImpl( // these properties are still available to the others that share the same Hive metastore. // If users explicitly alter these Hive-specific properties through ALTER TABLE DDL, we respect // these user-specified values. + verifyColumnDataType(table.dataSchema) val hiveTable = toHiveTable( table.copy(properties = table.ignoredProperties ++ table.properties), Some(userName)) // Do not use `table.qualifiedName` here because this may be a rename @@ -520,6 +522,7 @@ private[hive] class HiveClientImpl( newDataSchema: StructType, schemaProps: Map[String, String]): Unit = withHiveState { val oldTable = client.getTable(dbName, tableName) + verifyColumnDataType(newDataSchema) val hiveCols = newDataSchema.map(toHiveColumn) oldTable.setFields(hiveCols.asJava) @@ -872,15 +875,19 @@ private[hive] object HiveClientImpl { new FieldSchema(c.name, typeString, c.getComment().orNull) } - /** Builds the native StructField from Hive's FieldSchema. */ - def fromHiveColumn(hc: FieldSchema): StructField = { - val columnType = try { + /** Get the Spark SQL native DataType from Hive's FieldSchema. */ + private def getSparkSQLDataType(hc: FieldSchema): DataType = { + try { CatalystSqlParser.parseDataType(hc.getType) } catch { case e: ParseException => throw new SparkException("Cannot recognize hive type string: " + hc.getType, e) } + } + /** Builds the native StructField from Hive's FieldSchema. 
*/ + def fromHiveColumn(hc: FieldSchema): StructField = { + val columnType = getSparkSQLDataType(hc) val metadata = if (hc.getType != columnType.catalogString) { new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build() } else { @@ -895,6 +902,10 @@ private[hive] object HiveClientImpl { Option(hc.getComment).map(field.withComment).getOrElse(field) } + private def verifyColumnDataType(schema: StructType): Unit = { + schema.foreach(col => getSparkSQLDataType(toHiveColumn(col))) + } + private def toInputFormat(name: String) = Utils.classForName(name).asInstanceOf[Class[_ <: org.apache.hadoop.mapred.InputFormat[_, _]]] diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index d3465a641a1a4..9063ef066aa84 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -174,6 +174,88 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA test("alter datasource table add columns - partitioned - orc") { testAddColumnPartitioned("orc") } + + test("SPARK-22431: illegal nested type") { + val queries = Seq( + "CREATE TABLE t AS SELECT STRUCT('a' AS `$a`, 1 AS b) q", + "CREATE TABLE t(q STRUCT<`$a`:INT, col2:STRING>, i1 INT)", + "CREATE VIEW t AS SELECT STRUCT('a' AS `$a`, 1 AS b) q") + + queries.foreach(query => { + val err = intercept[SparkException] { + spark.sql(query) + }.getMessage + assert(err.contains("Cannot recognize hive type string")) + }) + + withView("v") { + spark.sql("CREATE VIEW v AS SELECT STRUCT('a' AS `a`, 1 AS b) q") + checkAnswer(sql("SELECT q.`a`, q.b FROM v"), Row("a", 1) :: Nil) + + val err = intercept[SparkException] { + spark.sql("ALTER VIEW v AS SELECT STRUCT('a' AS `$a`, 1 AS b) q") + }.getMessage + assert(err.contains("Cannot recognize hive type string")) + } + } + + test("SPARK-22431: table with nested type") { + withTable("t", "x") { + spark.sql("CREATE TABLE t(q STRUCT<`$a`:INT, col2:STRING>, i1 INT) USING PARQUET") + checkAnswer(spark.table("t"), Nil) + spark.sql("CREATE TABLE x (q STRUCT, i1 INT)") + checkAnswer(spark.table("x"), Nil) + } + } + + test("SPARK-22431: view with nested type") { + withView("v") { + spark.sql("CREATE VIEW v AS SELECT STRUCT('a' AS `a`, 1 AS b) q") + checkAnswer(spark.table("v"), Row(Row("a", 1)) :: Nil) + + spark.sql("ALTER VIEW v AS SELECT STRUCT('a' AS `b`, 1 AS b) q1") + val df = spark.table("v") + assert("q1".equals(df.schema.fields(0).name)) + checkAnswer(df, Row(Row("a", 1)) :: Nil) + } + } + + test("SPARK-22431: alter table tests with nested types") { + withTable("t1", "t2", "t3") { + spark.sql("CREATE TABLE t1 (q STRUCT, i1 INT)") + spark.sql("ALTER TABLE t1 ADD COLUMNS (newcol1 STRUCT<`col1`:STRING, col2:Int>)") + val newcol = spark.sql("SELECT * FROM t1").schema.fields(2).name + assert("newcol1".equals(newcol)) + + spark.sql("CREATE TABLE t2(q STRUCT<`a`:INT, col2:STRING>, i1 INT) USING PARQUET") + spark.sql("ALTER TABLE t2 ADD COLUMNS (newcol1 STRUCT<`$col1`:STRING, col2:Int>)") + spark.sql("ALTER TABLE t2 ADD COLUMNS (newcol2 STRUCT<`col1`:STRING, col2:Int>)") + + val df2 = spark.table("t2") + checkAnswer(df2, Nil) + assert("newcol1".equals(df2.schema.fields(2).name)) + assert("newcol2".equals(df2.schema.fields(3).name)) + + spark.sql("CREATE TABLE t3(q STRUCT<`$a`:INT, col2:STRING>, i1 INT) USING PARQUET") + spark.sql("ALTER TABLE t3 ADD COLUMNS (newcol1 
STRUCT<`$col1`:STRING, col2:Int>)") + spark.sql("ALTER TABLE t3 ADD COLUMNS (newcol2 STRUCT<`col1`:STRING, col2:Int>)") + + val df3 = spark.table("t3") + checkAnswer(df3, Nil) + assert("newcol1".equals(df3.schema.fields(2).name)) + assert("newcol2".equals(df3.schema.fields(3).name)) + } + } + + test("SPARK-22431: negative alter table tests with nested types") { + withTable("t1") { + spark.sql("CREATE TABLE t1 (q STRUCT, i1 INT)") + val err = intercept[SparkException] { + spark.sql("ALTER TABLE t1 ADD COLUMNS (newcol1 STRUCT<`$col1`:STRING, col2:Int>)") + }.getMessage + assert(err.contains("Cannot recognize hive type string:")) + } + } } class HiveDDLSuite From 475a29f11ef488e7cb19bf7e0696d9d099d77c92 Mon Sep 17 00:00:00 2001 From: Herman van Hovell Date: Tue, 28 Nov 2017 16:03:47 -0800 Subject: [PATCH 002/356] [SPARK-22637][SQL] Only refresh a logical plan once. ## What changes were proposed in this pull request? `CatalogImpl.refreshTable` uses `foreach(..)` to refresh all tables in a view. This traverses all nodes in the subtree and calls `LogicalPlan.refresh()` on these nodes. However `LogicalPlan.refresh()` is also refreshing its children, as a result refreshing a large view can be quite expensive. This PR just calls `LogicalPlan.refresh()` on the top node. ## How was this patch tested? Existing tests. Author: Herman van Hovell Closes #19837 from hvanhovell/SPARK-22637. --- .../main/scala/org/apache/spark/sql/internal/CatalogImpl.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala index fdd25330c5e67..6ae307bce10c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala @@ -480,7 +480,7 @@ class CatalogImpl(sparkSession: SparkSession) extends Catalog { if (tableMetadata.tableType == CatalogTableType.VIEW) { // Temp or persistent views: refresh (or invalidate) any metadata/data cached // in the plan recursively. - table.queryExecution.analyzed.foreach(_.refresh()) + table.queryExecution.analyzed.refresh() } else { // Non-temp tables: refresh the metadata cache. sessionCatalog.refreshTable(tableIdent) From e9b2070ab2d04993b1c0c1d6c6aba249e6664c8d Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Tue, 28 Nov 2017 23:02:09 -0800 Subject: [PATCH 003/356] [SPARK-18278][SCHEDULER] Spark on Kubernetes - Basic Scheduler Backend ## What changes were proposed in this pull request? This is a stripped down version of the `KubernetesClusterSchedulerBackend` for Spark with the following components: - Static Allocation of Executors - Executor Pod Factory - Executor Recovery Semantics It's step 1 from the step-wise plan documented [here](https://github.com/apache-spark-on-k8s/spark/issues/441#issuecomment-330802935). This addition is covered by the [SPIP vote](http://apache-spark-developers-list.1001551.n3.nabble.com/SPIP-Spark-on-Kubernetes-td22147.html) which passed on Aug 31 . ## How was this patch tested? - The patch contains unit tests which are passing. - Manual testing: `./build/mvn -Pkubernetes clean package` succeeded. - It is a **subset** of the entire changelist hosted in http://github.com/apache-spark-on-k8s/spark which is in active use in several organizations. 
- There is integration testing enabled in the fork currently [hosted by PepperData](spark-k8s-jenkins.pepperdata.org:8080) which is being moved over to RiseLAB CI. - Detailed documentation on trying out the patch in its entirety is in: https://apache-spark-on-k8s.github.io/userdocs/running-on-kubernetes.html cc rxin felixcheung mateiz (shepherd) k8s-big-data SIG members & contributors: mccheah ash211 ssuchter varunkatta kimoonkim erikerlandson liyinan926 tnachen ifilonenko Author: Yinan Li Author: foxish Author: mcheah Closes #19468 from foxish/spark-kubernetes-3. --- .travis.yml | 2 +- NOTICE | 6 + .../cluster/SchedulerBackendUtils.scala | 47 ++ dev/sparktestsupport/modules.py | 8 + docs/configuration.md | 4 +- pom.xml | 7 + project/SparkBuild.scala | 8 +- resource-managers/kubernetes/core/pom.xml | 100 ++++ .../org/apache/spark/deploy/k8s/Config.scala | 123 +++++ .../spark/deploy/k8s/ConfigurationUtils.scala | 41 ++ .../apache/spark/deploy/k8s/Constants.scala | 50 ++ .../k8s/SparkKubernetesClientFactory.scala | 102 ++++ .../cluster/k8s/ExecutorPodFactory.scala | 219 +++++++++ .../k8s/KubernetesClusterManager.scala | 70 +++ .../KubernetesClusterSchedulerBackend.scala | 442 ++++++++++++++++++ .../core/src/test/resources/log4j.properties | 31 ++ .../cluster/k8s/ExecutorPodFactorySuite.scala | 135 ++++++ ...bernetesClusterSchedulerBackendSuite.scala | 440 +++++++++++++++++ .../spark/deploy/yarn/YarnAllocator.scala | 3 +- .../deploy/yarn/YarnSparkHadoopUtil.scala | 24 - .../cluster/YarnClientSchedulerBackend.scala | 2 +- .../cluster/YarnClusterSchedulerBackend.scala | 2 +- 22 files changed, 1832 insertions(+), 34 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerBackendUtils.scala create mode 100644 resource-managers/kubernetes/core/pom.xml create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/ConfigurationUtils.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala create mode 100644 resource-managers/kubernetes/core/src/test/resources/log4j.properties create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala diff --git a/.travis.yml b/.travis.yml index d7e9f8c0290e8..05b94adeeb93b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,7 +43,7 @@ notifications: # 5. Run maven install before running lint-java. install: - export MAVEN_SKIP_RC=1 - - build/mvn -T 4 -q -DskipTests -Pmesos -Pyarn -Pkinesis-asl -Phive -Phive-thriftserver install + - build/mvn -T 4 -q -DskipTests -Pkubernetes -Pmesos -Pyarn -Pkinesis-asl -Phive -Phive-thriftserver install # 6. Run lint-java. 
script: diff --git a/NOTICE b/NOTICE index f4b64b5c3f470..6ec240efbf12e 100644 --- a/NOTICE +++ b/NOTICE @@ -448,6 +448,12 @@ Copyright (C) 2011 Google Inc. Apache Commons Pool Copyright 1999-2009 The Apache Software Foundation +This product includes/uses Kubernetes & OpenShift 3 Java Client (https://github.com/fabric8io/kubernetes-client) +Copyright (C) 2015 Red Hat, Inc. + +This product includes/uses OkHttp (https://github.com/square/okhttp) +Copyright (C) 2012 The Android Open Source Project + ========================================================================= == NOTICE file corresponding to section 4(d) of the Apache License, == == Version 2.0, in this case for the DataNucleus distribution. == diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerBackendUtils.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerBackendUtils.scala new file mode 100644 index 0000000000000..c166d030f2c89 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SchedulerBackendUtils.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler.cluster + +import org.apache.spark.SparkConf +import org.apache.spark.internal.config.{DYN_ALLOCATION_MAX_EXECUTORS, DYN_ALLOCATION_MIN_EXECUTORS, EXECUTOR_INSTANCES} +import org.apache.spark.util.Utils + +private[spark] object SchedulerBackendUtils { + val DEFAULT_NUMBER_EXECUTORS = 2 + + /** + * Getting the initial target number of executors depends on whether dynamic allocation is + * enabled. + * If not using dynamic allocation it gets the number of executors requested by the user. 
+ */ + def getInitialTargetExecutorNumber( + conf: SparkConf, + numExecutors: Int = DEFAULT_NUMBER_EXECUTORS): Int = { + if (Utils.isDynamicAllocationEnabled(conf)) { + val minNumExecutors = conf.get(DYN_ALLOCATION_MIN_EXECUTORS) + val initialNumExecutors = Utils.getDynamicAllocationInitialExecutors(conf) + val maxNumExecutors = conf.get(DYN_ALLOCATION_MAX_EXECUTORS) + require(initialNumExecutors >= minNumExecutors && initialNumExecutors <= maxNumExecutors, + s"initial executor number $initialNumExecutors must between min executor number " + + s"$minNumExecutors and max executor number $maxNumExecutors") + + initialNumExecutors + } else { + conf.get(EXECUTOR_INSTANCES).getOrElse(numExecutors) + } + } +} diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index dacc89f5f27d4..44f990ec8a5ac 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -532,6 +532,14 @@ def __hash__(self): sbt_test_goals=["mesos/test"] ) +kubernetes = Module( + name="kubernetes", + dependencies=[], + source_file_regexes=["resource-managers/kubernetes/core"], + build_profile_flags=["-Pkubernetes"], + sbt_test_goals=["kubernetes/test"] +) + # The root module is a dummy module which is used to run all of the tests. # No other modules should directly depend on this module. root = Module( diff --git a/docs/configuration.md b/docs/configuration.md index 9b9583d9165ef..e42f866c40566 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1438,10 +1438,10 @@ Apart from these, the following properties are also available, and may be useful spark.scheduler.minRegisteredResourcesRatio - 0.8 for YARN mode; 0.0 for standalone mode and Mesos coarse-grained mode + 0.8 for KUBERNETES mode; 0.8 for YARN mode; 0.0 for standalone mode and Mesos coarse-grained mode The minimum ratio of registered resources (registered resources / total expected resources) - (resources are executors in yarn mode, CPU cores in standalone mode and Mesos coarsed-grained + (resources are executors in yarn mode and Kubernetes mode, CPU cores in standalone mode and Mesos coarsed-grained mode ['spark.cores.max' value is total expected resources for Mesos coarse-grained mode] ) to wait for before scheduling begins. Specified as a double between 0.0 and 1.0. 
Regardless of whether the minimum ratio of resources has been reached, diff --git a/pom.xml b/pom.xml index 3b2c629f8ec30..7bc66e7d19540 100644 --- a/pom.xml +++ b/pom.xml @@ -2664,6 +2664,13 @@ + + kubernetes + + resource-managers/kubernetes/core + + + hive-thriftserver diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index c726ec25478ab..75703380cdb4a 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -53,11 +53,11 @@ object BuildCommons { "tags", "sketch", "kvstore" ).map(ProjectRef(buildLocation, _)) ++ sqlProjects ++ streamingProjects - val optionallyEnabledProjects@Seq(mesos, yarn, + val optionallyEnabledProjects@Seq(kubernetes, mesos, yarn, streamingFlumeSink, streamingFlume, streamingKafka, sparkGangliaLgpl, streamingKinesisAsl, dockerIntegrationTests, hadoopCloud) = - Seq("mesos", "yarn", + Seq("kubernetes", "mesos", "yarn", "streaming-flume-sink", "streaming-flume", "streaming-kafka-0-8", "ganglia-lgpl", "streaming-kinesis-asl", "docker-integration-tests", "hadoop-cloud").map(ProjectRef(buildLocation, _)) @@ -671,9 +671,9 @@ object Unidoc { publish := {}, unidocProjectFilter in(ScalaUnidoc, unidoc) := - inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, yarn, tags, streamingKafka010, sqlKafka010), + inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, kubernetes, yarn, tags, streamingKafka010, sqlKafka010), unidocProjectFilter in(JavaUnidoc, unidoc) := - inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, yarn, tags, streamingKafka010, sqlKafka010), + inAnyProject -- inProjects(OldDeps.project, repl, examples, tools, streamingFlumeSink, kubernetes, yarn, tags, streamingKafka010, sqlKafka010), unidocAllClasspaths in (ScalaUnidoc, unidoc) := { ignoreClasspaths((unidocAllClasspaths in (ScalaUnidoc, unidoc)).value) diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml new file mode 100644 index 0000000000000..7d35aea8a4142 --- /dev/null +++ b/resource-managers/kubernetes/core/pom.xml @@ -0,0 +1,100 @@ + + + + 4.0.0 + + org.apache.spark + spark-parent_2.11 + 2.3.0-SNAPSHOT + ../../../pom.xml + + + spark-kubernetes_2.11 + jar + Spark Project Kubernetes + + kubernetes + 3.0.0 + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + + io.fabric8 + kubernetes-client + ${kubernetes.client.version} + + + com.fasterxml.jackson.core + * + + + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml + + + + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml + ${fasterxml.jackson.version} + + + + + com.google.guava + guava + + + + + org.mockito + mockito-core + test + + + + com.squareup.okhttp3 + okhttp + 3.8.1 + + + + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala new file mode 100644 index 0000000000000..f0742b91987b6 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s + +import org.apache.spark.internal.Logging +import org.apache.spark.internal.config.ConfigBuilder +import org.apache.spark.network.util.ByteUnit + +private[spark] object Config extends Logging { + + val KUBERNETES_NAMESPACE = + ConfigBuilder("spark.kubernetes.namespace") + .doc("The namespace that will be used for running the driver and executor pods. When using " + + "spark-submit in cluster mode, this can also be passed to spark-submit via the " + + "--kubernetes-namespace command line argument.") + .stringConf + .createWithDefault("default") + + val EXECUTOR_DOCKER_IMAGE = + ConfigBuilder("spark.kubernetes.executor.docker.image") + .doc("Docker image to use for the executors. Specify this using the standard Docker tag " + + "format.") + .stringConf + .createOptional + + val DOCKER_IMAGE_PULL_POLICY = + ConfigBuilder("spark.kubernetes.docker.image.pullPolicy") + .doc("Kubernetes image pull policy. Valid values are Always, Never, and IfNotPresent.") + .stringConf + .checkValues(Set("Always", "Never", "IfNotPresent")) + .createWithDefault("IfNotPresent") + + val APISERVER_AUTH_DRIVER_CONF_PREFIX = + "spark.kubernetes.authenticate.driver" + val APISERVER_AUTH_DRIVER_MOUNTED_CONF_PREFIX = + "spark.kubernetes.authenticate.driver.mounted" + val OAUTH_TOKEN_CONF_SUFFIX = "oauthToken" + val OAUTH_TOKEN_FILE_CONF_SUFFIX = "oauthTokenFile" + val CLIENT_KEY_FILE_CONF_SUFFIX = "clientKeyFile" + val CLIENT_CERT_FILE_CONF_SUFFIX = "clientCertFile" + val CA_CERT_FILE_CONF_SUFFIX = "caCertFile" + + val KUBERNETES_SERVICE_ACCOUNT_NAME = + ConfigBuilder(s"$APISERVER_AUTH_DRIVER_CONF_PREFIX.serviceAccountName") + .doc("Service account that is used when running the driver pod. The driver pod uses " + + "this service account when requesting executor pods from the API server. If specific " + + "credentials are given for the driver pod to use, the driver will favor " + + "using those credentials instead.") + .stringConf + .createOptional + + // Note that while we set a default for this when we start up the + // scheduler, the specific default value is dynamically determined + // based on the executor memory. + val KUBERNETES_EXECUTOR_MEMORY_OVERHEAD = + ConfigBuilder("spark.kubernetes.executor.memoryOverhead") + .doc("The amount of off-heap memory (in megabytes) to be allocated per executor. This " + + "is memory that accounts for things like VM overheads, interned strings, other native " + + "overheads, etc. This tends to grow with the executor size. (typically 6-10%).") + .bytesConf(ByteUnit.MiB) + .createOptional + + val KUBERNETES_EXECUTOR_LABEL_PREFIX = "spark.kubernetes.executor.label." + val KUBERNETES_EXECUTOR_ANNOTATION_PREFIX = "spark.kubernetes.executor.annotation." 
+ + val KUBERNETES_DRIVER_POD_NAME = + ConfigBuilder("spark.kubernetes.driver.pod.name") + .doc("Name of the driver pod.") + .stringConf + .createOptional + + val KUBERNETES_EXECUTOR_POD_NAME_PREFIX = + ConfigBuilder("spark.kubernetes.executor.podNamePrefix") + .doc("Prefix to use in front of the executor pod names.") + .internal() + .stringConf + .createWithDefault("spark") + + val KUBERNETES_ALLOCATION_BATCH_SIZE = + ConfigBuilder("spark.kubernetes.allocation.batch.size") + .doc("Number of pods to launch at once in each round of executor allocation.") + .intConf + .checkValue(value => value > 0, "Allocation batch size should be a positive integer") + .createWithDefault(5) + + val KUBERNETES_ALLOCATION_BATCH_DELAY = + ConfigBuilder("spark.kubernetes.allocation.batch.delay") + .doc("Number of seconds to wait between each round of executor allocation.") + .longConf + .checkValue(value => value > 0, "Allocation batch delay should be a positive integer") + .createWithDefault(1) + + val KUBERNETES_EXECUTOR_LIMIT_CORES = + ConfigBuilder("spark.kubernetes.executor.limit.cores") + .doc("Specify the hard cpu limit for a single executor pod") + .stringConf + .createOptional + + val KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS = + ConfigBuilder("spark.kubernetes.executor.lostCheck.maxAttempts") + .doc("Maximum number of attempts allowed for checking the reason of an executor loss " + + "before it is assumed that the executor failed.") + .intConf + .checkValue(value => value > 0, "Maximum attempts of checks of executor lost reason " + + "must be a positive integer") + .createWithDefault(10) + + val KUBERNETES_NODE_SELECTOR_PREFIX = "spark.kubernetes.node.selector." +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/ConfigurationUtils.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/ConfigurationUtils.scala new file mode 100644 index 0000000000000..01717479fddd9 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/ConfigurationUtils.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.k8s + +import org.apache.spark.SparkConf + +private[spark] object ConfigurationUtils { + + /** + * Extract and parse Spark configuration properties with a given name prefix and + * return the result as a Map. Keys must not have more than one value. 
+ * + * @param sparkConf Spark configuration + * @param prefix the given property name prefix + * @return a Map storing the configuration property keys and values + */ + def parsePrefixedKeyValuePairs( + sparkConf: SparkConf, + prefix: String): Map[String, String] = { + sparkConf.getAllWithPrefix(prefix).toMap + } + + def requireNandDefined(opt1: Option[_], opt2: Option[_], errMessage: String): Unit = { + opt1.foreach { _ => require(opt2.isEmpty, errMessage) } + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala new file mode 100644 index 0000000000000..4ddeefb15a89d --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s + +private[spark] object Constants { + + // Labels + val SPARK_APP_ID_LABEL = "spark-app-selector" + val SPARK_EXECUTOR_ID_LABEL = "spark-exec-id" + val SPARK_ROLE_LABEL = "spark-role" + val SPARK_POD_DRIVER_ROLE = "driver" + val SPARK_POD_EXECUTOR_ROLE = "executor" + + // Default and fixed ports + val DEFAULT_DRIVER_PORT = 7078 + val DEFAULT_BLOCKMANAGER_PORT = 7079 + val BLOCK_MANAGER_PORT_NAME = "blockmanager" + val EXECUTOR_PORT_NAME = "executor" + + // Environment Variables + val ENV_EXECUTOR_PORT = "SPARK_EXECUTOR_PORT" + val ENV_DRIVER_URL = "SPARK_DRIVER_URL" + val ENV_EXECUTOR_CORES = "SPARK_EXECUTOR_CORES" + val ENV_EXECUTOR_MEMORY = "SPARK_EXECUTOR_MEMORY" + val ENV_APPLICATION_ID = "SPARK_APPLICATION_ID" + val ENV_EXECUTOR_ID = "SPARK_EXECUTOR_ID" + val ENV_EXECUTOR_POD_IP = "SPARK_EXECUTOR_POD_IP" + val ENV_EXECUTOR_EXTRA_CLASSPATH = "SPARK_EXECUTOR_EXTRA_CLASSPATH" + val ENV_MOUNTED_CLASSPATH = "SPARK_MOUNTED_CLASSPATH" + val ENV_JAVA_OPT_PREFIX = "SPARK_JAVA_OPT_" + + // Miscellaneous + val KUBERNETES_MASTER_INTERNAL_URL = "https://kubernetes.default.svc" + val MEMORY_OVERHEAD_FACTOR = 0.10 + val MEMORY_OVERHEAD_MIN_MIB = 384L +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala new file mode 100644 index 0000000000000..1e3f055e05766 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s + +import java.io.File + +import com.google.common.base.Charsets +import com.google.common.io.Files +import io.fabric8.kubernetes.client.{ConfigBuilder, DefaultKubernetesClient, KubernetesClient} +import io.fabric8.kubernetes.client.utils.HttpClientUtils +import okhttp3.Dispatcher + +import org.apache.spark.SparkConf +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.util.ThreadUtils + +/** + * Spark-opinionated builder for Kubernetes clients. It uses a prefix plus common suffixes to + * parse configuration keys, similar to the manner in which Spark's SecurityManager parses SSL + * options for different components. + */ +private[spark] object SparkKubernetesClientFactory { + + def createKubernetesClient( + master: String, + namespace: Option[String], + kubernetesAuthConfPrefix: String, + sparkConf: SparkConf, + defaultServiceAccountToken: Option[File], + defaultServiceAccountCaCert: Option[File]): KubernetesClient = { + val oauthTokenFileConf = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_FILE_CONF_SUFFIX" + val oauthTokenConf = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_CONF_SUFFIX" + val oauthTokenFile = sparkConf.getOption(oauthTokenFileConf) + .map(new File(_)) + .orElse(defaultServiceAccountToken) + val oauthTokenValue = sparkConf.getOption(oauthTokenConf) + ConfigurationUtils.requireNandDefined( + oauthTokenFile, + oauthTokenValue, + s"Cannot specify OAuth token through both a file $oauthTokenFileConf and a " + + s"value $oauthTokenConf.") + + val caCertFile = sparkConf + .getOption(s"$kubernetesAuthConfPrefix.$CA_CERT_FILE_CONF_SUFFIX") + .orElse(defaultServiceAccountCaCert.map(_.getAbsolutePath)) + val clientKeyFile = sparkConf + .getOption(s"$kubernetesAuthConfPrefix.$CLIENT_KEY_FILE_CONF_SUFFIX") + val clientCertFile = sparkConf + .getOption(s"$kubernetesAuthConfPrefix.$CLIENT_CERT_FILE_CONF_SUFFIX") + val dispatcher = new Dispatcher( + ThreadUtils.newDaemonCachedThreadPool("kubernetes-dispatcher")) + val config = new ConfigBuilder() + .withApiVersion("v1") + .withMasterUrl(master) + .withWebsocketPingInterval(0) + .withOption(oauthTokenValue) { + (token, configBuilder) => configBuilder.withOauthToken(token) + }.withOption(oauthTokenFile) { + (file, configBuilder) => + configBuilder.withOauthToken(Files.toString(file, Charsets.UTF_8)) + }.withOption(caCertFile) { + (file, configBuilder) => configBuilder.withCaCertFile(file) + }.withOption(clientKeyFile) { + (file, configBuilder) => configBuilder.withClientKeyFile(file) + }.withOption(clientCertFile) { + (file, configBuilder) => configBuilder.withClientCertFile(file) + }.withOption(namespace) { + (ns, configBuilder) => configBuilder.withNamespace(ns) + }.build() + val baseHttpClient = HttpClientUtils.createHttpClient(config) + val httpClientWithCustomDispatcher = baseHttpClient.newBuilder() + .dispatcher(dispatcher) + .build() + new DefaultKubernetesClient(httpClientWithCustomDispatcher, config) + } + 
+ private implicit class OptionConfigurableConfigBuilder(val configBuilder: ConfigBuilder) + extends AnyVal { + + def withOption[T] + (option: Option[T]) + (configurator: ((T, ConfigBuilder) => ConfigBuilder)): ConfigBuilder = { + option.map { opt => + configurator(opt, configBuilder) + }.getOrElse(configBuilder) + } + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala new file mode 100644 index 0000000000000..f79155b117b67 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler.cluster.k8s + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model._ + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.ConfigurationUtils +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.util.Utils + +/** + * A factory class for configuring and creating executor pods. + */ +private[spark] trait ExecutorPodFactory { + + /** + * Configure and construct an executor pod with the given parameters. 
+ */ + def createExecutorPod( + executorId: String, + applicationId: String, + driverUrl: String, + executorEnvs: Seq[(String, String)], + driverPod: Pod, + nodeToLocalTaskCount: Map[String, Int]): Pod +} + +private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) + extends ExecutorPodFactory { + + private val executorExtraClasspath = + sparkConf.get(org.apache.spark.internal.config.EXECUTOR_CLASS_PATH) + + private val executorLabels = ConfigurationUtils.parsePrefixedKeyValuePairs( + sparkConf, + KUBERNETES_EXECUTOR_LABEL_PREFIX) + require( + !executorLabels.contains(SPARK_APP_ID_LABEL), + s"Custom executor labels cannot contain $SPARK_APP_ID_LABEL as it is reserved for Spark.") + require( + !executorLabels.contains(SPARK_EXECUTOR_ID_LABEL), + s"Custom executor labels cannot contain $SPARK_EXECUTOR_ID_LABEL as it is reserved for" + + " Spark.") + require( + !executorLabels.contains(SPARK_ROLE_LABEL), + s"Custom executor labels cannot contain $SPARK_ROLE_LABEL as it is reserved for Spark.") + + private val executorAnnotations = + ConfigurationUtils.parsePrefixedKeyValuePairs( + sparkConf, + KUBERNETES_EXECUTOR_ANNOTATION_PREFIX) + private val nodeSelector = + ConfigurationUtils.parsePrefixedKeyValuePairs( + sparkConf, + KUBERNETES_NODE_SELECTOR_PREFIX) + + private val executorDockerImage = sparkConf + .get(EXECUTOR_DOCKER_IMAGE) + .getOrElse(throw new SparkException("Must specify the executor Docker image")) + private val dockerImagePullPolicy = sparkConf.get(DOCKER_IMAGE_PULL_POLICY) + private val blockManagerPort = sparkConf + .getInt("spark.blockmanager.port", DEFAULT_BLOCKMANAGER_PORT) + + private val executorPodNamePrefix = sparkConf.get(KUBERNETES_EXECUTOR_POD_NAME_PREFIX) + + private val executorMemoryMiB = sparkConf.get(org.apache.spark.internal.config.EXECUTOR_MEMORY) + private val executorMemoryString = sparkConf.get( + org.apache.spark.internal.config.EXECUTOR_MEMORY.key, + org.apache.spark.internal.config.EXECUTOR_MEMORY.defaultValueString) + + private val memoryOverheadMiB = sparkConf + .get(KUBERNETES_EXECUTOR_MEMORY_OVERHEAD) + .getOrElse(math.max((MEMORY_OVERHEAD_FACTOR * executorMemoryMiB).toInt, + MEMORY_OVERHEAD_MIN_MIB)) + private val executorMemoryWithOverhead = executorMemoryMiB + memoryOverheadMiB + + private val executorCores = sparkConf.getDouble("spark.executor.cores", 1) + private val executorLimitCores = sparkConf.get(KUBERNETES_EXECUTOR_LIMIT_CORES) + + override def createExecutorPod( + executorId: String, + applicationId: String, + driverUrl: String, + executorEnvs: Seq[(String, String)], + driverPod: Pod, + nodeToLocalTaskCount: Map[String, Int]): Pod = { + val name = s"$executorPodNamePrefix-exec-$executorId" + + // hostname must be no longer than 63 characters, so take the last 63 characters of the pod + // name as the hostname. 
This preserves uniqueness since the end of name contains + // executorId + val hostname = name.substring(Math.max(0, name.length - 63)) + val resolvedExecutorLabels = Map( + SPARK_EXECUTOR_ID_LABEL -> executorId, + SPARK_APP_ID_LABEL -> applicationId, + SPARK_ROLE_LABEL -> SPARK_POD_EXECUTOR_ROLE) ++ + executorLabels + val executorMemoryQuantity = new QuantityBuilder(false) + .withAmount(s"${executorMemoryMiB}Mi") + .build() + val executorMemoryLimitQuantity = new QuantityBuilder(false) + .withAmount(s"${executorMemoryWithOverhead}Mi") + .build() + val executorCpuQuantity = new QuantityBuilder(false) + .withAmount(executorCores.toString) + .build() + val executorExtraClasspathEnv = executorExtraClasspath.map { cp => + new EnvVarBuilder() + .withName(ENV_EXECUTOR_EXTRA_CLASSPATH) + .withValue(cp) + .build() + } + val executorExtraJavaOptionsEnv = sparkConf + .get(org.apache.spark.internal.config.EXECUTOR_JAVA_OPTIONS) + .map { opts => + val delimitedOpts = Utils.splitCommandString(opts) + delimitedOpts.zipWithIndex.map { + case (opt, index) => + new EnvVarBuilder().withName(s"$ENV_JAVA_OPT_PREFIX$index").withValue(opt).build() + } + }.getOrElse(Seq.empty[EnvVar]) + val executorEnv = (Seq( + (ENV_DRIVER_URL, driverUrl), + // Executor backend expects integral value for executor cores, so round it up to an int. + (ENV_EXECUTOR_CORES, math.ceil(executorCores).toInt.toString), + (ENV_EXECUTOR_MEMORY, executorMemoryString), + (ENV_APPLICATION_ID, applicationId), + (ENV_EXECUTOR_ID, executorId)) ++ executorEnvs) + .map(env => new EnvVarBuilder() + .withName(env._1) + .withValue(env._2) + .build() + ) ++ Seq( + new EnvVarBuilder() + .withName(ENV_EXECUTOR_POD_IP) + .withValueFrom(new EnvVarSourceBuilder() + .withNewFieldRef("v1", "status.podIP") + .build()) + .build() + ) ++ executorExtraJavaOptionsEnv ++ executorExtraClasspathEnv.toSeq + val requiredPorts = Seq( + (BLOCK_MANAGER_PORT_NAME, blockManagerPort)) + .map { case (name, port) => + new ContainerPortBuilder() + .withName(name) + .withContainerPort(port) + .build() + } + + val executorContainer = new ContainerBuilder() + .withName("executor") + .withImage(executorDockerImage) + .withImagePullPolicy(dockerImagePullPolicy) + .withNewResources() + .addToRequests("memory", executorMemoryQuantity) + .addToLimits("memory", executorMemoryLimitQuantity) + .addToRequests("cpu", executorCpuQuantity) + .endResources() + .addAllToEnv(executorEnv.asJava) + .withPorts(requiredPorts.asJava) + .build() + + val executorPod = new PodBuilder() + .withNewMetadata() + .withName(name) + .withLabels(resolvedExecutorLabels.asJava) + .withAnnotations(executorAnnotations.asJava) + .withOwnerReferences() + .addNewOwnerReference() + .withController(true) + .withApiVersion(driverPod.getApiVersion) + .withKind(driverPod.getKind) + .withName(driverPod.getMetadata.getName) + .withUid(driverPod.getMetadata.getUid) + .endOwnerReference() + .endMetadata() + .withNewSpec() + .withHostname(hostname) + .withRestartPolicy("Never") + .withNodeSelector(nodeSelector.asJava) + .endSpec() + .build() + + val containerWithExecutorLimitCores = executorLimitCores.map { limitCores => + val executorCpuLimitQuantity = new QuantityBuilder(false) + .withAmount(limitCores) + .build() + new ContainerBuilder(executorContainer) + .editResources() + .addToLimits("cpu", executorCpuLimitQuantity) + .endResources() + .build() + }.getOrElse(executorContainer) + + new PodBuilder(executorPod) + .editSpec() + .addToContainers(containerWithExecutorLimitCores) + .endSpec() + .build() + } +} diff --git 
a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala new file mode 100644 index 0000000000000..68ca6a7622171 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler.cluster.k8s + +import java.io.File + +import io.fabric8.kubernetes.client.Config + +import org.apache.spark.SparkContext +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.SparkKubernetesClientFactory +import org.apache.spark.internal.Logging +import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} +import org.apache.spark.util.ThreadUtils + +private[spark] class KubernetesClusterManager extends ExternalClusterManager with Logging { + + override def canCreate(masterURL: String): Boolean = masterURL.startsWith("k8s") + + override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { + new TaskSchedulerImpl(sc) + } + + override def createSchedulerBackend( + sc: SparkContext, + masterURL: String, + scheduler: TaskScheduler): SchedulerBackend = { + val sparkConf = sc.getConf + + val kubernetesClient = SparkKubernetesClientFactory.createKubernetesClient( + KUBERNETES_MASTER_INTERNAL_URL, + Some(sparkConf.get(KUBERNETES_NAMESPACE)), + APISERVER_AUTH_DRIVER_MOUNTED_CONF_PREFIX, + sparkConf, + Some(new File(Config.KUBERNETES_SERVICE_ACCOUNT_TOKEN_PATH)), + Some(new File(Config.KUBERNETES_SERVICE_ACCOUNT_CA_CRT_PATH))) + + val executorPodFactory = new ExecutorPodFactoryImpl(sparkConf) + val allocatorExecutor = ThreadUtils + .newDaemonSingleThreadScheduledExecutor("kubernetes-pod-allocator") + val requestExecutorsService = ThreadUtils.newDaemonCachedThreadPool( + "kubernetes-executor-requests") + new KubernetesClusterSchedulerBackend( + scheduler.asInstanceOf[TaskSchedulerImpl], + sc.env.rpcEnv, + executorPodFactory, + kubernetesClient, + allocatorExecutor, + requestExecutorsService) + } + + override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { + scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala new file mode 100644 index 0000000000000..e79c987852db2 
--- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala @@ -0,0 +1,442 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler.cluster.k8s + +import java.io.Closeable +import java.net.InetAddress +import java.util.concurrent.{ConcurrentHashMap, ExecutorService, ScheduledExecutorService, TimeUnit} +import java.util.concurrent.atomic.{AtomicInteger, AtomicLong, AtomicReference} +import javax.annotation.concurrent.GuardedBy + +import io.fabric8.kubernetes.api.model._ +import io.fabric8.kubernetes.client.{KubernetesClient, KubernetesClientException, Watcher} +import io.fabric8.kubernetes.client.Watcher.Action +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.concurrent.{ExecutionContext, Future} + +import org.apache.spark.SparkException +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.rpc.{RpcAddress, RpcEndpointAddress, RpcEnv} +import org.apache.spark.scheduler.{ExecutorExited, SlaveLost, TaskSchedulerImpl} +import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, SchedulerBackendUtils} +import org.apache.spark.util.Utils + +private[spark] class KubernetesClusterSchedulerBackend( + scheduler: TaskSchedulerImpl, + rpcEnv: RpcEnv, + executorPodFactory: ExecutorPodFactory, + kubernetesClient: KubernetesClient, + allocatorExecutor: ScheduledExecutorService, + requestExecutorsService: ExecutorService) + extends CoarseGrainedSchedulerBackend(scheduler, rpcEnv) { + + import KubernetesClusterSchedulerBackend._ + + private val EXECUTOR_ID_COUNTER = new AtomicLong(0L) + private val RUNNING_EXECUTOR_PODS_LOCK = new Object + @GuardedBy("RUNNING_EXECUTOR_PODS_LOCK") + private val runningExecutorsToPods = new mutable.HashMap[String, Pod] + private val executorPodsByIPs = new ConcurrentHashMap[String, Pod]() + private val podsWithKnownExitReasons = new ConcurrentHashMap[String, ExecutorExited]() + private val disconnectedPodsByExecutorIdPendingRemoval = new ConcurrentHashMap[String, Pod]() + + private val kubernetesNamespace = conf.get(KUBERNETES_NAMESPACE) + + private val kubernetesDriverPodName = conf + .get(KUBERNETES_DRIVER_POD_NAME) + .getOrElse(throw new SparkException("Must specify the driver pod name")) + private implicit val requestExecutorContext = ExecutionContext.fromExecutorService( + requestExecutorsService) + + private val driverPod = kubernetesClient.pods() + .inNamespace(kubernetesNamespace) + .withName(kubernetesDriverPodName) + .get() + + protected override val minRegisteredRatio = + if (conf.getOption("spark.scheduler.minRegisteredResourcesRatio").isEmpty) { + 0.8 + } else { + 
super.minRegisteredRatio + } + + private val executorWatchResource = new AtomicReference[Closeable] + private val totalExpectedExecutors = new AtomicInteger(0) + + private val driverUrl = RpcEndpointAddress( + conf.get("spark.driver.host"), + conf.getInt("spark.driver.port", DEFAULT_DRIVER_PORT), + CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString + + private val initialExecutors = SchedulerBackendUtils.getInitialTargetExecutorNumber(conf) + + private val podAllocationInterval = conf.get(KUBERNETES_ALLOCATION_BATCH_DELAY) + + private val podAllocationSize = conf.get(KUBERNETES_ALLOCATION_BATCH_SIZE) + + private val executorLostReasonCheckMaxAttempts = conf.get( + KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS) + + private val allocatorRunnable = new Runnable { + + // Maintains a map of executor id to count of checks performed to learn the loss reason + // for an executor. + private val executorReasonCheckAttemptCounts = new mutable.HashMap[String, Int] + + override def run(): Unit = { + handleDisconnectedExecutors() + + val executorsToAllocate = mutable.Map[String, Pod]() + val currentTotalRegisteredExecutors = totalRegisteredExecutors.get + val currentTotalExpectedExecutors = totalExpectedExecutors.get + val currentNodeToLocalTaskCount = getNodesWithLocalTaskCounts() + RUNNING_EXECUTOR_PODS_LOCK.synchronized { + if (currentTotalRegisteredExecutors < runningExecutorsToPods.size) { + logDebug("Waiting for pending executors before scaling") + } else if (currentTotalExpectedExecutors <= runningExecutorsToPods.size) { + logDebug("Maximum allowed executor limit reached. Not scaling up further.") + } else { + for (_ <- 0 until math.min( + currentTotalExpectedExecutors - runningExecutorsToPods.size, podAllocationSize)) { + val executorId = EXECUTOR_ID_COUNTER.incrementAndGet().toString + val executorPod = executorPodFactory.createExecutorPod( + executorId, + applicationId(), + driverUrl, + conf.getExecutorEnv, + driverPod, + currentNodeToLocalTaskCount) + executorsToAllocate(executorId) = executorPod + logInfo( + s"Requesting a new executor, total executors is now ${runningExecutorsToPods.size}") + } + } + } + + val allocatedExecutors = executorsToAllocate.mapValues { pod => + Utils.tryLog { + kubernetesClient.pods().create(pod) + } + } + + RUNNING_EXECUTOR_PODS_LOCK.synchronized { + allocatedExecutors.map { + case (executorId, attemptedAllocatedExecutor) => + attemptedAllocatedExecutor.map { successfullyAllocatedExecutor => + runningExecutorsToPods.put(executorId, successfullyAllocatedExecutor) + } + } + } + } + + def handleDisconnectedExecutors(): Unit = { + // For each disconnected executor, synchronize with the loss reasons that may have been found + // by the executor pod watcher. If the loss reason was discovered by the watcher, + // inform the parent class with removeExecutor. + disconnectedPodsByExecutorIdPendingRemoval.asScala.foreach { + case (executorId, executorPod) => + val knownExitReason = Option(podsWithKnownExitReasons.remove( + executorPod.getMetadata.getName)) + knownExitReason.fold { + removeExecutorOrIncrementLossReasonCheckCount(executorId) + } { executorExited => + logWarning(s"Removing executor $executorId with loss reason " + executorExited.message) + removeExecutor(executorId, executorExited) + // We don't delete the pod running the executor that has an exit condition caused by + // the application from the Kubernetes API server. This allows users to debug later on + // through commands such as "kubectl logs " and + // "kubectl describe pod ". 
Note that exited containers have terminated and + // therefore won't take CPU and memory resources. + // Otherwise, the executor pod is marked to be deleted from the API server. + if (executorExited.exitCausedByApp) { + logInfo(s"Executor $executorId exited because of the application.") + deleteExecutorFromDataStructures(executorId) + } else { + logInfo(s"Executor $executorId failed because of a framework error.") + deleteExecutorFromClusterAndDataStructures(executorId) + } + } + } + } + + def removeExecutorOrIncrementLossReasonCheckCount(executorId: String): Unit = { + val reasonCheckCount = executorReasonCheckAttemptCounts.getOrElse(executorId, 0) + if (reasonCheckCount >= executorLostReasonCheckMaxAttempts) { + removeExecutor(executorId, SlaveLost("Executor lost for unknown reasons.")) + deleteExecutorFromClusterAndDataStructures(executorId) + } else { + executorReasonCheckAttemptCounts.put(executorId, reasonCheckCount + 1) + } + } + + def deleteExecutorFromClusterAndDataStructures(executorId: String): Unit = { + deleteExecutorFromDataStructures(executorId).foreach { pod => + kubernetesClient.pods().delete(pod) + } + } + + def deleteExecutorFromDataStructures(executorId: String): Option[Pod] = { + disconnectedPodsByExecutorIdPendingRemoval.remove(executorId) + executorReasonCheckAttemptCounts -= executorId + podsWithKnownExitReasons.remove(executorId) + RUNNING_EXECUTOR_PODS_LOCK.synchronized { + runningExecutorsToPods.remove(executorId).orElse { + logWarning(s"Unable to remove pod for unknown executor $executorId") + None + } + } + } + } + + override def sufficientResourcesRegistered(): Boolean = { + totalRegisteredExecutors.get() >= initialExecutors * minRegisteredRatio + } + + override def start(): Unit = { + super.start() + executorWatchResource.set( + kubernetesClient + .pods() + .withLabel(SPARK_APP_ID_LABEL, applicationId()) + .watch(new ExecutorPodsWatcher())) + + allocatorExecutor.scheduleWithFixedDelay( + allocatorRunnable, 0L, podAllocationInterval, TimeUnit.SECONDS) + + if (!Utils.isDynamicAllocationEnabled(conf)) { + doRequestTotalExecutors(initialExecutors) + } + } + + override def stop(): Unit = { + // stop allocation of new resources and caches. + allocatorExecutor.shutdown() + allocatorExecutor.awaitTermination(30, TimeUnit.SECONDS) + + // send stop message to executors so they shut down cleanly + super.stop() + + try { + val resource = executorWatchResource.getAndSet(null) + if (resource != null) { + resource.close() + } + } catch { + case e: Throwable => logWarning("Failed to close the executor pod watcher", e) + } + + // then delete the executor pods + Utils.tryLogNonFatalError { + deleteExecutorPodsOnStop() + executorPodsByIPs.clear() + } + Utils.tryLogNonFatalError { + logInfo("Closing kubernetes client") + kubernetesClient.close() + } + } + + /** + * @return A map of K8s cluster nodes to the number of tasks that could benefit from data + * locality if an executor launches on the cluster node. + */ + private def getNodesWithLocalTaskCounts() : Map[String, Int] = { + val nodeToLocalTaskCount = synchronized { + mutable.Map[String, Int]() ++ hostToLocalTaskCount + } + + for (pod <- executorPodsByIPs.values().asScala) { + // Remove cluster nodes that are running our executors already. + // TODO: This prefers spreading out executors across nodes. In case users want + // consolidating executors on fewer nodes, introduce a flag. 
See the spark.deploy.spreadOut + // flag that Spark standalone has: https://spark.apache.org/docs/latest/spark-standalone.html + nodeToLocalTaskCount.remove(pod.getSpec.getNodeName).nonEmpty || + nodeToLocalTaskCount.remove(pod.getStatus.getHostIP).nonEmpty || + nodeToLocalTaskCount.remove( + InetAddress.getByName(pod.getStatus.getHostIP).getCanonicalHostName).nonEmpty + } + nodeToLocalTaskCount.toMap[String, Int] + } + + override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = Future[Boolean] { + totalExpectedExecutors.set(requestedTotal) + true + } + + override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = Future[Boolean] { + val podsToDelete = RUNNING_EXECUTOR_PODS_LOCK.synchronized { + executorIds.flatMap { executorId => + runningExecutorsToPods.remove(executorId) match { + case Some(pod) => + disconnectedPodsByExecutorIdPendingRemoval.put(executorId, pod) + Some(pod) + + case None => + logWarning(s"Unable to remove pod for unknown executor $executorId") + None + } + } + } + + kubernetesClient.pods().delete(podsToDelete: _*) + true + } + + private def deleteExecutorPodsOnStop(): Unit = { + val executorPodsToDelete = RUNNING_EXECUTOR_PODS_LOCK.synchronized { + val runningExecutorPodsCopy = Seq(runningExecutorsToPods.values.toSeq: _*) + runningExecutorsToPods.clear() + runningExecutorPodsCopy + } + kubernetesClient.pods().delete(executorPodsToDelete: _*) + } + + private class ExecutorPodsWatcher extends Watcher[Pod] { + + private val DEFAULT_CONTAINER_FAILURE_EXIT_STATUS = -1 + + override def eventReceived(action: Action, pod: Pod): Unit = { + val podName = pod.getMetadata.getName + val podIP = pod.getStatus.getPodIP + + action match { + case Action.MODIFIED if (pod.getStatus.getPhase == "Running" + && pod.getMetadata.getDeletionTimestamp == null) => + val clusterNodeName = pod.getSpec.getNodeName + logInfo(s"Executor pod $podName ready, launched at $clusterNodeName as IP $podIP.") + executorPodsByIPs.put(podIP, pod) + + case Action.DELETED | Action.ERROR => + val executorId = getExecutorId(pod) + logDebug(s"Executor pod $podName at IP $podIP was at $action.") + if (podIP != null) { + executorPodsByIPs.remove(podIP) + } + + val executorExitReason = if (action == Action.ERROR) { + logWarning(s"Received error event of executor pod $podName. Reason: " + + pod.getStatus.getReason) + executorExitReasonOnError(pod) + } else if (action == Action.DELETED) { + logWarning(s"Received delete event of executor pod $podName. Reason: " + + pod.getStatus.getReason) + executorExitReasonOnDelete(pod) + } else { + throw new IllegalStateException( + s"Unknown action that should only be DELETED or ERROR: $action") + } + podsWithKnownExitReasons.put(pod.getMetadata.getName, executorExitReason) + + if (!disconnectedPodsByExecutorIdPendingRemoval.containsKey(executorId)) { + log.warn(s"Executor with id $executorId was not marked as disconnected, but the " + + s"watch received an event of type $action for this executor. 
The executor may " + + "have failed to start in the first place and never registered with the driver.") + } + disconnectedPodsByExecutorIdPendingRemoval.put(executorId, pod) + + case _ => logDebug(s"Received event of executor pod $podName: " + action) + } + } + + override def onClose(cause: KubernetesClientException): Unit = { + logDebug("Executor pod watch closed.", cause) + } + + private def getExecutorExitStatus(pod: Pod): Int = { + val containerStatuses = pod.getStatus.getContainerStatuses + if (!containerStatuses.isEmpty) { + // we assume the first container represents the pod status. This assumption may not hold + // true in the future. Revisit this if side-car containers start running inside executor + // pods. + getExecutorExitStatus(containerStatuses.get(0)) + } else DEFAULT_CONTAINER_FAILURE_EXIT_STATUS + } + + private def getExecutorExitStatus(containerStatus: ContainerStatus): Int = { + Option(containerStatus.getState).map { containerState => + Option(containerState.getTerminated).map { containerStateTerminated => + containerStateTerminated.getExitCode.intValue() + }.getOrElse(UNKNOWN_EXIT_CODE) + }.getOrElse(UNKNOWN_EXIT_CODE) + } + + private def isPodAlreadyReleased(pod: Pod): Boolean = { + val executorId = pod.getMetadata.getLabels.get(SPARK_EXECUTOR_ID_LABEL) + RUNNING_EXECUTOR_PODS_LOCK.synchronized { + !runningExecutorsToPods.contains(executorId) + } + } + + private def executorExitReasonOnError(pod: Pod): ExecutorExited = { + val containerExitStatus = getExecutorExitStatus(pod) + // container was probably actively killed by the driver. + if (isPodAlreadyReleased(pod)) { + ExecutorExited(containerExitStatus, exitCausedByApp = false, + s"Container in pod ${pod.getMetadata.getName} exited from explicit termination " + + "request.") + } else { + val containerExitReason = s"Pod ${pod.getMetadata.getName}'s executor container " + + s"exited with exit status code $containerExitStatus." + ExecutorExited(containerExitStatus, exitCausedByApp = true, containerExitReason) + } + } + + private def executorExitReasonOnDelete(pod: Pod): ExecutorExited = { + val exitMessage = if (isPodAlreadyReleased(pod)) { + s"Container in pod ${pod.getMetadata.getName} exited from explicit termination request." + } else { + s"Pod ${pod.getMetadata.getName} deleted or lost." 
+ } + ExecutorExited(getExecutorExitStatus(pod), exitCausedByApp = false, exitMessage) + } + + private def getExecutorId(pod: Pod): String = { + val executorId = pod.getMetadata.getLabels.get(SPARK_EXECUTOR_ID_LABEL) + require(executorId != null, "Unexpected pod metadata; expected all executor pods " + + s"to have label $SPARK_EXECUTOR_ID_LABEL.") + executorId + } + } + + override def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = { + new KubernetesDriverEndpoint(rpcEnv, properties) + } + + private class KubernetesDriverEndpoint( + rpcEnv: RpcEnv, + sparkProperties: Seq[(String, String)]) + extends DriverEndpoint(rpcEnv, sparkProperties) { + + override def onDisconnected(rpcAddress: RpcAddress): Unit = { + addressToExecutorId.get(rpcAddress).foreach { executorId => + if (disableExecutor(executorId)) { + RUNNING_EXECUTOR_PODS_LOCK.synchronized { + runningExecutorsToPods.get(executorId).foreach { pod => + disconnectedPodsByExecutorIdPendingRemoval.put(executorId, pod) + } + } + } + } + } + } +} + +private object KubernetesClusterSchedulerBackend { + private val UNKNOWN_EXIT_CODE = -1 +} diff --git a/resource-managers/kubernetes/core/src/test/resources/log4j.properties b/resource-managers/kubernetes/core/src/test/resources/log4j.properties new file mode 100644 index 0000000000000..ad95fadb7c0c0 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/resources/log4j.properties @@ -0,0 +1,31 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the file target/unit-tests.log +log4j.rootCategory=INFO, file +log4j.appender.file=org.apache.log4j.FileAppender +log4j.appender.file.append=true +log4j.appender.file.file=target/unit-tests.log +log4j.appender.file.layout=org.apache.log4j.PatternLayout +log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n + +# Ignore messages below warning level from a few verbose libraries. +log4j.logger.com.sun.jersey=WARN +log4j.logger.org.apache.hadoop=WARN +log4j.logger.org.eclipse.jetty=WARN +log4j.logger.org.mortbay=WARN +log4j.logger.org.spark_project.jetty=WARN diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala new file mode 100644 index 0000000000000..1c7717c238096 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler.cluster.k8s + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model.{Pod, _} +import org.mockito.MockitoAnnotations +import org.scalatest.{BeforeAndAfter, BeforeAndAfterEach} + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ + +class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with BeforeAndAfterEach { + private val driverPodName: String = "driver-pod" + private val driverPodUid: String = "driver-uid" + private val executorPrefix: String = "base" + private val executorImage: String = "executor-image" + private val driverPod = new PodBuilder() + .withNewMetadata() + .withName(driverPodName) + .withUid(driverPodUid) + .endMetadata() + .withNewSpec() + .withNodeName("some-node") + .endSpec() + .withNewStatus() + .withHostIP("192.168.99.100") + .endStatus() + .build() + private var baseConf: SparkConf = _ + + before { + MockitoAnnotations.initMocks(this) + baseConf = new SparkConf() + .set(KUBERNETES_DRIVER_POD_NAME, driverPodName) + .set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, executorPrefix) + .set(EXECUTOR_DOCKER_IMAGE, executorImage) + } + + test("basic executor pod has reasonable defaults") { + val factory = new ExecutorPodFactoryImpl(baseConf) + val executor = factory.createExecutorPod( + "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) + + // The executor pod name and default labels. + assert(executor.getMetadata.getName === s"$executorPrefix-exec-1") + assert(executor.getMetadata.getLabels.size() === 3) + assert(executor.getMetadata.getLabels.get(SPARK_EXECUTOR_ID_LABEL) === "1") + + // There is exactly 1 container with no volume mounts and default memory limits. + // Default memory limit is 1024M + 384M (minimum overhead constant). + assert(executor.getSpec.getContainers.size() === 1) + assert(executor.getSpec.getContainers.get(0).getImage === executorImage) + assert(executor.getSpec.getContainers.get(0).getVolumeMounts.isEmpty) + assert(executor.getSpec.getContainers.get(0).getResources.getLimits.size() === 1) + assert(executor.getSpec.getContainers.get(0).getResources + .getLimits.get("memory").getAmount === "1408Mi") + + // The pod has no node selector, volumes. 
+ assert(executor.getSpec.getNodeSelector.isEmpty) + assert(executor.getSpec.getVolumes.isEmpty) + + checkEnv(executor, Map()) + checkOwnerReferences(executor, driverPodUid) + } + + test("executor pod hostnames get truncated to 63 characters") { + val conf = baseConf.clone() + conf.set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, + "loremipsumdolorsitametvimatelitrefficiendisuscipianturvixlegeresple") + + val factory = new ExecutorPodFactoryImpl(conf) + val executor = factory.createExecutorPod( + "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) + + assert(executor.getSpec.getHostname.length === 63) + } + + test("classpath and extra java options get translated into environment variables") { + val conf = baseConf.clone() + conf.set(org.apache.spark.internal.config.EXECUTOR_JAVA_OPTIONS, "foo=bar") + conf.set(org.apache.spark.internal.config.EXECUTOR_CLASS_PATH, "bar=baz") + + val factory = new ExecutorPodFactoryImpl(conf) + val executor = factory.createExecutorPod( + "1", "dummy", "dummy", Seq[(String, String)]("qux" -> "quux"), driverPod, Map[String, Int]()) + + checkEnv(executor, + Map("SPARK_JAVA_OPT_0" -> "foo=bar", + "SPARK_EXECUTOR_EXTRA_CLASSPATH" -> "bar=baz", + "qux" -> "quux")) + checkOwnerReferences(executor, driverPodUid) + } + + // There is always exactly one controller reference, and it points to the driver pod. + private def checkOwnerReferences(executor: Pod, driverPodUid: String): Unit = { + assert(executor.getMetadata.getOwnerReferences.size() === 1) + assert(executor.getMetadata.getOwnerReferences.get(0).getUid === driverPodUid) + assert(executor.getMetadata.getOwnerReferences.get(0).getController === true) + } + + // Check that the expected environment variables are present. + private def checkEnv(executor: Pod, additionalEnvVars: Map[String, String]): Unit = { + val defaultEnvs = Map( + ENV_EXECUTOR_ID -> "1", + ENV_DRIVER_URL -> "dummy", + ENV_EXECUTOR_CORES -> "1", + ENV_EXECUTOR_MEMORY -> "1g", + ENV_APPLICATION_ID -> "dummy", + ENV_EXECUTOR_POD_IP -> null) ++ additionalEnvVars + + assert(executor.getSpec.getContainers.size() === 1) + assert(executor.getSpec.getContainers.get(0).getEnv.size() === defaultEnvs.size) + val mapEnvs = executor.getSpec.getContainers.get(0).getEnv.asScala.map { + x => (x.getName, x.getValue) + }.toMap + assert(defaultEnvs === mapEnvs) + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala new file mode 100644 index 0000000000000..3febb2f47cfd4 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala @@ -0,0 +1,440 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler.cluster.k8s + +import java.util.concurrent.{ExecutorService, ScheduledExecutorService, TimeUnit} + +import io.fabric8.kubernetes.api.model.{DoneablePod, Pod, PodBuilder, PodList} +import io.fabric8.kubernetes.client.{KubernetesClient, Watch, Watcher} +import io.fabric8.kubernetes.client.Watcher.Action +import io.fabric8.kubernetes.client.dsl.{FilterWatchListDeletable, MixedOperation, NonNamespaceOperation, PodResource} +import org.mockito.{AdditionalAnswers, ArgumentCaptor, Mock, MockitoAnnotations} +import org.mockito.Matchers.{any, eq => mockitoEq} +import org.mockito.Mockito.{doNothing, never, times, verify, when} +import org.scalatest.BeforeAndAfter +import org.scalatest.mockito.MockitoSugar._ +import scala.collection.JavaConverters._ +import scala.concurrent.Future + +import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.rpc._ +import org.apache.spark.scheduler.{ExecutorExited, LiveListenerBus, SlaveLost, TaskSchedulerImpl} +import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.{RegisterExecutor, RemoveExecutor} +import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend +import org.apache.spark.util.ThreadUtils + +class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAndAfter { + + private val APP_ID = "test-spark-app" + private val DRIVER_POD_NAME = "spark-driver-pod" + private val NAMESPACE = "test-namespace" + private val SPARK_DRIVER_HOST = "localhost" + private val SPARK_DRIVER_PORT = 7077 + private val POD_ALLOCATION_INTERVAL = 60L + private val DRIVER_URL = RpcEndpointAddress( + SPARK_DRIVER_HOST, SPARK_DRIVER_PORT, CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString + private val FIRST_EXECUTOR_POD = new PodBuilder() + .withNewMetadata() + .withName("pod1") + .endMetadata() + .withNewSpec() + .withNodeName("node1") + .endSpec() + .withNewStatus() + .withHostIP("192.168.99.100") + .endStatus() + .build() + private val SECOND_EXECUTOR_POD = new PodBuilder() + .withNewMetadata() + .withName("pod2") + .endMetadata() + .withNewSpec() + .withNodeName("node2") + .endSpec() + .withNewStatus() + .withHostIP("192.168.99.101") + .endStatus() + .build() + + private type PODS = MixedOperation[Pod, PodList, DoneablePod, PodResource[Pod, DoneablePod]] + private type LABELED_PODS = FilterWatchListDeletable[ + Pod, PodList, java.lang.Boolean, Watch, Watcher[Pod]] + private type IN_NAMESPACE_PODS = NonNamespaceOperation[ + Pod, PodList, DoneablePod, PodResource[Pod, DoneablePod]] + + @Mock + private var sparkContext: SparkContext = _ + + @Mock + private var listenerBus: LiveListenerBus = _ + + @Mock + private var taskSchedulerImpl: TaskSchedulerImpl = _ + + @Mock + private var allocatorExecutor: ScheduledExecutorService = _ + + @Mock + private var requestExecutorsService: ExecutorService = _ + + @Mock + private var executorPodFactory: ExecutorPodFactory = _ + + @Mock + private var kubernetesClient: KubernetesClient = _ + + @Mock + private var podOperations: 
PODS = _ + + @Mock + private var podsWithLabelOperations: LABELED_PODS = _ + + @Mock + private var podsInNamespace: IN_NAMESPACE_PODS = _ + + @Mock + private var podsWithDriverName: PodResource[Pod, DoneablePod] = _ + + @Mock + private var rpcEnv: RpcEnv = _ + + @Mock + private var driverEndpointRef: RpcEndpointRef = _ + + @Mock + private var executorPodsWatch: Watch = _ + + @Mock + private var successFuture: Future[Boolean] = _ + + private var sparkConf: SparkConf = _ + private var executorPodsWatcherArgument: ArgumentCaptor[Watcher[Pod]] = _ + private var allocatorRunnable: ArgumentCaptor[Runnable] = _ + private var requestExecutorRunnable: ArgumentCaptor[Runnable] = _ + private var driverEndpoint: ArgumentCaptor[RpcEndpoint] = _ + + private val driverPod = new PodBuilder() + .withNewMetadata() + .withName(DRIVER_POD_NAME) + .addToLabels(SPARK_APP_ID_LABEL, APP_ID) + .addToLabels(SPARK_ROLE_LABEL, SPARK_POD_DRIVER_ROLE) + .endMetadata() + .build() + + before { + MockitoAnnotations.initMocks(this) + sparkConf = new SparkConf() + .set(KUBERNETES_DRIVER_POD_NAME, DRIVER_POD_NAME) + .set(KUBERNETES_NAMESPACE, NAMESPACE) + .set("spark.driver.host", SPARK_DRIVER_HOST) + .set("spark.driver.port", SPARK_DRIVER_PORT.toString) + .set(KUBERNETES_ALLOCATION_BATCH_DELAY, POD_ALLOCATION_INTERVAL) + executorPodsWatcherArgument = ArgumentCaptor.forClass(classOf[Watcher[Pod]]) + allocatorRunnable = ArgumentCaptor.forClass(classOf[Runnable]) + requestExecutorRunnable = ArgumentCaptor.forClass(classOf[Runnable]) + driverEndpoint = ArgumentCaptor.forClass(classOf[RpcEndpoint]) + when(sparkContext.conf).thenReturn(sparkConf) + when(sparkContext.listenerBus).thenReturn(listenerBus) + when(taskSchedulerImpl.sc).thenReturn(sparkContext) + when(kubernetesClient.pods()).thenReturn(podOperations) + when(podOperations.withLabel(SPARK_APP_ID_LABEL, APP_ID)).thenReturn(podsWithLabelOperations) + when(podsWithLabelOperations.watch(executorPodsWatcherArgument.capture())) + .thenReturn(executorPodsWatch) + when(podOperations.inNamespace(NAMESPACE)).thenReturn(podsInNamespace) + when(podsInNamespace.withName(DRIVER_POD_NAME)).thenReturn(podsWithDriverName) + when(podsWithDriverName.get()).thenReturn(driverPod) + when(allocatorExecutor.scheduleWithFixedDelay( + allocatorRunnable.capture(), + mockitoEq(0L), + mockitoEq(POD_ALLOCATION_INTERVAL), + mockitoEq(TimeUnit.SECONDS))).thenReturn(null) + // Creating Futures in Scala backed by a Java executor service resolves to running + // ExecutorService#execute (as opposed to submit) + doNothing().when(requestExecutorsService).execute(requestExecutorRunnable.capture()) + when(rpcEnv.setupEndpoint( + mockitoEq(CoarseGrainedSchedulerBackend.ENDPOINT_NAME), driverEndpoint.capture())) + .thenReturn(driverEndpointRef) + + // Used by the CoarseGrainedSchedulerBackend when making RPC calls. + when(driverEndpointRef.ask[Boolean] + (any(classOf[Any])) + (any())).thenReturn(successFuture) + when(successFuture.failed).thenReturn(Future[Throwable] { + // emulate behavior of the Future.failed method. 
+ throw new NoSuchElementException() + }(ThreadUtils.sameThread)) + } + + test("Basic lifecycle expectations when starting and stopping the scheduler.") { + val scheduler = newSchedulerBackend() + scheduler.start() + assert(executorPodsWatcherArgument.getValue != null) + assert(allocatorRunnable.getValue != null) + scheduler.stop() + verify(executorPodsWatch).close() + } + + test("Static allocation should request executors upon first allocator run.") { + sparkConf + .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 2) + .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 2) + val scheduler = newSchedulerBackend() + scheduler.start() + requestExecutorRunnable.getValue.run() + val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) + val secondResolvedPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) + when(podOperations.create(any(classOf[Pod]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) + allocatorRunnable.getValue.run() + verify(podOperations).create(firstResolvedPod) + verify(podOperations).create(secondResolvedPod) + } + + test("Killing executors deletes the executor pods") { + sparkConf + .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 2) + .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 2) + val scheduler = newSchedulerBackend() + scheduler.start() + requestExecutorRunnable.getValue.run() + val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) + val secondResolvedPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) + when(podOperations.create(any(classOf[Pod]))) + .thenAnswer(AdditionalAnswers.returnsFirstArg()) + allocatorRunnable.getValue.run() + scheduler.doKillExecutors(Seq("2")) + requestExecutorRunnable.getAllValues.asScala.last.run() + verify(podOperations).delete(secondResolvedPod) + verify(podOperations, never()).delete(firstResolvedPod) + } + + test("Executors should be requested in batches.") { + sparkConf + .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) + .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 2) + val scheduler = newSchedulerBackend() + scheduler.start() + requestExecutorRunnable.getValue.run() + when(podOperations.create(any(classOf[Pod]))) + .thenAnswer(AdditionalAnswers.returnsFirstArg()) + val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) + val secondResolvedPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) + allocatorRunnable.getValue.run() + verify(podOperations).create(firstResolvedPod) + verify(podOperations, never()).create(secondResolvedPod) + val registerFirstExecutorMessage = RegisterExecutor( + "1", mock[RpcEndpointRef], "localhost", 1, Map.empty[String, String]) + when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) + driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) + .apply(registerFirstExecutorMessage) + allocatorRunnable.getValue.run() + verify(podOperations).create(secondResolvedPod) + } + + test("Scaled down executors should be cleaned up") { + sparkConf + .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) + .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1) + val scheduler = newSchedulerBackend() + scheduler.start() + + // The scheduler backend spins up one executor pod. 
+ requestExecutorRunnable.getValue.run() + when(podOperations.create(any(classOf[Pod]))) + .thenAnswer(AdditionalAnswers.returnsFirstArg()) + val resolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) + allocatorRunnable.getValue.run() + val executorEndpointRef = mock[RpcEndpointRef] + when(executorEndpointRef.address).thenReturn(RpcAddress("pod.example.com", 9000)) + val registerFirstExecutorMessage = RegisterExecutor( + "1", executorEndpointRef, "localhost:9000", 1, Map.empty[String, String]) + when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) + driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) + .apply(registerFirstExecutorMessage) + + // Request that there are 0 executors and trigger deletion from driver. + scheduler.doRequestTotalExecutors(0) + requestExecutorRunnable.getAllValues.asScala.last.run() + scheduler.doKillExecutors(Seq("1")) + requestExecutorRunnable.getAllValues.asScala.last.run() + verify(podOperations, times(1)).delete(resolvedPod) + driverEndpoint.getValue.onDisconnected(executorEndpointRef.address) + + val exitedPod = exitPod(resolvedPod, 0) + executorPodsWatcherArgument.getValue.eventReceived(Action.DELETED, exitedPod) + allocatorRunnable.getValue.run() + + // No more deletion attempts of the executors. + // This is graceful termination and should not be detected as a failure. + verify(podOperations, times(1)).delete(resolvedPod) + verify(driverEndpointRef, times(1)).ask[Boolean]( + RemoveExecutor("1", ExecutorExited( + 0, + exitCausedByApp = false, + s"Container in pod ${exitedPod.getMetadata.getName} exited from" + + s" explicit termination request."))) + } + + test("Executors that fail should not be deleted.") { + sparkConf + .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) + .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1) + + val scheduler = newSchedulerBackend() + scheduler.start() + val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) + when(podOperations.create(any(classOf[Pod]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) + requestExecutorRunnable.getValue.run() + allocatorRunnable.getValue.run() + val executorEndpointRef = mock[RpcEndpointRef] + when(executorEndpointRef.address).thenReturn(RpcAddress("pod.example.com", 9000)) + val registerFirstExecutorMessage = RegisterExecutor( + "1", executorEndpointRef, "localhost:9000", 1, Map.empty[String, String]) + when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) + driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) + .apply(registerFirstExecutorMessage) + driverEndpoint.getValue.onDisconnected(executorEndpointRef.address) + executorPodsWatcherArgument.getValue.eventReceived( + Action.ERROR, exitPod(firstResolvedPod, 1)) + + // A replacement executor should be created but the error pod should persist. 
+ val replacementPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) + scheduler.doRequestTotalExecutors(1) + requestExecutorRunnable.getValue.run() + allocatorRunnable.getAllValues.asScala.last.run() + verify(podOperations, never()).delete(firstResolvedPod) + verify(driverEndpointRef).ask[Boolean]( + RemoveExecutor("1", ExecutorExited( + 1, + exitCausedByApp = true, + s"Pod ${FIRST_EXECUTOR_POD.getMetadata.getName}'s executor container exited with" + + " exit status code 1."))) + } + + test("Executors disconnected due to unknown reasons are deleted and replaced.") { + sparkConf + .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) + .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1) + val executorLostReasonCheckMaxAttempts = sparkConf.get( + KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS) + + val scheduler = newSchedulerBackend() + scheduler.start() + val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) + when(podOperations.create(any(classOf[Pod]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) + requestExecutorRunnable.getValue.run() + allocatorRunnable.getValue.run() + val executorEndpointRef = mock[RpcEndpointRef] + when(executorEndpointRef.address).thenReturn(RpcAddress("pod.example.com", 9000)) + val registerFirstExecutorMessage = RegisterExecutor( + "1", executorEndpointRef, "localhost:9000", 1, Map.empty[String, String]) + when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) + driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) + .apply(registerFirstExecutorMessage) + + driverEndpoint.getValue.onDisconnected(executorEndpointRef.address) + 1 to executorLostReasonCheckMaxAttempts foreach { _ => + allocatorRunnable.getValue.run() + verify(podOperations, never()).delete(FIRST_EXECUTOR_POD) + } + + val recreatedResolvedPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) + allocatorRunnable.getValue.run() + verify(podOperations).delete(firstResolvedPod) + verify(driverEndpointRef).ask[Boolean]( + RemoveExecutor("1", SlaveLost("Executor lost for unknown reasons."))) + } + + test("Executors that fail to start on the Kubernetes API call rebuild in the next batch.") { + sparkConf + .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) + .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1) + val scheduler = newSchedulerBackend() + scheduler.start() + val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) + when(podOperations.create(firstResolvedPod)) + .thenThrow(new RuntimeException("test")) + requestExecutorRunnable.getValue.run() + allocatorRunnable.getValue.run() + verify(podOperations, times(1)).create(firstResolvedPod) + val recreatedResolvedPod = expectPodCreationWithId(2, FIRST_EXECUTOR_POD) + allocatorRunnable.getValue.run() + verify(podOperations).create(recreatedResolvedPod) + } + + test("Executors that are initially created but the watch notices them fail are rebuilt" + + " in the next batch.") { + sparkConf + .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) + .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1) + val scheduler = newSchedulerBackend() + scheduler.start() + val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) + when(podOperations.create(FIRST_EXECUTOR_POD)).thenAnswer(AdditionalAnswers.returnsFirstArg()) + requestExecutorRunnable.getValue.run() + allocatorRunnable.getValue.run() + verify(podOperations, times(1)).create(firstResolvedPod) + executorPodsWatcherArgument.getValue.eventReceived(Action.ERROR, firstResolvedPod) + val recreatedResolvedPod = expectPodCreationWithId(2, 
FIRST_EXECUTOR_POD) + allocatorRunnable.getValue.run() + verify(podOperations).create(recreatedResolvedPod) + } + + private def newSchedulerBackend(): KubernetesClusterSchedulerBackend = { + new KubernetesClusterSchedulerBackend( + taskSchedulerImpl, + rpcEnv, + executorPodFactory, + kubernetesClient, + allocatorExecutor, + requestExecutorsService) { + + override def applicationId(): String = APP_ID + } + } + + private def exitPod(basePod: Pod, exitCode: Int): Pod = { + new PodBuilder(basePod) + .editStatus() + .addNewContainerStatus() + .withNewState() + .withNewTerminated() + .withExitCode(exitCode) + .endTerminated() + .endState() + .endContainerStatus() + .endStatus() + .build() + } + + private def expectPodCreationWithId(executorId: Int, expectedPod: Pod): Pod = { + val resolvedPod = new PodBuilder(expectedPod) + .editMetadata() + .addToLabels(SPARK_EXECUTOR_ID_LABEL, executorId.toString) + .endMetadata() + .build() + when(executorPodFactory.createExecutorPod( + executorId.toString, + APP_ID, + DRIVER_URL, + sparkConf.getExecutorEnv, + driverPod, + Map.empty)).thenReturn(resolvedPod) + resolvedPod + } +} diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 7052fb347106b..506adb363aa90 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -41,6 +41,7 @@ import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef} import org.apache.spark.scheduler.{ExecutorExited, ExecutorLossReason} import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RetrieveLastAllocatedExecutorId +import org.apache.spark.scheduler.cluster.SchedulerBackendUtils import org.apache.spark.util.{Clock, SystemClock, ThreadUtils} /** @@ -109,7 +110,7 @@ private[yarn] class YarnAllocator( sparkConf.get(EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).getOrElse(-1L) @volatile private var targetNumExecutors = - YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sparkConf) + SchedulerBackendUtils.getInitialTargetExecutorNumber(sparkConf) private var currentNodeBlacklist = Set.empty[String] diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala index 3d9f99f57bed7..9c1472cb50e3a 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala @@ -133,8 +133,6 @@ object YarnSparkHadoopUtil { val ANY_HOST = "*" - val DEFAULT_NUMBER_EXECUTORS = 2 - // All RM requests are issued with same priority : we do not (yet) have any distinction between // request types (like map/reduce in hadoop for example) val RM_REQUEST_PRIORITY = Priority.newInstance(1) @@ -279,27 +277,5 @@ object YarnSparkHadoopUtil { securityMgr.getModifyAclsGroups) ) } - - /** - * Getting the initial target number of executors depends on whether dynamic allocation is - * enabled. - * If not using dynamic allocation it gets the number of executors requested by the user. 
- */ - def getInitialTargetExecutorNumber( - conf: SparkConf, - numExecutors: Int = DEFAULT_NUMBER_EXECUTORS): Int = { - if (Utils.isDynamicAllocationEnabled(conf)) { - val minNumExecutors = conf.get(DYN_ALLOCATION_MIN_EXECUTORS) - val initialNumExecutors = Utils.getDynamicAllocationInitialExecutors(conf) - val maxNumExecutors = conf.get(DYN_ALLOCATION_MAX_EXECUTORS) - require(initialNumExecutors >= minNumExecutors && initialNumExecutors <= maxNumExecutors, - s"initial executor number $initialNumExecutors must between min executor number " + - s"$minNumExecutors and max executor number $maxNumExecutors") - - initialNumExecutors - } else { - conf.get(EXECUTOR_INSTANCES).getOrElse(numExecutors) - } - } } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index d482376d14dd7..b722cc401bb73 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -52,7 +52,7 @@ private[spark] class YarnClientSchedulerBackend( logDebug("ClientArguments called with: " + argsArrayBuf.mkString(" ")) val args = new ClientArguments(argsArrayBuf.toArray) - totalExpectedExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(conf) + totalExpectedExecutors = SchedulerBackendUtils.getInitialTargetExecutorNumber(conf) client = new Client(args, conf) bindToYarn(client.submitApplication(), None) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala index 4f3d5ebf403e0..e2d477be329c3 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala @@ -34,7 +34,7 @@ private[spark] class YarnClusterSchedulerBackend( val attemptId = ApplicationMaster.getAttemptId bindToYarn(attemptId.getApplicationId(), Some(attemptId)) super.start() - totalExpectedExecutors = YarnSparkHadoopUtil.getInitialTargetExecutorNumber(sc.conf) + totalExpectedExecutors = SchedulerBackendUtils.getInitialTargetExecutorNumber(sc.conf) } override def getDriverLogUrls: Option[Map[String, String]] = { From 20b239845b695fe6a893ebfe97b49ef05fae773d Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 29 Nov 2017 19:18:47 +0800 Subject: [PATCH 004/356] [SPARK-22605][SQL] SQL write job should also set Spark task output metrics ## What changes were proposed in this pull request? For SQL write jobs, we only set metrics for the SQL listener and display them in the SQL plan UI. We should also set metrics for Spark task output metrics, which will be shown in spark job UI. ## How was this patch tested? test it manually. For a simple write job ``` spark.range(1000).write.parquet("/tmp/p1") ``` now the spark job UI looks like ![ui](https://user-images.githubusercontent.com/3182036/33326478-05a25b7c-d490-11e7-96ef-806117774356.jpg) Author: Wenchen Fan Closes #19833 from cloud-fan/ui. 
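
The patch below wires the write tracker's counters into the task-level output metrics. As a quick sanity check beyond the manual UI test, something like the following spark-shell sketch could be used; the listener-based bookkeeping and the variable names here are illustrative only and are not part of the patch.

```scala
// Sketch, assuming a running SparkSession named `spark`: collect the task-level
// output metrics that this change now populates for SQL write jobs. Listener events
// are delivered asynchronously, so the totals are only reliable once the job has
// finished and the listener bus has drained.
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

@volatile var bytesWritten = 0L
@volatile var recordsWritten = 0L

spark.sparkContext.addSparkListener(new SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    // taskMetrics can be null for tasks that fail before metrics are reported
    Option(taskEnd.taskMetrics).foreach { m =>
      bytesWritten += m.outputMetrics.bytesWritten
      recordsWritten += m.outputMetrics.recordsWritten
    }
  }
})

spark.range(1000).write.mode("overwrite").parquet("/tmp/p1")
// Expect recordsWritten == 1000 and bytesWritten > 0 once the events are processed.
```
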
--- .../execution/datasources/BasicWriteStatsTracker.scala | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala index 11af0aaa7b206..9dbbe9946ee99 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala @@ -22,7 +22,7 @@ import java.io.FileNotFoundException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path -import org.apache.spark.SparkContext +import org.apache.spark.{SparkContext, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution @@ -44,7 +44,6 @@ case class BasicWriteTaskStats( /** * Simple [[WriteTaskStatsTracker]] implementation that produces [[BasicWriteTaskStats]]. - * @param hadoopConf */ class BasicWriteTaskStatsTracker(hadoopConf: Configuration) extends WriteTaskStatsTracker with Logging { @@ -106,6 +105,13 @@ class BasicWriteTaskStatsTracker(hadoopConf: Configuration) override def getFinalStats(): WriteTaskStats = { statCurrentFile() + + // Reports bytesWritten and recordsWritten to the Spark output metrics. + Option(TaskContext.get()).map(_.taskMetrics().outputMetrics).foreach { outputMetrics => + outputMetrics.setBytesWritten(numBytes) + outputMetrics.setRecordsWritten(numRows) + } + if (submittedFiles != numFiles) { logInfo(s"Expected $submittedFiles files, but only saw $numFiles. " + "This could be due to the output format not writing empty files, " + From 57687280d4171db98d4d9404c7bd3374f51deac0 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Wed, 29 Nov 2017 09:17:39 -0800 Subject: [PATCH 005/356] [SPARK-22615][SQL] Handle more cases in PropagateEmptyRelation ## What changes were proposed in this pull request? Currently, in the optimize rule `PropagateEmptyRelation`, the following cases is not handled: 1. empty relation as right child in left outer join 2. empty relation as left child in right outer join 3. empty relation as right child in left semi join 4. empty relation as right child in left anti join 5. only one empty relation in full outer join case 1 / 2 / 5 can be treated as **Cartesian product** and cause exception. See the new test cases. ## How was this patch tested? Unit test Author: Wang Gengliang Closes #19825 from gengliangwang/SPARK-22615. 
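
For reference, the newly handled case 1 (an empty relation as the right child of a left outer join) behaves as follows. This spark-shell sketch simply mirrors the queries added in join-empty-relation.sql below and is not part of the patch itself.

```scala
// With this change, a condition-less LEFT OUTER JOIN against an empty relation is
// folded into a projection over t1 that null-pads the right-side column, so the
// query no longer has to be planned as a cartesian product (which previously failed).
spark.sql("CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a)")
spark.sql("CREATE TEMPORARY VIEW empty_table AS SELECT a FROM t1 WHERE false")
spark.sql("SELECT * FROM t1 LEFT OUTER JOIN empty_table").show()
// +---+----+
// |  a|   a|
// +---+----+
// |  1|null|
// +---+----+
```
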
--- .../optimizer/PropagateEmptyRelation.scala | 36 +++- .../PropagateEmptyRelationSuite.scala | 16 +- .../sql-tests/inputs/join-empty-relation.sql | 28 +++ .../results/join-empty-relation.sql.out | 194 ++++++++++++++++++ 4 files changed, 257 insertions(+), 17 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/join-empty-relation.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/join-empty-relation.sql.out diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala index 52fbb4df2f58e..a6e5aa6daca65 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelation.scala @@ -41,6 +41,10 @@ object PropagateEmptyRelation extends Rule[LogicalPlan] with PredicateHelper { private def empty(plan: LogicalPlan) = LocalRelation(plan.output, data = Seq.empty, isStreaming = plan.isStreaming) + // Construct a project list from plan's output, while the value is always NULL. + private def nullValueProjectList(plan: LogicalPlan): Seq[NamedExpression] = + plan.output.map{ a => Alias(Literal(null), a.name)(a.exprId) } + def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case p: Union if p.children.forall(isEmptyLocalRelation) => empty(p) @@ -49,16 +53,28 @@ object PropagateEmptyRelation extends Rule[LogicalPlan] with PredicateHelper { // as stateful streaming joins need to perform other state management operations other than // just processing the input data. case p @ Join(_, _, joinType, _) - if !p.children.exists(_.isStreaming) && p.children.exists(isEmptyLocalRelation) => - joinType match { - case _: InnerLike => empty(p) - // Intersect is handled as LeftSemi by `ReplaceIntersectWithSemiJoin` rule. - // Except is handled as LeftAnti by `ReplaceExceptWithAntiJoin` rule. - case LeftOuter | LeftSemi | LeftAnti if isEmptyLocalRelation(p.left) => empty(p) - case RightOuter if isEmptyLocalRelation(p.right) => empty(p) - case FullOuter if p.children.forall(isEmptyLocalRelation) => empty(p) - case _ => p - } + if !p.children.exists(_.isStreaming) => + val isLeftEmpty = isEmptyLocalRelation(p.left) + val isRightEmpty = isEmptyLocalRelation(p.right) + if (isLeftEmpty || isRightEmpty) { + joinType match { + case _: InnerLike => empty(p) + // Intersect is handled as LeftSemi by `ReplaceIntersectWithSemiJoin` rule. + // Except is handled as LeftAnti by `ReplaceExceptWithAntiJoin` rule. 
+ case LeftOuter | LeftSemi | LeftAnti if isLeftEmpty => empty(p) + case LeftSemi if isRightEmpty => empty(p) + case LeftAnti if isRightEmpty => p.left + case FullOuter if isLeftEmpty && isRightEmpty => empty(p) + case LeftOuter | FullOuter if isRightEmpty => + Project(p.left.output ++ nullValueProjectList(p.right), p.left) + case RightOuter if isRightEmpty => empty(p) + case RightOuter | FullOuter if isLeftEmpty => + Project(nullValueProjectList(p.left) ++ p.right.output, p.right) + case _ => p + } + } else { + p + } case p: UnaryNode if p.children.nonEmpty && p.children.forall(isEmptyLocalRelation) => p match { case _: Project => empty(p) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala index bc1c48b99c295..3964508e3a55e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/PropagateEmptyRelationSuite.scala @@ -21,8 +21,9 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.types.StructType @@ -78,17 +79,18 @@ class PropagateEmptyRelationSuite extends PlanTest { (true, false, Inner, Some(LocalRelation('a.int, 'b.int))), (true, false, Cross, Some(LocalRelation('a.int, 'b.int))), - (true, false, LeftOuter, None), + (true, false, LeftOuter, Some(Project(Seq('a, Literal(null).as('b)), testRelation1).analyze)), (true, false, RightOuter, Some(LocalRelation('a.int, 'b.int))), - (true, false, FullOuter, None), - (true, false, LeftAnti, None), - (true, false, LeftSemi, None), + (true, false, FullOuter, Some(Project(Seq('a, Literal(null).as('b)), testRelation1).analyze)), + (true, false, LeftAnti, Some(testRelation1)), + (true, false, LeftSemi, Some(LocalRelation('a.int))), (false, true, Inner, Some(LocalRelation('a.int, 'b.int))), (false, true, Cross, Some(LocalRelation('a.int, 'b.int))), (false, true, LeftOuter, Some(LocalRelation('a.int, 'b.int))), - (false, true, RightOuter, None), - (false, true, FullOuter, None), + (false, true, RightOuter, + Some(Project(Seq(Literal(null).as('a), 'b), testRelation2).analyze)), + (false, true, FullOuter, Some(Project(Seq(Literal(null).as('a), 'b), testRelation2).analyze)), (false, true, LeftAnti, Some(LocalRelation('a.int))), (false, true, LeftSemi, Some(LocalRelation('a.int))), diff --git a/sql/core/src/test/resources/sql-tests/inputs/join-empty-relation.sql b/sql/core/src/test/resources/sql-tests/inputs/join-empty-relation.sql new file mode 100644 index 0000000000000..8afa3270f4de4 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/join-empty-relation.sql @@ -0,0 +1,28 @@ +CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a); +CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a); + +CREATE TEMPORARY VIEW empty_table as SELECT a FROM t2 WHERE false; + +SELECT * FROM t1 INNER JOIN empty_table; +SELECT * 
FROM t1 CROSS JOIN empty_table; +SELECT * FROM t1 LEFT OUTER JOIN empty_table; +SELECT * FROM t1 RIGHT OUTER JOIN empty_table; +SELECT * FROM t1 FULL OUTER JOIN empty_table; +SELECT * FROM t1 LEFT SEMI JOIN empty_table; +SELECT * FROM t1 LEFT ANTI JOIN empty_table; + +SELECT * FROM empty_table INNER JOIN t1; +SELECT * FROM empty_table CROSS JOIN t1; +SELECT * FROM empty_table LEFT OUTER JOIN t1; +SELECT * FROM empty_table RIGHT OUTER JOIN t1; +SELECT * FROM empty_table FULL OUTER JOIN t1; +SELECT * FROM empty_table LEFT SEMI JOIN t1; +SELECT * FROM empty_table LEFT ANTI JOIN t1; + +SELECT * FROM empty_table INNER JOIN empty_table; +SELECT * FROM empty_table CROSS JOIN empty_table; +SELECT * FROM empty_table LEFT OUTER JOIN empty_table; +SELECT * FROM empty_table RIGHT OUTER JOIN empty_table; +SELECT * FROM empty_table FULL OUTER JOIN empty_table; +SELECT * FROM empty_table LEFT SEMI JOIN empty_table; +SELECT * FROM empty_table LEFT ANTI JOIN empty_table; diff --git a/sql/core/src/test/resources/sql-tests/results/join-empty-relation.sql.out b/sql/core/src/test/resources/sql-tests/results/join-empty-relation.sql.out new file mode 100644 index 0000000000000..857073a827f24 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/join-empty-relation.sql.out @@ -0,0 +1,194 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 24 + + +-- !query 0 +CREATE TEMPORARY VIEW t1 AS SELECT * FROM VALUES (1) AS GROUPING(a) +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +CREATE TEMPORARY VIEW t2 AS SELECT * FROM VALUES (1) AS GROUPING(a) +-- !query 1 schema +struct<> +-- !query 1 output + + + +-- !query 2 +CREATE TEMPORARY VIEW empty_table as SELECT a FROM t2 WHERE false +-- !query 2 schema +struct<> +-- !query 2 output + + + +-- !query 3 +SELECT * FROM t1 INNER JOIN empty_table +-- !query 3 schema +struct +-- !query 3 output + + + +-- !query 4 +SELECT * FROM t1 CROSS JOIN empty_table +-- !query 4 schema +struct +-- !query 4 output + + + +-- !query 5 +SELECT * FROM t1 LEFT OUTER JOIN empty_table +-- !query 5 schema +struct +-- !query 5 output +1 NULL + + +-- !query 6 +SELECT * FROM t1 RIGHT OUTER JOIN empty_table +-- !query 6 schema +struct +-- !query 6 output + + + +-- !query 7 +SELECT * FROM t1 FULL OUTER JOIN empty_table +-- !query 7 schema +struct +-- !query 7 output +1 NULL + + +-- !query 8 +SELECT * FROM t1 LEFT SEMI JOIN empty_table +-- !query 8 schema +struct +-- !query 8 output + + + +-- !query 9 +SELECT * FROM t1 LEFT ANTI JOIN empty_table +-- !query 9 schema +struct +-- !query 9 output +1 + + +-- !query 10 +SELECT * FROM empty_table INNER JOIN t1 +-- !query 10 schema +struct +-- !query 10 output + + + +-- !query 11 +SELECT * FROM empty_table CROSS JOIN t1 +-- !query 11 schema +struct +-- !query 11 output + + + +-- !query 12 +SELECT * FROM empty_table LEFT OUTER JOIN t1 +-- !query 12 schema +struct +-- !query 12 output + + + +-- !query 13 +SELECT * FROM empty_table RIGHT OUTER JOIN t1 +-- !query 13 schema +struct +-- !query 13 output +NULL 1 + + +-- !query 14 +SELECT * FROM empty_table FULL OUTER JOIN t1 +-- !query 14 schema +struct +-- !query 14 output +NULL 1 + + +-- !query 15 +SELECT * FROM empty_table LEFT SEMI JOIN t1 +-- !query 15 schema +struct +-- !query 15 output + + + +-- !query 16 +SELECT * FROM empty_table LEFT ANTI JOIN t1 +-- !query 16 schema +struct +-- !query 16 output + + + +-- !query 17 +SELECT * FROM empty_table INNER JOIN empty_table +-- !query 17 schema +struct +-- !query 17 output + + + +-- !query 18 +SELECT * FROM 
empty_table CROSS JOIN empty_table +-- !query 18 schema +struct +-- !query 18 output + + + +-- !query 19 +SELECT * FROM empty_table LEFT OUTER JOIN empty_table +-- !query 19 schema +struct +-- !query 19 output + + + +-- !query 20 +SELECT * FROM empty_table RIGHT OUTER JOIN empty_table +-- !query 20 schema +struct +-- !query 20 output + + + +-- !query 21 +SELECT * FROM empty_table FULL OUTER JOIN empty_table +-- !query 21 schema +struct +-- !query 21 output + + + +-- !query 22 +SELECT * FROM empty_table LEFT SEMI JOIN empty_table +-- !query 22 schema +struct +-- !query 22 output + + + +-- !query 23 +SELECT * FROM empty_table LEFT ANTI JOIN empty_table +-- !query 23 schema +struct +-- !query 23 output + From 284836862b2312aea5d7555c8e3c9d3c4dbc8eaf Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 30 Nov 2017 01:19:37 +0800 Subject: [PATCH 006/356] [SPARK-22608][SQL] add new API to CodeGeneration.splitExpressions() ## What changes were proposed in this pull request? This PR adds a new API to ` CodeGenenerator.splitExpression` since since several ` CodeGenenerator.splitExpression` are used with `ctx.INPUT_ROW` to avoid code duplication. ## How was this patch tested? Used existing test suits Author: Kazuaki Ishizaki Closes #19821 from kiszk/SPARK-22608. --- .../spark/sql/catalyst/expressions/Cast.scala | 2 +- .../expressions/codegen/CodeGenerator.scala | 24 ++++++++++++-- .../sql/catalyst/expressions/predicates.scala | 10 +++--- .../expressions/stringExpressions.scala | 32 ++++++------------- 4 files changed, 37 insertions(+), 31 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 12baddf1bf7ac..8cafaef61c7d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -1040,7 +1040,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String } """ } - val fieldsEvalCodes = if (ctx.INPUT_ROW != null && ctx.currentVars == null) { + val fieldsEvalCodes = if (ctx.currentVars == null) { ctx.splitExpressions( expressions = fieldsEvalCode, funcName = "castStruct", diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 668c816b3fd8d..1645db12c53f0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -788,11 +788,31 @@ class CodegenContext { * @param expressions the codes to evaluate expressions. */ def splitExpressions(expressions: Seq[String]): String = { + splitExpressions(expressions, funcName = "apply", extraArguments = Nil) + } + + /** + * Similar to [[splitExpressions(expressions: Seq[String])]], but has customized function name + * and extra arguments. + * + * @param expressions the codes to evaluate expressions. + * @param funcName the split function name base. 
+ * @param extraArguments the list of (type, name) of the arguments of the split function + * except for ctx.INPUT_ROW + */ + def splitExpressions( + expressions: Seq[String], + funcName: String, + extraArguments: Seq[(String, String)]): String = { // TODO: support whole stage codegen if (INPUT_ROW == null || currentVars != null) { - return expressions.mkString("\n") + expressions.mkString("\n") + } else { + splitExpressions( + expressions, + funcName, + arguments = ("InternalRow", INPUT_ROW) +: extraArguments) } - splitExpressions(expressions, funcName = "apply", arguments = ("InternalRow", INPUT_ROW) :: Nil) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index eb7475354b104..1aaaaf1db48d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -251,12 +251,10 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { } } """) - val listCodes = if (ctx.INPUT_ROW != null && ctx.currentVars == null) { - val args = ("InternalRow", ctx.INPUT_ROW) :: (ctx.javaType(value.dataType), valueArg) :: Nil - ctx.splitExpressions(expressions = listCode, funcName = "valueIn", arguments = args) - } else { - listCode.mkString("\n") - } + val listCodes = ctx.splitExpressions( + expressions = listCode, + funcName = "valueIn", + extraArguments = (ctx.javaType(value.dataType), valueArg) :: Nil) ev.copy(code = s""" ${valueGen.code} ${ev.value} = false; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index ee5cf925d3cef..34917ace001fa 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -73,14 +73,10 @@ case class Concat(children: Seq[Expression]) extends Expression with ImplicitCas } """ } - val codes = if (ctx.INPUT_ROW != null && ctx.currentVars == null) { - ctx.splitExpressions( - expressions = inputs, - funcName = "valueConcat", - arguments = ("InternalRow", ctx.INPUT_ROW) :: ("UTF8String[]", args) :: Nil) - } else { - inputs.mkString("\n") - } + val codes = ctx.splitExpressions( + expressions = inputs, + funcName = "valueConcat", + extraArguments = ("UTF8String[]", args) :: Nil) ev.copy(s""" UTF8String[] $args = new UTF8String[${evals.length}]; $codes @@ -156,14 +152,10 @@ case class ConcatWs(children: Seq[Expression]) "" } } - val codes = if (ctx.INPUT_ROW != null && ctx.currentVars == null) { - ctx.splitExpressions( + val codes = ctx.splitExpressions( expressions = inputs, funcName = "valueConcatWs", - arguments = ("InternalRow", ctx.INPUT_ROW) :: ("UTF8String[]", args) :: Nil) - } else { - inputs.mkString("\n") - } + extraArguments = ("UTF8String[]", args) :: Nil) ev.copy(s""" UTF8String[] $args = new UTF8String[$numArgs]; ${separator.code} @@ -1388,14 +1380,10 @@ case class FormatString(children: Expression*) extends Expression with ImplicitC $argList[$index] = $value; """ } - val argListCodes = if (ctx.INPUT_ROW != null && ctx.currentVars == null) { - ctx.splitExpressions( - expressions = argListCode, - funcName = "valueFormatString", - arguments = ("InternalRow", ctx.INPUT_ROW) :: 
("Object[]", argList) :: Nil) - } else { - argListCode.mkString("\n") - } + val argListCodes = ctx.splitExpressions( + expressions = argListCode, + funcName = "valueFormatString", + extraArguments = ("Object[]", argList) :: Nil) val form = ctx.freshName("formatter") val formatter = classOf[java.util.Formatter].getName From 193555f79cc73873613674a09a7c371688b6dbc7 Mon Sep 17 00:00:00 2001 From: Stavros Kontopoulos Date: Wed, 29 Nov 2017 14:15:35 -0800 Subject: [PATCH 007/356] [SPARK-18935][MESOS] Fix dynamic reservations on mesos ## What changes were proposed in this pull request? - Solves the issue described in the ticket by preserving reservation and allocation info in all cases (port handling included). - upgrades to 1.4 - Adds extra debug level logging to make debugging easier in the future, for example we add reservation info when applicable. ``` 17/09/29 14:53:07 DEBUG MesosCoarseGrainedSchedulerBackend: Accepting offer: f20de49b-dee3-45dd-a3c1-73418b7de891-O32 with attributes: Map() allocation info: role: "spark-prive" reservation info: name: "ports" type: RANGES ranges { range { begin: 31000 end: 32000 } } role: "spark-prive" reservation { principal: "test" } allocation_info { role: "spark-prive" } ``` - Some style cleanup. ## How was this patch tested? Manually by running the example in the ticket with and without a principal. Specifically I tested it on a dc/os 1.10 cluster with 7 nodes and played with reservations. From the master node in order to reserve resources I executed: ```for i in 0 1 2 3 4 5 6 do curl -i \ -d slaveId=90ec65ea-1f7b-479f-a824-35d2527d6d26-S$i \ -d resources='[ { "name": "cpus", "type": "SCALAR", "scalar": { "value": 2 }, "role": "spark-role", "reservation": { "principal": "" } }, { "name": "mem", "type": "SCALAR", "scalar": { "value": 8026 }, "role": "spark-role", "reservation": { "principal": "" } } ]' \ -X POST http://master.mesos:5050/master/reserve done ``` Nodes had 4 cpus (m3.xlarge instances) and I reserved either 2 or 4 cpus (all for a role). I verified it launches tasks on nodes with reserved resources under `spark-role` role only if a) there are remaining resources for (*) default role and the spark driver has no role assigned to it. b) the spark driver has a role assigned to it and it is the same role used in reservations. I also tested this locally on my machine. Author: Stavros Kontopoulos Closes #19390 from skonto/fix_dynamic_reservation. 
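For reviewers who want the shape of the fix without reading the whole diff: the core idea is that a resource carved out of an offer has to keep the offer's role and reservation info, otherwise Mesos stops accounting for it as reserved. Below is a minimal, self-contained sketch of that idea; the object and method names (`ReservationSketch`, `reservedScalar`) are made up for illustration, while the real helpers added by this patch are `createResource` and `setReservationInfo` in `MesosSchedulerUtils`.

```scala
import org.apache.mesos.Protos.{Resource, Value}
import org.apache.mesos.Protos.Resource.ReservationInfo

object ReservationSketch {
  // Illustrative helper: carve `amount` out of an offered scalar resource while
  // keeping the offer's role and reservation info, so Mesos still treats the
  // result as a reserved resource. Reservation info is only attached for
  // non-default roles, mirroring setReservationInfo in MesosSchedulerUtils.
  def reservedScalar(
      name: String,
      amount: Double,
      role: Option[String],
      reservation: Option[ReservationInfo]): Resource = {
    val builder = Resource.newBuilder()
      .setName(name)
      .setType(Value.Type.SCALAR)
      .setScalar(Value.Scalar.newBuilder().setValue(amount).build())
    role.foreach { r => builder.setRole(r) }
    if (!role.contains("*")) {
      reservation.foreach { res => builder.setReservation(res) }
    }
    builder.build()
  }
}
```

This is why the patch threads `ReservationInfo` through `createResource`, the resource partitioning and the port handling instead of only copying the role.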
--- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- resource-managers/mesos/pom.xml | 2 +- .../cluster/mesos/MesosClusterScheduler.scala | 1 - .../MesosCoarseGrainedSchedulerBackend.scala | 17 +++- .../cluster/mesos/MesosSchedulerUtils.scala | 99 ++++++++++++------- 6 files changed, 80 insertions(+), 43 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 21c8a75796387..50ac6d139bbd4 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -138,7 +138,7 @@ lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar mail-1.4.7.jar -mesos-1.3.0-shaded-protobuf.jar +mesos-1.4.0-shaded-protobuf.jar metrics-core-3.1.5.jar metrics-graphite-3.1.5.jar metrics-json-3.1.5.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 7173426c7bf74..1b1e3166d53db 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -139,7 +139,7 @@ lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar mail-1.4.7.jar -mesos-1.3.0-shaded-protobuf.jar +mesos-1.4.0-shaded-protobuf.jar metrics-core-3.1.5.jar metrics-graphite-3.1.5.jar metrics-json-3.1.5.jar diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index de8f1c913651d..70d0c1750b14e 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -29,7 +29,7 @@ Spark Project Mesos mesos - 1.3.0 + 1.4.0 shaded-protobuf diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala index c41283e4a3e39..d224a7325820a 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosClusterScheduler.scala @@ -36,7 +36,6 @@ import org.apache.spark.deploy.rest.{CreateSubmissionResponse, KillSubmissionRes import org.apache.spark.metrics.MetricsSystem import org.apache.spark.util.Utils - /** * Tracks the current state of a Mesos Task that runs a Spark driver. * @param driverDescription Submitted driver description from diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala index c392061fdb358..191415a2578b2 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosCoarseGrainedSchedulerBackend.scala @@ -400,13 +400,20 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( val offerMem = getResource(offer.getResourcesList, "mem") val offerCpus = getResource(offer.getResourcesList, "cpus") val offerPorts = getRangeResource(offer.getResourcesList, "ports") + val offerReservationInfo = offer + .getResourcesList + .asScala + .find { r => r.getReservation != null } val id = offer.getId.getValue if (tasks.contains(offer.getId)) { // accept val offerTasks = tasks(offer.getId) logDebug(s"Accepting offer: $id with attributes: $offerAttributes " + - s"mem: $offerMem cpu: $offerCpus ports: $offerPorts." 
+ + offerReservationInfo.map(resInfo => + s"reservation info: ${resInfo.getReservation.toString}").getOrElse("") + + s"mem: $offerMem cpu: $offerCpus ports: $offerPorts " + + s"resources: ${offer.getResourcesList.asScala.mkString(",")}." + s" Launching ${offerTasks.size} Mesos tasks.") for (task <- offerTasks) { @@ -416,7 +423,7 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( val ports = getRangeResource(task.getResourcesList, "ports").mkString(",") logDebug(s"Launching Mesos task: ${taskId.getValue} with mem: $mem cpu: $cpus" + - s" ports: $ports") + s" ports: $ports" + s" on slave with slave id: ${task.getSlaveId.getValue} ") } driver.launchTasks( @@ -431,7 +438,8 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( } else { declineOffer( driver, - offer) + offer, + Some("Offer was declined due to unmet task launch constraints.")) } } } @@ -513,6 +521,9 @@ private[spark] class MesosCoarseGrainedSchedulerBackend( totalGpusAcquired += taskGPUs gpusByTaskId(taskId) = taskGPUs } + } else { + logDebug(s"Cannot launch a task for offer with id: $offerId on slave " + + s"with id: $slaveId. Requirements were not met for this offer.") } } } diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala index 6fcb30af8a733..e75450369ad85 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerUtils.scala @@ -28,7 +28,8 @@ import com.google.common.base.Splitter import org.apache.mesos.{MesosSchedulerDriver, Protos, Scheduler, SchedulerDriver} import org.apache.mesos.Protos.{TaskState => MesosTaskState, _} import org.apache.mesos.Protos.FrameworkInfo.Capability -import org.apache.mesos.protobuf.{ByteString, GeneratedMessage} +import org.apache.mesos.Protos.Resource.ReservationInfo +import org.apache.mesos.protobuf.{ByteString, GeneratedMessageV3} import org.apache.spark.{SparkConf, SparkContext, SparkException} import org.apache.spark.TaskState @@ -36,8 +37,6 @@ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.util.Utils - - /** * Shared trait for implementing a Mesos Scheduler. This holds common state and helper * methods and Mesos scheduler will use. @@ -46,6 +45,8 @@ trait MesosSchedulerUtils extends Logging { // Lock used to wait for scheduler to be registered private final val registerLatch = new CountDownLatch(1) + private final val ANY_ROLE = "*" + /** * Creates a new MesosSchedulerDriver that communicates to the Mesos master. 
* @@ -175,17 +176,36 @@ trait MesosSchedulerUtils extends Logging { registerLatch.countDown() } - def createResource(name: String, amount: Double, role: Option[String] = None): Resource = { + private def setReservationInfo( + reservationInfo: Option[ReservationInfo], + role: Option[String], + builder: Resource.Builder): Unit = { + if (!role.contains(ANY_ROLE)) { + reservationInfo.foreach { res => builder.setReservation(res) } + } + } + + def createResource( + name: String, + amount: Double, + role: Option[String] = None, + reservationInfo: Option[ReservationInfo] = None): Resource = { val builder = Resource.newBuilder() .setName(name) .setType(Value.Type.SCALAR) .setScalar(Value.Scalar.newBuilder().setValue(amount).build()) - role.foreach { r => builder.setRole(r) } - + setReservationInfo(reservationInfo, role, builder) builder.build() } + private def getReservation(resource: Resource): Option[ReservationInfo] = { + if (resource.hasReservation) { + Some(resource.getReservation) + } else { + None + } + } /** * Partition the existing set of resources into two groups, those remaining to be * scheduled and those requested to be used for a new task. @@ -203,14 +223,17 @@ trait MesosSchedulerUtils extends Logging { var requestedResources = new ArrayBuffer[Resource] val remainingResources = resources.asScala.map { case r => + val reservation = getReservation(r) if (remain > 0 && r.getType == Value.Type.SCALAR && r.getScalar.getValue > 0.0 && r.getName == resourceName) { val usage = Math.min(remain, r.getScalar.getValue) - requestedResources += createResource(resourceName, usage, Some(r.getRole)) + requestedResources += createResource(resourceName, usage, + Option(r.getRole), reservation) remain -= usage - createResource(resourceName, r.getScalar.getValue - usage, Some(r.getRole)) + createResource(resourceName, r.getScalar.getValue - usage, + Option(r.getRole), reservation) } else { r } @@ -228,16 +251,6 @@ trait MesosSchedulerUtils extends Logging { (attr.getName, attr.getText.getValue.split(',').toSet) } - - /** Build a Mesos resource protobuf object */ - protected def createResource(resourceName: String, quantity: Double): Protos.Resource = { - Resource.newBuilder() - .setName(resourceName) - .setType(Value.Type.SCALAR) - .setScalar(Value.Scalar.newBuilder().setValue(quantity).build()) - .build() - } - /** * Converts the attributes from the resource offer into a Map of name to Attribute Value * The attribute values are the mesos attribute types and they are @@ -245,7 +258,8 @@ trait MesosSchedulerUtils extends Logging { * @param offerAttributes the attributes offered * @return */ - protected def toAttributeMap(offerAttributes: JList[Attribute]): Map[String, GeneratedMessage] = { + protected def toAttributeMap(offerAttributes: JList[Attribute]) + : Map[String, GeneratedMessageV3] = { offerAttributes.asScala.map { attr => val attrValue = attr.getType match { case Value.Type.SCALAR => attr.getScalar @@ -266,7 +280,7 @@ trait MesosSchedulerUtils extends Logging { */ def matchesAttributeRequirements( slaveOfferConstraints: Map[String, Set[String]], - offerAttributes: Map[String, GeneratedMessage]): Boolean = { + offerAttributes: Map[String, GeneratedMessageV3]): Boolean = { slaveOfferConstraints.forall { // offer has the required attribute and subsumes the required values for that attribute case (name, requiredValues) => @@ -427,10 +441,10 @@ trait MesosSchedulerUtils extends Logging { // partition port offers val (resourcesWithoutPorts, portResources) = filterPortResources(offeredResources) - val 
portsAndRoles = requestedPorts. - map(x => (x, findPortAndGetAssignedRangeRole(x, portResources))) + val portsAndResourceInfo = requestedPorts. + map { x => (x, findPortAndGetAssignedResourceInfo(x, portResources)) } - val assignedPortResources = createResourcesFromPorts(portsAndRoles) + val assignedPortResources = createResourcesFromPorts(portsAndResourceInfo) // ignore non-assigned port resources, they will be declined implicitly by mesos // no need for splitting port resources. @@ -450,16 +464,25 @@ trait MesosSchedulerUtils extends Logging { managedPortNames.map(conf.getLong(_, 0)).filter( _ != 0) } + private case class RoleResourceInfo( + role: String, + resInfo: Option[ReservationInfo]) + /** Creates a mesos resource for a specific port number. */ - private def createResourcesFromPorts(portsAndRoles: List[(Long, String)]) : List[Resource] = { - portsAndRoles.flatMap{ case (port, role) => - createMesosPortResource(List((port, port)), Some(role))} + private def createResourcesFromPorts( + portsAndResourcesInfo: List[(Long, RoleResourceInfo)]) + : List[Resource] = { + portsAndResourcesInfo.flatMap { case (port, rInfo) => + createMesosPortResource(List((port, port)), Option(rInfo.role), rInfo.resInfo)} } /** Helper to create mesos resources for specific port ranges. */ private def createMesosPortResource( ranges: List[(Long, Long)], - role: Option[String] = None): List[Resource] = { + role: Option[String] = None, + reservationInfo: Option[ReservationInfo] = None): List[Resource] = { + // for ranges we are going to use (user defined ports fall in there) create mesos resources + // for each range there is a role associated with it. ranges.map { case (rangeStart, rangeEnd) => val rangeValue = Value.Range.newBuilder() .setBegin(rangeStart) @@ -468,7 +491,8 @@ trait MesosSchedulerUtils extends Logging { .setName("ports") .setType(Value.Type.RANGES) .setRanges(Value.Ranges.newBuilder().addRange(rangeValue)) - role.foreach(r => builder.setRole(r)) + role.foreach { r => builder.setRole(r) } + setReservationInfo(reservationInfo, role, builder) builder.build() } } @@ -477,19 +501,21 @@ trait MesosSchedulerUtils extends Logging { * Helper to assign a port to an offered range and get the latter's role * info to use it later on. */ - private def findPortAndGetAssignedRangeRole(port: Long, portResources: List[Resource]) - : String = { + private def findPortAndGetAssignedResourceInfo(port: Long, portResources: List[Resource]) + : RoleResourceInfo = { val ranges = portResources. 
- map(resource => - (resource.getRole, resource.getRanges.getRangeList.asScala - .map(r => (r.getBegin, r.getEnd)).toList)) + map { resource => + val reservation = getReservation(resource) + (RoleResourceInfo(resource.getRole, reservation), + resource.getRanges.getRangeList.asScala.map(r => (r.getBegin, r.getEnd)).toList) + } - val rangePortRole = ranges - .find { case (role, rangeList) => rangeList + val rangePortResourceInfo = ranges + .find { case (resourceInfo, rangeList) => rangeList .exists{ case (rangeStart, rangeEnd) => rangeStart <= port & rangeEnd >= port}} // this is safe since we have previously checked about the ranges (see checkPorts method) - rangePortRole.map{ case (role, rangeList) => role}.get + rangePortResourceInfo.map{ case (resourceInfo, rangeList) => resourceInfo}.get } /** Retrieves the port resources from a list of mesos offered resources */ @@ -564,3 +590,4 @@ trait MesosSchedulerUtils extends Logging { } } } + From 8ff474f6e543203fac5d49af7fbe98a8a98da567 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 29 Nov 2017 14:34:41 -0800 Subject: [PATCH 008/356] [SPARK-20650][CORE] Remove JobProgressListener. The only remaining use of this class was the SparkStatusTracker, which was modified to use the new status store. The test code to wait for executors was moved to TestUtils and now uses the SparkStatusTracker API. Indirectly, ConsoleProgressBar also uses this data. Because it has some lower latency requirements, a shortcut to efficiently get the active stages from the active listener was added to the AppStatusStore. Now that all UI code goes through the status store to get its data, the FsHistoryProvider can be cleaned up to only replay event logs when needed - that is, when there is no pre-existing disk store for the application. As part of this change I also modified the streaming UI to read the needed data from the store, which was missed in the previous patch that made JobProgressListener redundant. Author: Marcelo Vanzin Closes #19750 from vanzin/SPARK-20650.
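The public `SparkStatusTracker` API itself does not change here, only where it reads its data from. A small usage sketch follows (illustrative only, not part of this patch; the object name, app name and `local[2]` master are made up) showing the kind of caller that keeps working unchanged:

```scala
import org.apache.spark.{SparkConf, SparkContext}

object StatusTrackerSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("status-tracker-sketch").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      // Run a trivial job so the tracker has something to report.
      sc.parallelize(1 to 1000, numSlices = 10).map(_ * 2).count()

      val tracker = sc.statusTracker
      // Active stages (likely empty once the job above has finished; shown only
      // to illustrate the calls, which are now answered from the status store).
      tracker.getActiveStageIds().flatMap(tracker.getStageInfo).foreach { s =>
        println(s"stage ${s.stageId()}: ${s.numCompletedTasks()}/${s.numTasks()} tasks complete")
      }
      // Executor information, including the number of running tasks per executor.
      tracker.getExecutorInfos.foreach { e =>
        println(s"executor ${e.host()}:${e.port()} running ${e.numRunningTasks()} tasks")
      }
    } finally {
      sc.stop()
    }
  }
}
```

Before this patch these calls were served from JobProgressListener's internal maps; afterwards they are answered from the app status store, which is the same data the UI and REST API read.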
--- .../scala/org/apache/spark/SparkContext.scala | 11 +- .../org/apache/spark/SparkStatusTracker.scala | 76 +-- .../scala/org/apache/spark/TestUtils.scala | 26 +- .../deploy/history/FsHistoryProvider.scala | 65 +- .../spark/status/AppStatusListener.scala | 51 +- .../apache/spark/status/AppStatusStore.scala | 17 +- .../org/apache/spark/status/LiveEntity.scala | 8 +- .../spark/status/api/v1/StagesResource.scala | 1 - .../apache/spark/ui/ConsoleProgressBar.scala | 18 +- .../spark/ui/jobs/JobProgressListener.scala | 612 ------------------ .../org/apache/spark/ui/jobs/StagePage.scala | 1 - .../org/apache/spark/ui/jobs/UIData.scala | 311 --------- .../org/apache/spark/DistributedSuite.scala | 2 +- .../spark/ExternalShuffleServiceSuite.scala | 2 +- .../org/apache/spark/StatusTrackerSuite.scala | 6 +- .../spark/broadcast/BroadcastSuite.scala | 2 +- .../spark/scheduler/DAGSchedulerSuite.scala | 4 +- .../SparkListenerWithClusterSuite.scala | 4 +- .../ui/jobs/JobProgressListenerSuite.scala | 442 ------------- project/MimaExcludes.scala | 2 + .../apache/spark/streaming/ui/BatchPage.scala | 75 ++- 21 files changed, 208 insertions(+), 1528 deletions(-) delete mode 100644 core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala delete mode 100644 core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala delete mode 100644 core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 23fd54f59268a..984dd0a6629a2 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -58,7 +58,6 @@ import org.apache.spark.status.{AppStatusPlugin, AppStatusStore} import org.apache.spark.storage._ import org.apache.spark.storage.BlockManagerMessages.TriggerThreadDump import org.apache.spark.ui.{ConsoleProgressBar, SparkUI} -import org.apache.spark.ui.jobs.JobProgressListener import org.apache.spark.util._ /** @@ -195,7 +194,6 @@ class SparkContext(config: SparkConf) extends Logging { private var _eventLogCodec: Option[String] = None private var _listenerBus: LiveListenerBus = _ private var _env: SparkEnv = _ - private var _jobProgressListener: JobProgressListener = _ private var _statusTracker: SparkStatusTracker = _ private var _progressBar: Option[ConsoleProgressBar] = None private var _ui: Option[SparkUI] = None @@ -270,8 +268,6 @@ class SparkContext(config: SparkConf) extends Logging { val map: ConcurrentMap[Int, RDD[_]] = new MapMaker().weakValues().makeMap[Int, RDD[_]]() map.asScala } - private[spark] def jobProgressListener: JobProgressListener = _jobProgressListener - def statusTracker: SparkStatusTracker = _statusTracker private[spark] def progressBar: Option[ConsoleProgressBar] = _progressBar @@ -421,11 +417,6 @@ class SparkContext(config: SparkConf) extends Logging { _listenerBus = new LiveListenerBus(_conf) - // "_jobProgressListener" should be set up before creating SparkEnv because when creating - // "SparkEnv", some messages will be posted to "listenerBus" and we should not miss them. - _jobProgressListener = new JobProgressListener(_conf) - listenerBus.addToStatusQueue(jobProgressListener) - // Initialize the app status store and listener before SparkEnv is created so that it gets // all events. 
_statusStore = AppStatusStore.createLiveStore(conf, l => listenerBus.addToStatusQueue(l)) @@ -440,7 +431,7 @@ class SparkContext(config: SparkConf) extends Logging { _conf.set("spark.repl.class.uri", replUri) } - _statusTracker = new SparkStatusTracker(this) + _statusTracker = new SparkStatusTracker(this, _statusStore) _progressBar = if (_conf.get(UI_SHOW_CONSOLE_PROGRESS) && !log.isInfoEnabled) { diff --git a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala index 22a553e68439a..70865cb58c571 100644 --- a/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala +++ b/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala @@ -17,7 +17,10 @@ package org.apache.spark -import org.apache.spark.scheduler.TaskSchedulerImpl +import java.util.Arrays + +import org.apache.spark.status.AppStatusStore +import org.apache.spark.status.api.v1.StageStatus /** * Low-level status reporting APIs for monitoring job and stage progress. @@ -33,9 +36,7 @@ import org.apache.spark.scheduler.TaskSchedulerImpl * * NOTE: this class's constructor should be considered private and may be subject to change. */ -class SparkStatusTracker private[spark] (sc: SparkContext) { - - private val jobProgressListener = sc.jobProgressListener +class SparkStatusTracker private[spark] (sc: SparkContext, store: AppStatusStore) { /** * Return a list of all known jobs in a particular job group. If `jobGroup` is `null`, then @@ -46,9 +47,8 @@ class SparkStatusTracker private[spark] (sc: SparkContext) { * its result. */ def getJobIdsForGroup(jobGroup: String): Array[Int] = { - jobProgressListener.synchronized { - jobProgressListener.jobGroupToJobIds.getOrElse(jobGroup, Seq.empty).toArray - } + val expected = Option(jobGroup) + store.jobsList(null).filter(_.jobGroup == expected).map(_.jobId).toArray } /** @@ -57,9 +57,7 @@ class SparkStatusTracker private[spark] (sc: SparkContext) { * This method does not guarantee the order of the elements in its result. */ def getActiveStageIds(): Array[Int] = { - jobProgressListener.synchronized { - jobProgressListener.activeStages.values.map(_.stageId).toArray - } + store.stageList(Arrays.asList(StageStatus.ACTIVE)).map(_.stageId).toArray } /** @@ -68,19 +66,15 @@ class SparkStatusTracker private[spark] (sc: SparkContext) { * This method does not guarantee the order of the elements in its result. */ def getActiveJobIds(): Array[Int] = { - jobProgressListener.synchronized { - jobProgressListener.activeJobs.values.map(_.jobId).toArray - } + store.jobsList(Arrays.asList(JobExecutionStatus.RUNNING)).map(_.jobId).toArray } /** * Returns job information, or `None` if the job info could not be found or was garbage collected. */ def getJobInfo(jobId: Int): Option[SparkJobInfo] = { - jobProgressListener.synchronized { - jobProgressListener.jobIdToData.get(jobId).map { data => - new SparkJobInfoImpl(jobId, data.stageIds.toArray, data.status) - } + store.asOption(store.job(jobId)).map { job => + new SparkJobInfoImpl(jobId, job.stageIds.toArray, job.status) } } @@ -89,21 +83,16 @@ class SparkStatusTracker private[spark] (sc: SparkContext) { * garbage collected. 
*/ def getStageInfo(stageId: Int): Option[SparkStageInfo] = { - jobProgressListener.synchronized { - for ( - info <- jobProgressListener.stageIdToInfo.get(stageId); - data <- jobProgressListener.stageIdToData.get((stageId, info.attemptId)) - ) yield { - new SparkStageInfoImpl( - stageId, - info.attemptId, - info.submissionTime.getOrElse(0), - info.name, - info.numTasks, - data.numActiveTasks, - data.numCompleteTasks, - data.numFailedTasks) - } + store.asOption(store.lastStageAttempt(stageId)).map { stage => + new SparkStageInfoImpl( + stageId, + stage.attemptId, + stage.submissionTime.map(_.getTime()).getOrElse(0L), + stage.name, + stage.numTasks, + stage.numActiveTasks, + stage.numCompleteTasks, + stage.numFailedTasks) } } @@ -111,17 +100,20 @@ class SparkStatusTracker private[spark] (sc: SparkContext) { * Returns information of all known executors, including host, port, cacheSize, numRunningTasks. */ def getExecutorInfos: Array[SparkExecutorInfo] = { - val executorIdToRunningTasks: Map[String, Int] = - sc.taskScheduler.asInstanceOf[TaskSchedulerImpl].runningTasksByExecutors + store.executorList(true).map { exec => + val (host, port) = exec.hostPort.split(":", 2) match { + case Array(h, p) => (h, p.toInt) + case Array(h) => (h, -1) + } + val cachedMem = exec.memoryMetrics.map { mem => + mem.usedOnHeapStorageMemory + mem.usedOffHeapStorageMemory + }.getOrElse(0L) - sc.getExecutorStorageStatus.map { status => - val bmId = status.blockManagerId new SparkExecutorInfoImpl( - bmId.host, - bmId.port, - status.cacheSize, - executorIdToRunningTasks.getOrElse(bmId.executorId, 0) - ) - } + host, + port, + cachedMem, + exec.activeTasks) + }.toArray } } diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index a80016dd22fc5..93e7ee3d2a404 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -23,7 +23,7 @@ import java.nio.charset.StandardCharsets import java.security.SecureRandom import java.security.cert.X509Certificate import java.util.Arrays -import java.util.concurrent.{CountDownLatch, TimeUnit} +import java.util.concurrent.{CountDownLatch, TimeoutException, TimeUnit} import java.util.jar.{JarEntry, JarOutputStream} import javax.net.ssl._ import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} @@ -232,6 +232,30 @@ private[spark] object TestUtils { } } + /** + * Wait until at least `numExecutors` executors are up, or throw `TimeoutException` if the waiting + * time elapsed before `numExecutors` executors up. Exposed for testing. + * + * @param numExecutors the number of executors to wait at least + * @param timeout time to wait in milliseconds + */ + private[spark] def waitUntilExecutorsUp( + sc: SparkContext, + numExecutors: Int, + timeout: Long): Unit = { + val finishTime = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeout) + while (System.nanoTime() < finishTime) { + if (sc.statusTracker.getExecutorInfos.length > numExecutors) { + return + } + // Sleep rather than using wait/notify, because this is used only for testing and wait/notify + // add overhead in the general case. 
+ Thread.sleep(10) + } + throw new TimeoutException( + s"Can't find $numExecutors executors before $timeout milliseconds elapsed") + } + } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 69ccde3a8149d..6a83c106f6d84 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -299,8 +299,6 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) attempt.adminAclsGroups.getOrElse("")) secManager.setViewAclsGroups(attempt.viewAclsGroups.getOrElse("")) - val replayBus = new ReplayListenerBus() - val uiStorePath = storePath.map { path => getStorePath(path, appId, attemptId) } val (kvstore, needReplay) = uiStorePath match { @@ -320,48 +318,43 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) (new InMemoryStore(), true) } - val listener = if (needReplay) { - val _listener = new AppStatusListener(kvstore, conf, false, + if (needReplay) { + val replayBus = new ReplayListenerBus() + val listener = new AppStatusListener(kvstore, conf, false, lastUpdateTime = Some(attempt.info.lastUpdated.getTime())) - replayBus.addListener(_listener) + replayBus.addListener(listener) AppStatusPlugin.loadPlugins().foreach { plugin => plugin.setupListeners(conf, kvstore, l => replayBus.addListener(l), false) } - Some(_listener) - } else { - None + try { + val fileStatus = fs.getFileStatus(new Path(logDir, attempt.logPath)) + replay(fileStatus, isApplicationCompleted(fileStatus), replayBus) + listener.flush() + } catch { + case e: Exception => + try { + kvstore.close() + } catch { + case _e: Exception => logInfo("Error closing store.", _e) + } + uiStorePath.foreach(Utils.deleteRecursively) + if (e.isInstanceOf[FileNotFoundException]) { + return None + } else { + throw e + } + } } - val loadedUI = { - val ui = SparkUI.create(None, new AppStatusStore(kvstore), conf, secManager, app.info.name, - HistoryServer.getAttemptURI(appId, attempt.info.attemptId), - attempt.info.startTime.getTime(), - attempt.info.appSparkVersion) - LoadedAppUI(ui) + val ui = SparkUI.create(None, new AppStatusStore(kvstore), conf, secManager, app.info.name, + HistoryServer.getAttemptURI(appId, attempt.info.attemptId), + attempt.info.startTime.getTime(), + attempt.info.appSparkVersion) + AppStatusPlugin.loadPlugins().foreach { plugin => + plugin.setupUI(ui) } - try { - AppStatusPlugin.loadPlugins().foreach { plugin => - plugin.setupUI(loadedUI.ui) - } - - val fileStatus = fs.getFileStatus(new Path(logDir, attempt.logPath)) - replay(fileStatus, isApplicationCompleted(fileStatus), replayBus) - listener.foreach(_.flush()) - } catch { - case e: Exception => - try { - kvstore.close() - } catch { - case _e: Exception => logInfo("Error closing store.", _e) - } - uiStorePath.foreach(Utils.deleteRecursively) - if (e.isInstanceOf[FileNotFoundException]) { - return None - } else { - throw e - } - } + val loadedUI = LoadedAppUI(ui) synchronized { activeUIs((appId, attemptId)) = loadedUI diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index f2d8e0a5480ba..9c23d9d8c923a 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -18,7 +18,10 @@ package org.apache.spark.status import 
java.util.Date +import java.util.concurrent.ConcurrentHashMap +import java.util.function.Function +import scala.collection.JavaConverters._ import scala.collection.mutable.HashMap import org.apache.spark._ @@ -59,7 +62,7 @@ private[spark] class AppStatusListener( // Keep track of live entities, so that task metrics can be efficiently updated (without // causing too many writes to the underlying store, and other expensive operations). - private val liveStages = new HashMap[(Int, Int), LiveStage]() + private val liveStages = new ConcurrentHashMap[(Int, Int), LiveStage]() private val liveJobs = new HashMap[Int, LiveJob]() private val liveExecutors = new HashMap[String, LiveExecutor]() private val liveTasks = new HashMap[Long, LiveTask]() @@ -268,13 +271,15 @@ private[spark] class AppStatusListener( val now = System.nanoTime() // Check if there are any pending stages that match this job; mark those as skipped. - job.stageIds.foreach { sid => - val pending = liveStages.filter { case ((id, _), _) => id == sid } - pending.foreach { case (key, stage) => + val it = liveStages.entrySet.iterator() + while (it.hasNext()) { + val e = it.next() + if (job.stageIds.contains(e.getKey()._1)) { + val stage = e.getValue() stage.status = v1.StageStatus.SKIPPED job.skippedStages += stage.info.stageId job.skippedTasks += stage.info.numTasks - liveStages.remove(key) + it.remove() update(stage, now) } } @@ -336,7 +341,7 @@ private[spark] class AppStatusListener( liveTasks.put(event.taskInfo.taskId, task) liveUpdate(task, now) - liveStages.get((event.stageId, event.stageAttemptId)).foreach { stage => + Option(liveStages.get((event.stageId, event.stageAttemptId))).foreach { stage => stage.activeTasks += 1 stage.firstLaunchTime = math.min(stage.firstLaunchTime, event.taskInfo.launchTime) maybeUpdate(stage, now) @@ -403,7 +408,7 @@ private[spark] class AppStatusListener( (0, 1, 0) } - liveStages.get((event.stageId, event.stageAttemptId)).foreach { stage => + Option(liveStages.get((event.stageId, event.stageAttemptId))).foreach { stage => if (metricsDelta != null) { stage.metrics.update(metricsDelta) } @@ -466,12 +471,19 @@ private[spark] class AppStatusListener( } } - maybeUpdate(exec, now) + // Force an update on live applications when the number of active tasks reaches 0. This is + // checked in some tests (e.g. SQLTestUtilsBase) so it needs to be reliably up to date. + if (exec.activeTasks == 0) { + liveUpdate(exec, now) + } else { + maybeUpdate(exec, now) + } } } override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { - liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptId)).foreach { stage => + val maybeStage = Option(liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptId))) + maybeStage.foreach { stage => val now = System.nanoTime() stage.info = event.stageInfo @@ -540,7 +552,7 @@ private[spark] class AppStatusListener( val delta = task.updateMetrics(metrics) maybeUpdate(task, now) - liveStages.get((sid, sAttempt)).foreach { stage => + Option(liveStages.get((sid, sAttempt))).foreach { stage => stage.metrics.update(delta) maybeUpdate(stage, now) @@ -563,7 +575,7 @@ private[spark] class AppStatusListener( /** Flush all live entities' data to the underlying store. 
*/ def flush(): Unit = { val now = System.nanoTime() - liveStages.values.foreach { stage => + liveStages.values.asScala.foreach { stage => update(stage, now) stage.executorSummaries.values.foreach(update(_, now)) } @@ -574,6 +586,18 @@ private[spark] class AppStatusListener( pools.values.foreach(update(_, now)) } + /** + * Shortcut to get active stages quickly in a live application, for use by the console + * progress bar. + */ + def activeStages(): Seq[v1.StageData] = { + liveStages.values.asScala + .filter(_.info.submissionTime.isDefined) + .map(_.toApi()) + .toList + .sortBy(_.stageId) + } + private def updateRDDBlock(event: SparkListenerBlockUpdated, block: RDDBlockId): Unit = { val now = System.nanoTime() val executorId = event.blockUpdatedInfo.blockManagerId.executorId @@ -708,7 +732,10 @@ private[spark] class AppStatusListener( } private def getOrCreateStage(info: StageInfo): LiveStage = { - val stage = liveStages.getOrElseUpdate((info.stageId, info.attemptId), new LiveStage()) + val stage = liveStages.computeIfAbsent((info.stageId, info.attemptId), + new Function[(Int, Int), LiveStage]() { + override def apply(key: (Int, Int)): LiveStage = new LiveStage() + }) stage.info = info stage } diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index d0615e5dd0223..22d768b3cb990 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -32,7 +32,9 @@ import org.apache.spark.util.kvstore.{InMemoryStore, KVStore} /** * A wrapper around a KVStore that provides methods for accessing the API data stored within. */ -private[spark] class AppStatusStore(val store: KVStore) { +private[spark] class AppStatusStore( + val store: KVStore, + listener: Option[AppStatusListener] = None) { def applicationInfo(): v1.ApplicationInfo = { store.view(classOf[ApplicationInfoWrapper]).max(1).iterator().next().info @@ -70,6 +72,14 @@ private[spark] class AppStatusStore(val store: KVStore) { store.read(classOf[ExecutorSummaryWrapper], executorId).info } + /** + * This is used by ConsoleProgressBar to quickly fetch active stages for drawing the progress + * bar. It will only return anything useful when called from a live application. 
+ */ + def activeStages(): Seq[v1.StageData] = { + listener.map(_.activeStages()).getOrElse(Nil) + } + def stageList(statuses: JList[v1.StageStatus]): Seq[v1.StageData] = { val it = store.view(classOf[StageDataWrapper]).reverse().asScala.map(_.info) if (statuses != null && !statuses.isEmpty()) { @@ -338,11 +348,12 @@ private[spark] object AppStatusStore { */ def createLiveStore(conf: SparkConf, addListenerFn: SparkListener => Unit): AppStatusStore = { val store = new InMemoryStore() - addListenerFn(new AppStatusListener(store, conf, true)) + val listener = new AppStatusListener(store, conf, true) + addListenerFn(listener) AppStatusPlugin.loadPlugins().foreach { p => p.setupListeners(conf, store, addListenerFn, true) } - new AppStatusStore(store) + new AppStatusStore(store, listener = Some(listener)) } } diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala index ef2936c9b69a4..983c58a607aa8 100644 --- a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -408,8 +408,8 @@ private class LiveStage extends LiveEntity { new LiveExecutorStageSummary(info.stageId, info.attemptId, executorId)) } - override protected def doUpdate(): Any = { - val update = new v1.StageData( + def toApi(): v1.StageData = { + new v1.StageData( status, info.stageId, info.attemptId, @@ -449,8 +449,10 @@ private class LiveStage extends LiveEntity { None, None, killedSummary) + } - new StageDataWrapper(update, jobIds) + override protected def doUpdate(): Any = { + new StageDataWrapper(toApi(), jobIds) } } diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala index bd4dfe3c68885..b3561109bc636 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala @@ -25,7 +25,6 @@ import org.apache.spark.scheduler.StageInfo import org.apache.spark.status.api.v1.StageStatus._ import org.apache.spark.status.api.v1.TaskSorting._ import org.apache.spark.ui.SparkUI -import org.apache.spark.ui.jobs.UIData.StageUIData @Produces(Array(MediaType.APPLICATION_JSON)) private[v1] class StagesResource extends BaseAppResource { diff --git a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala index 3ae80ecfd22e6..3c4ee4eb6bbb9 100644 --- a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala +++ b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala @@ -21,10 +21,11 @@ import java.util.{Timer, TimerTask} import org.apache.spark._ import org.apache.spark.internal.Logging +import org.apache.spark.status.api.v1.StageData /** * ConsoleProgressBar shows the progress of stages in the next line of the console. It poll the - * status of active stages from `sc.statusTracker` periodically, the progress bar will be showed + * status of active stages from the app state store periodically, the progress bar will be showed * up after the stage has ran at least 500ms. If multiple stages run in the same time, the status * of them will be combined together, showed in one line. 
*/ @@ -64,9 +65,8 @@ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { if (now - lastFinishTime < firstDelayMSec) { return } - val stageIds = sc.statusTracker.getActiveStageIds() - val stages = stageIds.flatMap(sc.statusTracker.getStageInfo).filter(_.numTasks() > 1) - .filter(now - _.submissionTime() > firstDelayMSec).sortBy(_.stageId()) + val stages = sc.statusStore.activeStages() + .filter { s => now - s.submissionTime.get.getTime() > firstDelayMSec } if (stages.length > 0) { show(now, stages.take(3)) // display at most 3 stages in same time } @@ -77,15 +77,15 @@ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { * after your last output, keeps overwriting itself to hold in one line. The logging will follow * the progress bar, then progress bar will be showed in next line without overwrite logs. */ - private def show(now: Long, stages: Seq[SparkStageInfo]) { + private def show(now: Long, stages: Seq[StageData]) { val width = TerminalWidth / stages.size val bar = stages.map { s => - val total = s.numTasks() - val header = s"[Stage ${s.stageId()}:" - val tailer = s"(${s.numCompletedTasks()} + ${s.numActiveTasks()}) / $total]" + val total = s.numTasks + val header = s"[Stage ${s.stageId}:" + val tailer = s"(${s.numCompleteTasks} + ${s.numActiveTasks}) / $total]" val w = width - header.length - tailer.length val bar = if (w > 0) { - val percent = w * s.numCompletedTasks() / total + val percent = w * s.numCompleteTasks / total (0 until w).map { i => if (i < percent) "=" else if (i == percent) ">" else " " }.mkString("") diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala deleted file mode 100644 index a18e86ec0a73b..0000000000000 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala +++ /dev/null @@ -1,612 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ui.jobs - -import java.util.concurrent.TimeoutException - -import scala.collection.mutable.{HashMap, HashSet, LinkedHashMap, ListBuffer} - -import org.apache.spark._ -import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.executor.TaskMetrics -import org.apache.spark.internal.Logging -import org.apache.spark.internal.config._ -import org.apache.spark.scheduler._ -import org.apache.spark.scheduler.SchedulingMode.SchedulingMode -import org.apache.spark.storage.BlockManagerId -import org.apache.spark.ui.SparkUI -import org.apache.spark.ui.jobs.UIData._ - -/** - * :: DeveloperApi :: - * Tracks task-level information to be displayed in the UI. 
- * - * All access to the data structures in this class must be synchronized on the - * class, since the UI thread and the EventBus loop may otherwise be reading and - * updating the internal data structures concurrently. - */ -@DeveloperApi -@deprecated("This class will be removed in a future release.", "2.2.0") -class JobProgressListener(conf: SparkConf) extends SparkListener with Logging { - - // Define a handful of type aliases so that data structures' types can serve as documentation. - // These type aliases are public because they're used in the types of public fields: - - type JobId = Int - type JobGroupId = String - type StageId = Int - type StageAttemptId = Int - type PoolName = String - type ExecutorId = String - - // Application: - @volatile var startTime = -1L - @volatile var endTime = -1L - - // Jobs: - val activeJobs = new HashMap[JobId, JobUIData] - val completedJobs = ListBuffer[JobUIData]() - val failedJobs = ListBuffer[JobUIData]() - val jobIdToData = new HashMap[JobId, JobUIData] - val jobGroupToJobIds = new HashMap[JobGroupId, HashSet[JobId]] - - // Stages: - val pendingStages = new HashMap[StageId, StageInfo] - val activeStages = new HashMap[StageId, StageInfo] - val completedStages = ListBuffer[StageInfo]() - val skippedStages = ListBuffer[StageInfo]() - val failedStages = ListBuffer[StageInfo]() - val stageIdToData = new HashMap[(StageId, StageAttemptId), StageUIData] - val stageIdToInfo = new HashMap[StageId, StageInfo] - val stageIdToActiveJobIds = new HashMap[StageId, HashSet[JobId]] - val poolToActiveStages = HashMap[PoolName, HashMap[StageId, StageInfo]]() - // Total of completed and failed stages that have ever been run. These may be greater than - // `completedStages.size` and `failedStages.size` if we have run more stages or jobs than - // JobProgressListener's retention limits. - var numCompletedStages = 0 - var numFailedStages = 0 - var numCompletedJobs = 0 - var numFailedJobs = 0 - - // Misc: - val executorIdToBlockManagerId = HashMap[ExecutorId, BlockManagerId]() - - def blockManagerIds: Seq[BlockManagerId] = executorIdToBlockManagerId.values.toSeq - - var schedulingMode: Option[SchedulingMode] = None - - // To limit the total memory usage of JobProgressListener, we only track information for a fixed - // number of non-active jobs and stages (there is no limit for active jobs and stages): - - val retainedStages = conf.getInt("spark.ui.retainedStages", SparkUI.DEFAULT_RETAINED_STAGES) - val retainedJobs = conf.getInt("spark.ui.retainedJobs", SparkUI.DEFAULT_RETAINED_JOBS) - val retainedTasks = conf.get(UI_RETAINED_TASKS) - - // We can test for memory leaks by ensuring that collections that track non-active jobs and - // stages do not grow without bound and that collections for active jobs/stages eventually become - // empty once Spark is idle. Let's partition our collections into ones that should be empty - // once Spark is idle and ones that should have a hard- or soft-limited sizes. - // These methods are used by unit tests, but they're defined here so that people don't forget to - // update the tests when adding new collections. 
Some collections have multiple levels of - // nesting, etc, so this lets us customize our notion of "size" for each structure: - - // These collections should all be empty once Spark is idle (no active stages / jobs): - private[spark] def getSizesOfActiveStateTrackingCollections: Map[String, Int] = { - Map( - "activeStages" -> activeStages.size, - "activeJobs" -> activeJobs.size, - "poolToActiveStages" -> poolToActiveStages.values.map(_.size).sum, - "stageIdToActiveJobIds" -> stageIdToActiveJobIds.values.map(_.size).sum - ) - } - - // These collections should stop growing once we have run at least `spark.ui.retainedStages` - // stages and `spark.ui.retainedJobs` jobs: - private[spark] def getSizesOfHardSizeLimitedCollections: Map[String, Int] = { - Map( - "completedJobs" -> completedJobs.size, - "failedJobs" -> failedJobs.size, - "completedStages" -> completedStages.size, - "skippedStages" -> skippedStages.size, - "failedStages" -> failedStages.size - ) - } - - // These collections may grow arbitrarily, but once Spark becomes idle they should shrink back to - // some bound based on the `spark.ui.retainedStages` and `spark.ui.retainedJobs` settings: - private[spark] def getSizesOfSoftSizeLimitedCollections: Map[String, Int] = { - Map( - "jobIdToData" -> jobIdToData.size, - "stageIdToData" -> stageIdToData.size, - "stageIdToStageInfo" -> stageIdToInfo.size, - "jobGroupToJobIds" -> jobGroupToJobIds.values.map(_.size).sum, - // Since jobGroupToJobIds is map of sets, check that we don't leak keys with empty values: - "jobGroupToJobIds keySet" -> jobGroupToJobIds.keys.size - ) - } - - /** If stages is too large, remove and garbage collect old stages */ - private def trimStagesIfNecessary(stages: ListBuffer[StageInfo]) = synchronized { - if (stages.size > retainedStages) { - val toRemove = calculateNumberToRemove(stages.size, retainedStages) - stages.take(toRemove).foreach { s => - stageIdToData.remove((s.stageId, s.attemptId)) - stageIdToInfo.remove(s.stageId) - } - stages.trimStart(toRemove) - } - } - - /** If jobs is too large, remove and garbage collect old jobs */ - private def trimJobsIfNecessary(jobs: ListBuffer[JobUIData]) = synchronized { - if (jobs.size > retainedJobs) { - val toRemove = calculateNumberToRemove(jobs.size, retainedJobs) - jobs.take(toRemove).foreach { job => - // Remove the job's UI data, if it exists - jobIdToData.remove(job.jobId).foreach { removedJob => - // A null jobGroupId is used for jobs that are run without a job group - val jobGroupId = removedJob.jobGroup.orNull - // Remove the job group -> job mapping entry, if it exists - jobGroupToJobIds.get(jobGroupId).foreach { jobsInGroup => - jobsInGroup.remove(job.jobId) - // If this was the last job in this job group, remove the map entry for the job group - if (jobsInGroup.isEmpty) { - jobGroupToJobIds.remove(jobGroupId) - } - } - } - } - jobs.trimStart(toRemove) - } - } - - override def onJobStart(jobStart: SparkListenerJobStart): Unit = synchronized { - val jobGroup = for ( - props <- Option(jobStart.properties); - group <- Option(props.getProperty(SparkContext.SPARK_JOB_GROUP_ID)) - ) yield group - val jobData: JobUIData = - new JobUIData( - jobId = jobStart.jobId, - submissionTime = Option(jobStart.time).filter(_ >= 0), - stageIds = jobStart.stageIds, - jobGroup = jobGroup, - status = JobExecutionStatus.RUNNING) - // A null jobGroupId is used for jobs that are run without a job group - jobGroupToJobIds.getOrElseUpdate(jobGroup.orNull, new HashSet[JobId]).add(jobStart.jobId) - jobStart.stageInfos.foreach(x => 
pendingStages(x.stageId) = x) - // Compute (a potential underestimate of) the number of tasks that will be run by this job. - // This may be an underestimate because the job start event references all of the result - // stages' transitive stage dependencies, but some of these stages might be skipped if their - // output is available from earlier runs. - // See https://github.com/apache/spark/pull/3009 for a more extensive discussion. - jobData.numTasks = { - val allStages = jobStart.stageInfos - val missingStages = allStages.filter(_.completionTime.isEmpty) - missingStages.map(_.numTasks).sum - } - jobIdToData(jobStart.jobId) = jobData - activeJobs(jobStart.jobId) = jobData - for (stageId <- jobStart.stageIds) { - stageIdToActiveJobIds.getOrElseUpdate(stageId, new HashSet[StageId]).add(jobStart.jobId) - } - // If there's no information for a stage, store the StageInfo received from the scheduler - // so that we can display stage descriptions for pending stages: - for (stageInfo <- jobStart.stageInfos) { - stageIdToInfo.getOrElseUpdate(stageInfo.stageId, stageInfo) - stageIdToData.getOrElseUpdate((stageInfo.stageId, stageInfo.attemptId), new StageUIData) - } - } - - override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = synchronized { - val jobData = activeJobs.remove(jobEnd.jobId).getOrElse { - logWarning(s"Job completed for unknown job ${jobEnd.jobId}") - new JobUIData(jobId = jobEnd.jobId) - } - jobData.completionTime = Option(jobEnd.time).filter(_ >= 0) - - jobData.stageIds.foreach(pendingStages.remove) - jobEnd.jobResult match { - case JobSucceeded => - completedJobs += jobData - trimJobsIfNecessary(completedJobs) - jobData.status = JobExecutionStatus.SUCCEEDED - numCompletedJobs += 1 - case JobFailed(_) => - failedJobs += jobData - trimJobsIfNecessary(failedJobs) - jobData.status = JobExecutionStatus.FAILED - numFailedJobs += 1 - } - for (stageId <- jobData.stageIds) { - stageIdToActiveJobIds.get(stageId).foreach { jobsUsingStage => - jobsUsingStage.remove(jobEnd.jobId) - if (jobsUsingStage.isEmpty) { - stageIdToActiveJobIds.remove(stageId) - } - stageIdToInfo.get(stageId).foreach { stageInfo => - if (stageInfo.submissionTime.isEmpty) { - // if this stage is pending, it won't complete, so mark it as "skipped": - skippedStages += stageInfo - trimStagesIfNecessary(skippedStages) - jobData.numSkippedStages += 1 - jobData.numSkippedTasks += stageInfo.numTasks - } - } - } - } - } - - override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = synchronized { - val stage = stageCompleted.stageInfo - stageIdToInfo(stage.stageId) = stage - val stageData = stageIdToData.getOrElseUpdate((stage.stageId, stage.attemptId), { - logWarning("Stage completed for unknown stage " + stage.stageId) - new StageUIData - }) - - for ((id, info) <- stageCompleted.stageInfo.accumulables) { - stageData.accumulables(id) = info - } - - poolToActiveStages.get(stageData.schedulingPool).foreach { hashMap => - hashMap.remove(stage.stageId) - } - activeStages.remove(stage.stageId) - if (stage.failureReason.isEmpty) { - completedStages += stage - numCompletedStages += 1 - trimStagesIfNecessary(completedStages) - } else { - failedStages += stage - numFailedStages += 1 - trimStagesIfNecessary(failedStages) - } - - for ( - activeJobsDependentOnStage <- stageIdToActiveJobIds.get(stage.stageId); - jobId <- activeJobsDependentOnStage; - jobData <- jobIdToData.get(jobId) - ) { - jobData.numActiveStages -= 1 - if (stage.failureReason.isEmpty) { - if (stage.submissionTime.isDefined) 
{ - jobData.completedStageIndices.add(stage.stageId) - } - } else { - jobData.numFailedStages += 1 - } - } - } - - /** For FIFO, all stages are contained by "default" pool but "default" pool here is meaningless */ - override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized { - val stage = stageSubmitted.stageInfo - activeStages(stage.stageId) = stage - pendingStages.remove(stage.stageId) - val poolName = Option(stageSubmitted.properties).map { - p => p.getProperty("spark.scheduler.pool", SparkUI.DEFAULT_POOL_NAME) - }.getOrElse(SparkUI.DEFAULT_POOL_NAME) - - stageIdToInfo(stage.stageId) = stage - val stageData = stageIdToData.getOrElseUpdate((stage.stageId, stage.attemptId), new StageUIData) - stageData.schedulingPool = poolName - - stageData.description = Option(stageSubmitted.properties).flatMap { - p => Option(p.getProperty(SparkContext.SPARK_JOB_DESCRIPTION)) - } - - val stages = poolToActiveStages.getOrElseUpdate(poolName, new HashMap[Int, StageInfo]) - stages(stage.stageId) = stage - - for ( - activeJobsDependentOnStage <- stageIdToActiveJobIds.get(stage.stageId); - jobId <- activeJobsDependentOnStage; - jobData <- jobIdToData.get(jobId) - ) { - jobData.numActiveStages += 1 - - // If a stage retries again, it should be removed from completedStageIndices set - jobData.completedStageIndices.remove(stage.stageId) - } - } - - override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = synchronized { - val taskInfo = taskStart.taskInfo - if (taskInfo != null) { - val stageData = stageIdToData.getOrElseUpdate((taskStart.stageId, taskStart.stageAttemptId), { - logWarning("Task start for unknown stage " + taskStart.stageId) - new StageUIData - }) - stageData.numActiveTasks += 1 - stageData.taskData.put(taskInfo.taskId, TaskUIData(taskInfo)) - } - for ( - activeJobsDependentOnStage <- stageIdToActiveJobIds.get(taskStart.stageId); - jobId <- activeJobsDependentOnStage; - jobData <- jobIdToData.get(jobId) - ) { - jobData.numActiveTasks += 1 - } - } - - override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult) { - // Do nothing: because we don't do a deep copy of the TaskInfo, the TaskInfo in - // stageToTaskInfos already has the updated status. - } - - override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized { - val info = taskEnd.taskInfo - // If stage attempt id is -1, it means the DAGScheduler had no idea which attempt this task - // completion event is for. Let's just drop it here. This means we might have some speculation - // tasks on the web ui that's never marked as complete. 
- if (info != null && taskEnd.stageAttemptId != -1) { - val stageData = stageIdToData.getOrElseUpdate((taskEnd.stageId, taskEnd.stageAttemptId), { - logWarning("Task end for unknown stage " + taskEnd.stageId) - new StageUIData - }) - - for (accumulableInfo <- info.accumulables) { - stageData.accumulables(accumulableInfo.id) = accumulableInfo - } - - val execSummaryMap = stageData.executorSummary - val execSummary = execSummaryMap.getOrElseUpdate(info.executorId, new ExecutorSummary) - - taskEnd.reason match { - case Success => - execSummary.succeededTasks += 1 - case kill: TaskKilled => - execSummary.reasonToNumKilled = execSummary.reasonToNumKilled.updated( - kill.reason, execSummary.reasonToNumKilled.getOrElse(kill.reason, 0) + 1) - case commitDenied: TaskCommitDenied => - execSummary.reasonToNumKilled = execSummary.reasonToNumKilled.updated( - commitDenied.toErrorString, execSummary.reasonToNumKilled.getOrElse( - commitDenied.toErrorString, 0) + 1) - case _ => - execSummary.failedTasks += 1 - } - execSummary.taskTime += info.duration - stageData.numActiveTasks -= 1 - - val errorMessage: Option[String] = - taskEnd.reason match { - case org.apache.spark.Success => - stageData.completedIndices.add(info.index) - stageData.numCompleteTasks += 1 - None - case kill: TaskKilled => - stageData.reasonToNumKilled = stageData.reasonToNumKilled.updated( - kill.reason, stageData.reasonToNumKilled.getOrElse(kill.reason, 0) + 1) - Some(kill.toErrorString) - case commitDenied: TaskCommitDenied => - stageData.reasonToNumKilled = stageData.reasonToNumKilled.updated( - commitDenied.toErrorString, stageData.reasonToNumKilled.getOrElse( - commitDenied.toErrorString, 0) + 1) - Some(commitDenied.toErrorString) - case e: ExceptionFailure => // Handle ExceptionFailure because we might have accumUpdates - stageData.numFailedTasks += 1 - Some(e.toErrorString) - case e: TaskFailedReason => // All other failure cases - stageData.numFailedTasks += 1 - Some(e.toErrorString) - } - - val taskMetrics = Option(taskEnd.taskMetrics) - taskMetrics.foreach { m => - val oldMetrics = stageData.taskData.get(info.taskId).flatMap(_.metrics) - updateAggregateMetrics(stageData, info.executorId, m, oldMetrics) - } - - val taskData = stageData.taskData.getOrElseUpdate(info.taskId, TaskUIData(info)) - taskData.updateTaskInfo(info) - taskData.updateTaskMetrics(taskMetrics) - taskData.errorMessage = errorMessage - - // If Tasks is too large, remove and garbage collect old tasks - if (stageData.taskData.size > retainedTasks) { - stageData.taskData = stageData.taskData.drop( - calculateNumberToRemove(stageData.taskData.size, retainedTasks)) - } - - for ( - activeJobsDependentOnStage <- stageIdToActiveJobIds.get(taskEnd.stageId); - jobId <- activeJobsDependentOnStage; - jobData <- jobIdToData.get(jobId) - ) { - jobData.numActiveTasks -= 1 - taskEnd.reason match { - case Success => - jobData.completedIndices.add((taskEnd.stageId, info.index)) - jobData.numCompletedTasks += 1 - case kill: TaskKilled => - jobData.reasonToNumKilled = jobData.reasonToNumKilled.updated( - kill.reason, jobData.reasonToNumKilled.getOrElse(kill.reason, 0) + 1) - case commitDenied: TaskCommitDenied => - jobData.reasonToNumKilled = jobData.reasonToNumKilled.updated( - commitDenied.toErrorString, jobData.reasonToNumKilled.getOrElse( - commitDenied.toErrorString, 0) + 1) - case _ => - jobData.numFailedTasks += 1 - } - } - } - } - - /** - * Remove at least (maxRetained / 10) items to reduce friction. 
- */ - private def calculateNumberToRemove(dataSize: Int, retainedSize: Int): Int = { - math.max(retainedSize / 10, dataSize - retainedSize) - } - - /** - * Upon receiving new metrics for a task, updates the per-stage and per-executor-per-stage - * aggregate metrics by calculating deltas between the currently recorded metrics and the new - * metrics. - */ - def updateAggregateMetrics( - stageData: StageUIData, - execId: String, - taskMetrics: TaskMetrics, - oldMetrics: Option[TaskMetricsUIData]) { - val execSummary = stageData.executorSummary.getOrElseUpdate(execId, new ExecutorSummary) - - val shuffleWriteDelta = - taskMetrics.shuffleWriteMetrics.bytesWritten - - oldMetrics.map(_.shuffleWriteMetrics.bytesWritten).getOrElse(0L) - stageData.shuffleWriteBytes += shuffleWriteDelta - execSummary.shuffleWrite += shuffleWriteDelta - - val shuffleWriteRecordsDelta = - taskMetrics.shuffleWriteMetrics.recordsWritten - - oldMetrics.map(_.shuffleWriteMetrics.recordsWritten).getOrElse(0L) - stageData.shuffleWriteRecords += shuffleWriteRecordsDelta - execSummary.shuffleWriteRecords += shuffleWriteRecordsDelta - - val shuffleReadDelta = - taskMetrics.shuffleReadMetrics.totalBytesRead - - oldMetrics.map(_.shuffleReadMetrics.totalBytesRead).getOrElse(0L) - stageData.shuffleReadTotalBytes += shuffleReadDelta - execSummary.shuffleRead += shuffleReadDelta - - val shuffleReadRecordsDelta = - taskMetrics.shuffleReadMetrics.recordsRead - - oldMetrics.map(_.shuffleReadMetrics.recordsRead).getOrElse(0L) - stageData.shuffleReadRecords += shuffleReadRecordsDelta - execSummary.shuffleReadRecords += shuffleReadRecordsDelta - - val inputBytesDelta = - taskMetrics.inputMetrics.bytesRead - - oldMetrics.map(_.inputMetrics.bytesRead).getOrElse(0L) - stageData.inputBytes += inputBytesDelta - execSummary.inputBytes += inputBytesDelta - - val inputRecordsDelta = - taskMetrics.inputMetrics.recordsRead - - oldMetrics.map(_.inputMetrics.recordsRead).getOrElse(0L) - stageData.inputRecords += inputRecordsDelta - execSummary.inputRecords += inputRecordsDelta - - val outputBytesDelta = - taskMetrics.outputMetrics.bytesWritten - - oldMetrics.map(_.outputMetrics.bytesWritten).getOrElse(0L) - stageData.outputBytes += outputBytesDelta - execSummary.outputBytes += outputBytesDelta - - val outputRecordsDelta = - taskMetrics.outputMetrics.recordsWritten - - oldMetrics.map(_.outputMetrics.recordsWritten).getOrElse(0L) - stageData.outputRecords += outputRecordsDelta - execSummary.outputRecords += outputRecordsDelta - - val diskSpillDelta = - taskMetrics.diskBytesSpilled - oldMetrics.map(_.diskBytesSpilled).getOrElse(0L) - stageData.diskBytesSpilled += diskSpillDelta - execSummary.diskBytesSpilled += diskSpillDelta - - val memorySpillDelta = - taskMetrics.memoryBytesSpilled - oldMetrics.map(_.memoryBytesSpilled).getOrElse(0L) - stageData.memoryBytesSpilled += memorySpillDelta - execSummary.memoryBytesSpilled += memorySpillDelta - - val timeDelta = - taskMetrics.executorRunTime - oldMetrics.map(_.executorRunTime).getOrElse(0L) - stageData.executorRunTime += timeDelta - - val cpuTimeDelta = - taskMetrics.executorCpuTime - oldMetrics.map(_.executorCpuTime).getOrElse(0L) - stageData.executorCpuTime += cpuTimeDelta - } - - override def onExecutorMetricsUpdate(executorMetricsUpdate: SparkListenerExecutorMetricsUpdate) { - for ((taskId, sid, sAttempt, accumUpdates) <- executorMetricsUpdate.accumUpdates) { - val stageData = stageIdToData.getOrElseUpdate((sid, sAttempt), { - logWarning("Metrics update for task in unknown stage " + sid) - new 
StageUIData - }) - val taskData = stageData.taskData.get(taskId) - val metrics = TaskMetrics.fromAccumulatorInfos(accumUpdates) - taskData.foreach { t => - if (!t.taskInfo.finished) { - updateAggregateMetrics(stageData, executorMetricsUpdate.execId, metrics, t.metrics) - // Overwrite task metrics - t.updateTaskMetrics(Some(metrics)) - } - } - } - } - - override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) { - synchronized { - schedulingMode = environmentUpdate - .environmentDetails("Spark Properties").toMap - .get("spark.scheduler.mode") - .map(SchedulingMode.withName) - } - } - - override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) { - synchronized { - val blockManagerId = blockManagerAdded.blockManagerId - val executorId = blockManagerId.executorId - executorIdToBlockManagerId(executorId) = blockManagerId - } - } - - override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) { - synchronized { - val executorId = blockManagerRemoved.blockManagerId.executorId - executorIdToBlockManagerId.remove(executorId) - } - } - - override def onApplicationStart(appStarted: SparkListenerApplicationStart) { - startTime = appStarted.time - } - - override def onApplicationEnd(appEnded: SparkListenerApplicationEnd) { - endTime = appEnded.time - } - - /** - * For testing only. Wait until at least `numExecutors` executors are up, or throw - * `TimeoutException` if the waiting time elapsed before `numExecutors` executors up. - * Exposed for testing. - * - * @param numExecutors the number of executors to wait at least - * @param timeout time to wait in milliseconds - */ - private[spark] def waitUntilExecutorsUp(numExecutors: Int, timeout: Long): Unit = { - val finishTime = System.currentTimeMillis() + timeout - while (System.currentTimeMillis() < finishTime) { - val numBlockManagers = synchronized { - blockManagerIds.size - } - if (numBlockManagers >= numExecutors + 1) { - // Need to count the block manager in driver - return - } - // Sleep rather than using wait/notify, because this is used only for testing and wait/notify - // add overhead in the general case. - Thread.sleep(10) - } - throw new TimeoutException( - s"Can't find $numExecutors executors before $timeout milliseconds elapsed") - } -} diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 5f93f2ffb412f..11a6a34344976 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -32,7 +32,6 @@ import org.apache.spark.scheduler.TaskLocality import org.apache.spark.status.AppStatusStore import org.apache.spark.status.api.v1._ import org.apache.spark.ui._ -import org.apache.spark.ui.jobs.UIData._ import org.apache.spark.util.{Distribution, Utils} /** Page showing statistics and task list for a given stage */ diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala b/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala deleted file mode 100644 index 5acec0d0f54c9..0000000000000 --- a/core/src/main/scala/org/apache/spark/ui/jobs/UIData.scala +++ /dev/null @@ -1,311 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.ui.jobs - -import scala.collection.mutable -import scala.collection.mutable.{HashMap, LinkedHashMap} - -import com.google.common.collect.Interners - -import org.apache.spark.JobExecutionStatus -import org.apache.spark.executor._ -import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo} -import org.apache.spark.util.AccumulatorContext -import org.apache.spark.util.collection.OpenHashSet - -private[spark] object UIData { - - class ExecutorSummary { - var taskTime : Long = 0 - var failedTasks : Int = 0 - var succeededTasks : Int = 0 - var reasonToNumKilled : Map[String, Int] = Map.empty - var inputBytes : Long = 0 - var inputRecords : Long = 0 - var outputBytes : Long = 0 - var outputRecords : Long = 0 - var shuffleRead : Long = 0 - var shuffleReadRecords : Long = 0 - var shuffleWrite : Long = 0 - var shuffleWriteRecords : Long = 0 - var memoryBytesSpilled : Long = 0 - var diskBytesSpilled : Long = 0 - var isBlacklisted : Int = 0 - } - - class JobUIData( - var jobId: Int = -1, - var submissionTime: Option[Long] = None, - var completionTime: Option[Long] = None, - var stageIds: Seq[Int] = Seq.empty, - var jobGroup: Option[String] = None, - var status: JobExecutionStatus = JobExecutionStatus.UNKNOWN, - /* Tasks */ - // `numTasks` is a potential underestimate of the true number of tasks that this job will run. - // This may be an underestimate because the job start event references all of the result - // stages' transitive stage dependencies, but some of these stages might be skipped if their - // output is available from earlier runs. - // See https://github.com/apache/spark/pull/3009 for a more extensive discussion. 
- var numTasks: Int = 0, - var numActiveTasks: Int = 0, - var numCompletedTasks: Int = 0, - var completedIndices: OpenHashSet[(Int, Int)] = new OpenHashSet[(Int, Int)](), - var numSkippedTasks: Int = 0, - var numFailedTasks: Int = 0, - var reasonToNumKilled: Map[String, Int] = Map.empty, - /* Stages */ - var numActiveStages: Int = 0, - // This needs to be a set instead of a simple count to prevent double-counting of rerun stages: - var completedStageIndices: mutable.HashSet[Int] = new mutable.HashSet[Int](), - var numSkippedStages: Int = 0, - var numFailedStages: Int = 0 - ) - - class StageUIData { - var numActiveTasks: Int = _ - var numCompleteTasks: Int = _ - var completedIndices = new OpenHashSet[Int]() - var numFailedTasks: Int = _ - var reasonToNumKilled: Map[String, Int] = Map.empty - - var executorRunTime: Long = _ - var executorCpuTime: Long = _ - - var inputBytes: Long = _ - var inputRecords: Long = _ - var outputBytes: Long = _ - var outputRecords: Long = _ - var shuffleReadTotalBytes: Long = _ - var shuffleReadRecords : Long = _ - var shuffleWriteBytes: Long = _ - var shuffleWriteRecords: Long = _ - var memoryBytesSpilled: Long = _ - var diskBytesSpilled: Long = _ - var isBlacklisted: Int = _ - var lastUpdateTime: Option[Long] = None - - var schedulingPool: String = "" - var description: Option[String] = None - - var accumulables = new HashMap[Long, AccumulableInfo] - var taskData = new LinkedHashMap[Long, TaskUIData] - var executorSummary = new HashMap[String, ExecutorSummary] - - def hasInput: Boolean = inputBytes > 0 - def hasOutput: Boolean = outputBytes > 0 - def hasShuffleRead: Boolean = shuffleReadTotalBytes > 0 - def hasShuffleWrite: Boolean = shuffleWriteBytes > 0 - def hasBytesSpilled: Boolean = memoryBytesSpilled > 0 || diskBytesSpilled > 0 - } - - /** - * These are kept mutable and reused throughout a task's lifetime to avoid excessive reallocation. - */ - class TaskUIData private(private var _taskInfo: TaskInfo) { - - private[this] var _metrics: Option[TaskMetricsUIData] = Some(TaskMetricsUIData.EMPTY) - - var errorMessage: Option[String] = None - - def taskInfo: TaskInfo = _taskInfo - - def metrics: Option[TaskMetricsUIData] = _metrics - - def updateTaskInfo(taskInfo: TaskInfo): Unit = { - _taskInfo = TaskUIData.dropInternalAndSQLAccumulables(taskInfo) - } - - def updateTaskMetrics(metrics: Option[TaskMetrics]): Unit = { - _metrics = metrics.map(TaskMetricsUIData.fromTaskMetrics) - } - - def taskDuration(lastUpdateTime: Option[Long] = None): Option[Long] = { - if (taskInfo.status == "RUNNING") { - Some(_taskInfo.timeRunning(lastUpdateTime.getOrElse(System.currentTimeMillis))) - } else { - _metrics.map(_.executorRunTime) - } - } - } - - object TaskUIData { - - private val stringInterner = Interners.newWeakInterner[String]() - - /** String interning to reduce the memory usage. */ - private def weakIntern(s: String): String = { - stringInterner.intern(s) - } - - def apply(taskInfo: TaskInfo): TaskUIData = { - new TaskUIData(dropInternalAndSQLAccumulables(taskInfo)) - } - - /** - * We don't need to store internal or SQL accumulables as their values will be shown in other - * places, so drop them to reduce the memory usage. 
- */ - private[spark] def dropInternalAndSQLAccumulables(taskInfo: TaskInfo): TaskInfo = { - val newTaskInfo = new TaskInfo( - taskId = taskInfo.taskId, - index = taskInfo.index, - attemptNumber = taskInfo.attemptNumber, - launchTime = taskInfo.launchTime, - executorId = weakIntern(taskInfo.executorId), - host = weakIntern(taskInfo.host), - taskLocality = taskInfo.taskLocality, - speculative = taskInfo.speculative - ) - newTaskInfo.gettingResultTime = taskInfo.gettingResultTime - newTaskInfo.setAccumulables(taskInfo.accumulables.filter { - accum => !accum.internal && accum.metadata != Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER) - }) - newTaskInfo.finishTime = taskInfo.finishTime - newTaskInfo.failed = taskInfo.failed - newTaskInfo.killed = taskInfo.killed - newTaskInfo - } - } - - case class TaskMetricsUIData( - executorDeserializeTime: Long, - executorDeserializeCpuTime: Long, - executorRunTime: Long, - executorCpuTime: Long, - resultSize: Long, - jvmGCTime: Long, - resultSerializationTime: Long, - memoryBytesSpilled: Long, - diskBytesSpilled: Long, - peakExecutionMemory: Long, - inputMetrics: InputMetricsUIData, - outputMetrics: OutputMetricsUIData, - shuffleReadMetrics: ShuffleReadMetricsUIData, - shuffleWriteMetrics: ShuffleWriteMetricsUIData) - - object TaskMetricsUIData { - def fromTaskMetrics(m: TaskMetrics): TaskMetricsUIData = { - TaskMetricsUIData( - executorDeserializeTime = m.executorDeserializeTime, - executorDeserializeCpuTime = m.executorDeserializeCpuTime, - executorRunTime = m.executorRunTime, - executorCpuTime = m.executorCpuTime, - resultSize = m.resultSize, - jvmGCTime = m.jvmGCTime, - resultSerializationTime = m.resultSerializationTime, - memoryBytesSpilled = m.memoryBytesSpilled, - diskBytesSpilled = m.diskBytesSpilled, - peakExecutionMemory = m.peakExecutionMemory, - inputMetrics = InputMetricsUIData(m.inputMetrics), - outputMetrics = OutputMetricsUIData(m.outputMetrics), - shuffleReadMetrics = ShuffleReadMetricsUIData(m.shuffleReadMetrics), - shuffleWriteMetrics = ShuffleWriteMetricsUIData(m.shuffleWriteMetrics)) - } - - val EMPTY: TaskMetricsUIData = fromTaskMetrics(TaskMetrics.empty) - } - - case class InputMetricsUIData(bytesRead: Long, recordsRead: Long) - object InputMetricsUIData { - def apply(metrics: InputMetrics): InputMetricsUIData = { - if (metrics.bytesRead == 0 && metrics.recordsRead == 0) { - EMPTY - } else { - new InputMetricsUIData( - bytesRead = metrics.bytesRead, - recordsRead = metrics.recordsRead) - } - } - private val EMPTY = InputMetricsUIData(0, 0) - } - - case class OutputMetricsUIData(bytesWritten: Long, recordsWritten: Long) - object OutputMetricsUIData { - def apply(metrics: OutputMetrics): OutputMetricsUIData = { - if (metrics.bytesWritten == 0 && metrics.recordsWritten == 0) { - EMPTY - } else { - new OutputMetricsUIData( - bytesWritten = metrics.bytesWritten, - recordsWritten = metrics.recordsWritten) - } - } - private val EMPTY = OutputMetricsUIData(0, 0) - } - - case class ShuffleReadMetricsUIData( - remoteBlocksFetched: Long, - localBlocksFetched: Long, - remoteBytesRead: Long, - remoteBytesReadToDisk: Long, - localBytesRead: Long, - fetchWaitTime: Long, - recordsRead: Long, - totalBytesRead: Long, - totalBlocksFetched: Long) - - object ShuffleReadMetricsUIData { - def apply(metrics: ShuffleReadMetrics): ShuffleReadMetricsUIData = { - if ( - metrics.remoteBlocksFetched == 0 && - metrics.localBlocksFetched == 0 && - metrics.remoteBytesRead == 0 && - metrics.localBytesRead == 0 && - metrics.fetchWaitTime == 0 && - metrics.recordsRead 
== 0 && - metrics.totalBytesRead == 0 && - metrics.totalBlocksFetched == 0) { - EMPTY - } else { - new ShuffleReadMetricsUIData( - remoteBlocksFetched = metrics.remoteBlocksFetched, - localBlocksFetched = metrics.localBlocksFetched, - remoteBytesRead = metrics.remoteBytesRead, - remoteBytesReadToDisk = metrics.remoteBytesReadToDisk, - localBytesRead = metrics.localBytesRead, - fetchWaitTime = metrics.fetchWaitTime, - recordsRead = metrics.recordsRead, - totalBytesRead = metrics.totalBytesRead, - totalBlocksFetched = metrics.totalBlocksFetched - ) - } - } - private val EMPTY = ShuffleReadMetricsUIData(0, 0, 0, 0, 0, 0, 0, 0, 0) - } - - case class ShuffleWriteMetricsUIData( - bytesWritten: Long, - recordsWritten: Long, - writeTime: Long) - - object ShuffleWriteMetricsUIData { - def apply(metrics: ShuffleWriteMetrics): ShuffleWriteMetricsUIData = { - if (metrics.bytesWritten == 0 && metrics.recordsWritten == 0 && metrics.writeTime == 0) { - EMPTY - } else { - new ShuffleWriteMetricsUIData( - bytesWritten = metrics.bytesWritten, - recordsWritten = metrics.recordsWritten, - writeTime = metrics.writeTime - ) - } - } - private val EMPTY = ShuffleWriteMetricsUIData(0, 0, 0) - } - -} diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index ea9f6d2fc20f4..e09d5f59817b9 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -156,7 +156,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex private def testCaching(conf: SparkConf, storageLevel: StorageLevel): Unit = { sc = new SparkContext(conf.setMaster(clusterUrl).setAppName("test")) - sc.jobProgressListener.waitUntilExecutorsUp(2, 30000) + TestUtils.waitUntilExecutorsUp(sc, 2, 30000) val data = sc.parallelize(1 to 1000, 10) val cachedData = data.persist(storageLevel) assert(cachedData.count === 1000) diff --git a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala index fe944031bc948..472952addf353 100644 --- a/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala +++ b/core/src/test/scala/org/apache/spark/ExternalShuffleServiceSuite.scala @@ -66,7 +66,7 @@ class ExternalShuffleServiceSuite extends ShuffleSuite with BeforeAndAfterAll { // local blocks from the local BlockManager and won't send requests to ExternalShuffleService. // In this case, we won't receive FetchFailed. And it will make this test fail. 
// Therefore, we should wait until all slaves are up - sc.jobProgressListener.waitUntilExecutorsUp(2, 60000) + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) val rdd = sc.parallelize(0 until 1000, 10).map(i => (i, 1)).reduceByKey(_ + _) diff --git a/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala b/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala index 5483f2b8434aa..a15ae040d43a9 100644 --- a/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/StatusTrackerSuite.scala @@ -44,13 +44,13 @@ class StatusTrackerSuite extends SparkFunSuite with Matchers with LocalSparkCont stageIds.size should be(2) val firstStageInfo = eventually(timeout(10 seconds)) { - sc.statusTracker.getStageInfo(stageIds(0)).get + sc.statusTracker.getStageInfo(stageIds.min).get } - firstStageInfo.stageId() should be(stageIds(0)) + firstStageInfo.stageId() should be(stageIds.min) firstStageInfo.currentAttemptId() should be(0) firstStageInfo.numTasks() should be(2) eventually(timeout(10 seconds)) { - val updatedFirstStageInfo = sc.statusTracker.getStageInfo(stageIds(0)).get + val updatedFirstStageInfo = sc.statusTracker.getStageInfo(stageIds.min).get updatedFirstStageInfo.numCompletedTasks() should be(2) updatedFirstStageInfo.numActiveTasks() should be(0) updatedFirstStageInfo.numFailedTasks() should be(0) diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala index 46f9ac6b0273a..159629825c677 100644 --- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala +++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala @@ -224,7 +224,7 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext with Encryptio new SparkContext("local-cluster[%d, 1, 1024]".format(numSlaves), "test") // Wait until all salves are up try { - _sc.jobProgressListener.waitUntilExecutorsUp(numSlaves, 60000) + TestUtils.waitUntilExecutorsUp(_sc, numSlaves, 60000) _sc } catch { case e: Throwable => diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index d395e09969453..feefb6a4d73f0 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -2406,13 +2406,13 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi // OutputCommitCoordinator requires the task info itself to not be null. 
private def createFakeTaskInfo(): TaskInfo = { val info = new TaskInfo(0, 0, 0, 0L, "", "", TaskLocality.ANY, false) - info.finishTime = 1 // to prevent spurious errors in JobProgressListener + info.finishTime = 1 info } private def createFakeTaskInfoWithId(taskId: Long): TaskInfo = { val info = new TaskInfo(taskId, 0, 0, 0L, "", "", TaskLocality.ANY, false) - info.finishTime = 1 // to prevent spurious errors in JobProgressListener + info.finishTime = 1 info } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala index 9fa8859382911..123f7f49d21b5 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerWithClusterSuite.scala @@ -21,7 +21,7 @@ import scala.collection.mutable import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} -import org.apache.spark.{LocalSparkContext, SparkContext, SparkFunSuite} +import org.apache.spark.{LocalSparkContext, SparkContext, SparkFunSuite, TestUtils} import org.apache.spark.scheduler.cluster.ExecutorInfo /** @@ -43,7 +43,7 @@ class SparkListenerWithClusterSuite extends SparkFunSuite with LocalSparkContext // This test will check if the number of executors received by "SparkListener" is same as the // number of all executors, so we need to wait until all executors are up - sc.jobProgressListener.waitUntilExecutorsUp(2, 60000) + TestUtils.waitUntilExecutorsUp(sc, 2, 60000) val rdd1 = sc.parallelize(1 to 100, 4) val rdd2 = rdd1.map(_.toString) diff --git a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala b/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala deleted file mode 100644 index 48be3be81755a..0000000000000 --- a/core/src/test/scala/org/apache/spark/ui/jobs/JobProgressListenerSuite.scala +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.ui.jobs - -import java.util.Properties - -import org.scalatest.Matchers - -import org.apache.spark._ -import org.apache.spark.{LocalSparkContext, SparkConf, Success} -import org.apache.spark.executor._ -import org.apache.spark.scheduler._ -import org.apache.spark.ui.jobs.UIData.TaskUIData -import org.apache.spark.util.{AccumulatorContext, Utils} - -class JobProgressListenerSuite extends SparkFunSuite with LocalSparkContext with Matchers { - - val jobSubmissionTime = 1421191042750L - val jobCompletionTime = 1421191296660L - - private def createStageStartEvent(stageId: Int) = { - val stageInfo = new StageInfo(stageId, 0, stageId.toString, 0, null, null, "") - SparkListenerStageSubmitted(stageInfo) - } - - private def createStageEndEvent(stageId: Int, failed: Boolean = false) = { - val stageInfo = new StageInfo(stageId, 0, stageId.toString, 0, null, null, "") - if (failed) { - stageInfo.failureReason = Some("Failed!") - } - SparkListenerStageCompleted(stageInfo) - } - - private def createJobStartEvent( - jobId: Int, - stageIds: Seq[Int], - jobGroup: Option[String] = None): SparkListenerJobStart = { - val stageInfos = stageIds.map { stageId => - new StageInfo(stageId, 0, stageId.toString, 0, null, null, "") - } - val properties: Option[Properties] = jobGroup.map { groupId => - val props = new Properties() - props.setProperty(SparkContext.SPARK_JOB_GROUP_ID, groupId) - props - } - SparkListenerJobStart(jobId, jobSubmissionTime, stageInfos, properties.orNull) - } - - private def createJobEndEvent(jobId: Int, failed: Boolean = false) = { - val result = if (failed) JobFailed(new Exception("dummy failure")) else JobSucceeded - SparkListenerJobEnd(jobId, jobCompletionTime, result) - } - - private def runJob(listener: SparkListener, jobId: Int, shouldFail: Boolean = false) { - val stagesThatWontBeRun = jobId * 200 to jobId * 200 + 10 - val stageIds = jobId * 100 to jobId * 100 + 50 - listener.onJobStart(createJobStartEvent(jobId, stageIds ++ stagesThatWontBeRun)) - for (stageId <- stageIds) { - listener.onStageSubmitted(createStageStartEvent(stageId)) - listener.onStageCompleted(createStageEndEvent(stageId, failed = stageId % 2 == 0)) - } - listener.onJobEnd(createJobEndEvent(jobId, shouldFail)) - } - - private def assertActiveJobsStateIsEmpty(listener: JobProgressListener) { - listener.getSizesOfActiveStateTrackingCollections.foreach { case (fieldName, size) => - assert(size === 0, s"$fieldName was not empty") - } - } - - test("test LRU eviction of stages") { - def runWithListener(listener: JobProgressListener) : Unit = { - for (i <- 1 to 50) { - listener.onStageSubmitted(createStageStartEvent(i)) - listener.onStageCompleted(createStageEndEvent(i)) - } - assertActiveJobsStateIsEmpty(listener) - } - val conf = new SparkConf() - conf.set("spark.ui.retainedStages", 5.toString) - var listener = new JobProgressListener(conf) - - // Test with 5 retainedStages - runWithListener(listener) - listener.completedStages.size should be (5) - listener.completedStages.map(_.stageId).toSet should be (Set(50, 49, 48, 47, 46)) - - // Test with 0 retainedStages - conf.set("spark.ui.retainedStages", 0.toString) - listener = new JobProgressListener(conf) - runWithListener(listener) - listener.completedStages.size should be (0) - } - - test("test clearing of stageIdToActiveJobs") { - val conf = new SparkConf() - conf.set("spark.ui.retainedStages", 5.toString) - val listener = new JobProgressListener(conf) - val jobId = 0 - val stageIds = 1 to 50 - // Start a job with 50 stages - 
listener.onJobStart(createJobStartEvent(jobId, stageIds)) - for (stageId <- stageIds) { - listener.onStageSubmitted(createStageStartEvent(stageId)) - } - listener.stageIdToActiveJobIds.size should be > 0 - - // Complete the stages and job - for (stageId <- stageIds) { - listener.onStageCompleted(createStageEndEvent(stageId, failed = false)) - } - listener.onJobEnd(createJobEndEvent(jobId, false)) - assertActiveJobsStateIsEmpty(listener) - listener.stageIdToActiveJobIds.size should be (0) - } - - test("test clearing of jobGroupToJobIds") { - def runWithListener(listener: JobProgressListener): Unit = { - // Run 50 jobs, each with one stage - for (jobId <- 0 to 50) { - listener.onJobStart(createJobStartEvent(jobId, Seq(0), jobGroup = Some(jobId.toString))) - listener.onStageSubmitted(createStageStartEvent(0)) - listener.onStageCompleted(createStageEndEvent(0, failed = false)) - listener.onJobEnd(createJobEndEvent(jobId, false)) - } - assertActiveJobsStateIsEmpty(listener) - } - val conf = new SparkConf() - conf.set("spark.ui.retainedJobs", 5.toString) - - var listener = new JobProgressListener(conf) - runWithListener(listener) - // This collection won't become empty, but it should be bounded by spark.ui.retainedJobs - listener.jobGroupToJobIds.size should be (5) - - // Test with 0 jobs - conf.set("spark.ui.retainedJobs", 0.toString) - listener = new JobProgressListener(conf) - runWithListener(listener) - listener.jobGroupToJobIds.size should be (0) - } - - test("test LRU eviction of jobs") { - val conf = new SparkConf() - conf.set("spark.ui.retainedStages", 5.toString) - conf.set("spark.ui.retainedJobs", 5.toString) - val listener = new JobProgressListener(conf) - - // Run a bunch of jobs to get the listener into a state where we've exceeded both the - // job and stage retention limits: - for (jobId <- 1 to 10) { - runJob(listener, jobId, shouldFail = false) - } - for (jobId <- 200 to 210) { - runJob(listener, jobId, shouldFail = true) - } - assertActiveJobsStateIsEmpty(listener) - // Snapshot the sizes of various soft- and hard-size-limited collections: - val softLimitSizes = listener.getSizesOfSoftSizeLimitedCollections - val hardLimitSizes = listener.getSizesOfHardSizeLimitedCollections - // Run some more jobs: - for (jobId <- 11 to 50) { - runJob(listener, jobId, shouldFail = false) - // We shouldn't exceed the hard / soft limit sizes after the jobs have finished: - listener.getSizesOfSoftSizeLimitedCollections should be (softLimitSizes) - listener.getSizesOfHardSizeLimitedCollections should be (hardLimitSizes) - } - - listener.completedJobs.size should be (5) - listener.completedJobs.map(_.jobId).toSet should be (Set(50, 49, 48, 47, 46)) - - for (jobId <- 51 to 100) { - runJob(listener, jobId, shouldFail = true) - // We shouldn't exceed the hard / soft limit sizes after the jobs have finished: - listener.getSizesOfSoftSizeLimitedCollections should be (softLimitSizes) - listener.getSizesOfHardSizeLimitedCollections should be (hardLimitSizes) - } - assertActiveJobsStateIsEmpty(listener) - - // Completed and failed jobs each their own size limits, so this should still be the same: - listener.completedJobs.size should be (5) - listener.completedJobs.map(_.jobId).toSet should be (Set(50, 49, 48, 47, 46)) - listener.failedJobs.size should be (5) - listener.failedJobs.map(_.jobId).toSet should be (Set(100, 99, 98, 97, 96)) - } - - test("test executor id to summary") { - val conf = new SparkConf() - val listener = new JobProgressListener(conf) - val taskMetrics = TaskMetrics.empty - val 
shuffleReadMetrics = taskMetrics.createTempShuffleReadMetrics() - assert(listener.stageIdToData.size === 0) - - // finish this task, should get updated shuffleRead - shuffleReadMetrics.incRemoteBytesRead(1000) - taskMetrics.mergeShuffleReadMetrics() - var taskInfo = new TaskInfo(1234L, 0, 1, 0L, "exe-1", "host1", TaskLocality.NODE_LOCAL, false) - taskInfo.finishTime = 1 - var task = new ShuffleMapTask(0) - val taskType = Utils.getFormattedClassName(task) - listener.onTaskEnd( - SparkListenerTaskEnd(task.stageId, 0, taskType, Success, taskInfo, taskMetrics)) - assert(listener.stageIdToData.getOrElse((0, 0), fail()) - .executorSummary.getOrElse("exe-1", fail()).shuffleRead === 1000) - - // finish a task with unknown executor-id, nothing should happen - taskInfo = - new TaskInfo(1234L, 0, 1, 1000L, "exe-unknown", "host1", TaskLocality.NODE_LOCAL, true) - taskInfo.finishTime = 1 - task = new ShuffleMapTask(0) - listener.onTaskEnd( - SparkListenerTaskEnd(task.stageId, 0, taskType, Success, taskInfo, taskMetrics)) - assert(listener.stageIdToData.size === 1) - - // finish this task, should get updated duration - taskInfo = new TaskInfo(1235L, 0, 1, 0L, "exe-1", "host1", TaskLocality.NODE_LOCAL, false) - taskInfo.finishTime = 1 - task = new ShuffleMapTask(0) - listener.onTaskEnd( - SparkListenerTaskEnd(task.stageId, 0, taskType, Success, taskInfo, taskMetrics)) - assert(listener.stageIdToData.getOrElse((0, 0), fail()) - .executorSummary.getOrElse("exe-1", fail()).shuffleRead === 2000) - - // finish this task, should get updated duration - taskInfo = new TaskInfo(1236L, 0, 2, 0L, "exe-2", "host1", TaskLocality.NODE_LOCAL, false) - taskInfo.finishTime = 1 - task = new ShuffleMapTask(0) - listener.onTaskEnd( - SparkListenerTaskEnd(task.stageId, 0, taskType, Success, taskInfo, taskMetrics)) - assert(listener.stageIdToData.getOrElse((0, 0), fail()) - .executorSummary.getOrElse("exe-2", fail()).shuffleRead === 1000) - } - - test("test task success vs failure counting for different task end reasons") { - val conf = new SparkConf() - val listener = new JobProgressListener(conf) - val metrics = TaskMetrics.empty - val taskInfo = new TaskInfo(1234L, 0, 3, 0L, "exe-1", "host1", TaskLocality.NODE_LOCAL, false) - taskInfo.finishTime = 1 - val task = new ShuffleMapTask(0) - val taskType = Utils.getFormattedClassName(task) - - // Go through all the failure cases to make sure we are counting them as failures. - val taskFailedReasons = Seq( - Resubmitted, - new FetchFailed(null, 0, 0, 0, "ignored"), - ExceptionFailure("Exception", "description", null, null, None), - TaskResultLost, - ExecutorLostFailure("0", true, Some("Induced failure")), - UnknownReason) - var failCount = 0 - for (reason <- taskFailedReasons) { - listener.onTaskEnd( - SparkListenerTaskEnd(task.stageId, 0, taskType, reason, taskInfo, metrics)) - failCount += 1 - assert(listener.stageIdToData((task.stageId, 0)).numCompleteTasks === 0) - assert(listener.stageIdToData((task.stageId, 0)).numFailedTasks === failCount) - } - - // Make sure killed tasks are accounted for correctly. - listener.onTaskEnd( - SparkListenerTaskEnd( - task.stageId, 0, taskType, TaskKilled("test"), taskInfo, metrics)) - assert(listener.stageIdToData((task.stageId, 0)).reasonToNumKilled === Map("test" -> 1)) - - // Make sure we count success as success. 
- listener.onTaskEnd( - SparkListenerTaskEnd(task.stageId, 1, taskType, Success, taskInfo, metrics)) - assert(listener.stageIdToData((task.stageId, 1)).numCompleteTasks === 1) - assert(listener.stageIdToData((task.stageId, 0)).numFailedTasks === failCount) - } - - test("test update metrics") { - val conf = new SparkConf() - val listener = new JobProgressListener(conf) - - val taskType = Utils.getFormattedClassName(new ShuffleMapTask(0)) - val execId = "exe-1" - - def makeTaskMetrics(base: Int): TaskMetrics = { - val taskMetrics = TaskMetrics.registered - val shuffleReadMetrics = taskMetrics.createTempShuffleReadMetrics() - val shuffleWriteMetrics = taskMetrics.shuffleWriteMetrics - val inputMetrics = taskMetrics.inputMetrics - val outputMetrics = taskMetrics.outputMetrics - shuffleReadMetrics.incRemoteBytesRead(base + 1) - shuffleReadMetrics.incLocalBytesRead(base + 9) - shuffleReadMetrics.incRemoteBlocksFetched(base + 2) - taskMetrics.mergeShuffleReadMetrics() - shuffleWriteMetrics.incBytesWritten(base + 3) - taskMetrics.setExecutorRunTime(base + 4) - taskMetrics.incDiskBytesSpilled(base + 5) - taskMetrics.incMemoryBytesSpilled(base + 6) - inputMetrics.setBytesRead(base + 7) - outputMetrics.setBytesWritten(base + 8) - taskMetrics - } - - def makeTaskInfo(taskId: Long, finishTime: Int = 0): TaskInfo = { - val taskInfo = new TaskInfo(taskId, 0, 1, 0L, execId, "host1", TaskLocality.NODE_LOCAL, - false) - taskInfo.finishTime = finishTime - taskInfo - } - - listener.onTaskStart(SparkListenerTaskStart(0, 0, makeTaskInfo(1234L))) - listener.onTaskStart(SparkListenerTaskStart(0, 0, makeTaskInfo(1235L))) - listener.onTaskStart(SparkListenerTaskStart(1, 0, makeTaskInfo(1236L))) - listener.onTaskStart(SparkListenerTaskStart(1, 0, makeTaskInfo(1237L))) - - listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate(execId, Array( - (1234L, 0, 0, makeTaskMetrics(0).accumulators().map(AccumulatorSuite.makeInfo)), - (1235L, 0, 0, makeTaskMetrics(100).accumulators().map(AccumulatorSuite.makeInfo)), - (1236L, 1, 0, makeTaskMetrics(200).accumulators().map(AccumulatorSuite.makeInfo))))) - - var stage0Data = listener.stageIdToData.get((0, 0)).get - var stage1Data = listener.stageIdToData.get((1, 0)).get - assert(stage0Data.shuffleReadTotalBytes == 220) - assert(stage1Data.shuffleReadTotalBytes == 410) - assert(stage0Data.shuffleWriteBytes == 106) - assert(stage1Data.shuffleWriteBytes == 203) - assert(stage0Data.executorRunTime == 108) - assert(stage1Data.executorRunTime == 204) - assert(stage0Data.diskBytesSpilled == 110) - assert(stage1Data.diskBytesSpilled == 205) - assert(stage0Data.memoryBytesSpilled == 112) - assert(stage1Data.memoryBytesSpilled == 206) - assert(stage0Data.inputBytes == 114) - assert(stage1Data.inputBytes == 207) - assert(stage0Data.outputBytes == 116) - assert(stage1Data.outputBytes == 208) - - assert( - stage0Data.taskData.get(1234L).get.metrics.get.shuffleReadMetrics.totalBlocksFetched == 2) - assert( - stage0Data.taskData.get(1235L).get.metrics.get.shuffleReadMetrics.totalBlocksFetched == 102) - assert( - stage1Data.taskData.get(1236L).get.metrics.get.shuffleReadMetrics.totalBlocksFetched == 202) - - // task that was included in a heartbeat - listener.onTaskEnd(SparkListenerTaskEnd(0, 0, taskType, Success, makeTaskInfo(1234L, 1), - makeTaskMetrics(300))) - // task that wasn't included in a heartbeat - listener.onTaskEnd(SparkListenerTaskEnd(1, 0, taskType, Success, makeTaskInfo(1237L, 1), - makeTaskMetrics(400))) - - stage0Data = listener.stageIdToData.get((0, 0)).get - 
stage1Data = listener.stageIdToData.get((1, 0)).get - // Task 1235 contributed (100+1)+(100+9) = 210 shuffle bytes, and task 1234 contributed - // (300+1)+(300+9) = 610 total shuffle bytes, so the total for the stage is 820. - assert(stage0Data.shuffleReadTotalBytes == 820) - // Task 1236 contributed 410 shuffle bytes, and task 1237 contributed 810 shuffle bytes. - assert(stage1Data.shuffleReadTotalBytes == 1220) - assert(stage0Data.shuffleWriteBytes == 406) - assert(stage1Data.shuffleWriteBytes == 606) - assert(stage0Data.executorRunTime == 408) - assert(stage1Data.executorRunTime == 608) - assert(stage0Data.diskBytesSpilled == 410) - assert(stage1Data.diskBytesSpilled == 610) - assert(stage0Data.memoryBytesSpilled == 412) - assert(stage1Data.memoryBytesSpilled == 612) - assert(stage0Data.inputBytes == 414) - assert(stage1Data.inputBytes == 614) - assert(stage0Data.outputBytes == 416) - assert(stage1Data.outputBytes == 616) - assert( - stage0Data.taskData.get(1234L).get.metrics.get.shuffleReadMetrics.totalBlocksFetched == 302) - assert( - stage1Data.taskData.get(1237L).get.metrics.get.shuffleReadMetrics.totalBlocksFetched == 402) - } - - test("drop internal and sql accumulators") { - val taskInfo = new TaskInfo(0, 0, 0, 0, "", "", TaskLocality.ANY, false) - val internalAccum = - AccumulableInfo(id = 1, name = Some("internal"), None, None, true, false, None) - val sqlAccum = AccumulableInfo( - id = 2, - name = Some("sql"), - update = None, - value = None, - internal = false, - countFailedValues = false, - metadata = Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER)) - val userAccum = AccumulableInfo( - id = 3, - name = Some("user"), - update = None, - value = None, - internal = false, - countFailedValues = false, - metadata = None) - taskInfo.setAccumulables(List(internalAccum, sqlAccum, userAccum)) - - val newTaskInfo = TaskUIData.dropInternalAndSQLAccumulables(taskInfo) - assert(newTaskInfo.accumulables === Seq(userAccum)) - } - - test("SPARK-19146 drop more elements when stageData.taskData.size > retainedTasks") { - val conf = new SparkConf() - conf.set("spark.ui.retainedTasks", "100") - val taskMetrics = TaskMetrics.empty - taskMetrics.mergeShuffleReadMetrics() - val task = new ShuffleMapTask(0) - val taskType = Utils.getFormattedClassName(task) - - val listener1 = new JobProgressListener(conf) - for (t <- 1 to 101) { - val taskInfo = new TaskInfo(t, 0, 1, 0L, "exe-1", "host1", TaskLocality.NODE_LOCAL, false) - taskInfo.finishTime = 1 - listener1.onTaskEnd( - SparkListenerTaskEnd(task.stageId, 0, taskType, Success, taskInfo, taskMetrics)) - } - // 101 - math.max(100 / 10, 101 - 100) = 91 - assert(listener1.stageIdToData((task.stageId, task.stageAttemptId)).taskData.size === 91) - - val listener2 = new JobProgressListener(conf) - for (t <- 1 to 150) { - val taskInfo = new TaskInfo(t, 0, 1, 0L, "exe-1", "host1", TaskLocality.NODE_LOCAL, false) - taskInfo.finishTime = 1 - listener2.onTaskEnd( - SparkListenerTaskEnd(task.stageId, 0, taskType, Success, taskInfo, taskMetrics)) - } - // 150 - math.max(100 / 10, 150 - 100) = 100 - assert(listener2.stageIdToData((task.stageId, task.stageAttemptId)).taskData.size === 100) - } - -} diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 915c7e2e2fda3..5b8dcd0338cce 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -45,6 +45,8 @@ object MimaExcludes { ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.storage.StorageStatusListener"), 
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.ExecutorStageSummary.this"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.JobData.this"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.SparkStatusTracker.this"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.ui.jobs.JobProgressListener"), // [SPARK-20495][SQL] Add StorageLevel to cacheTable API ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.catalog.Catalog.cacheTable"), diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala index 69e15655ad790..6748dd4ec48e3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala @@ -23,16 +23,16 @@ import scala.xml._ import org.apache.commons.lang3.StringEscapeUtils +import org.apache.spark.status.api.v1.{JobData, StageData} import org.apache.spark.streaming.Time import org.apache.spark.streaming.ui.StreamingJobProgressListener.SparkJobId import org.apache.spark.ui.{UIUtils => SparkUIUtils, WebUIPage} -import org.apache.spark.ui.jobs.UIData.JobUIData -private[ui] case class SparkJobIdWithUIData(sparkJobId: SparkJobId, jobUIData: Option[JobUIData]) +private[ui] case class SparkJobIdWithUIData(sparkJobId: SparkJobId, jobData: Option[JobData]) private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { private val streamingListener = parent.listener - private val sparkListener = parent.ssc.sc.jobProgressListener + private val store = parent.parent.store private def columns: Seq[Node] = { Output Op Id @@ -52,13 +52,13 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { formattedOutputOpDuration: String, numSparkJobRowsInOutputOp: Int, isFirstRow: Boolean, - sparkJob: SparkJobIdWithUIData): Seq[Node] = { - if (sparkJob.jobUIData.isDefined) { + jobIdWithData: SparkJobIdWithUIData): Seq[Node] = { + if (jobIdWithData.jobData.isDefined) { generateNormalJobRow(outputOpData, outputOpDescription, formattedOutputOpDuration, - numSparkJobRowsInOutputOp, isFirstRow, sparkJob.jobUIData.get) + numSparkJobRowsInOutputOp, isFirstRow, jobIdWithData.jobData.get) } else { generateDroppedJobRow(outputOpData, outputOpDescription, formattedOutputOpDuration, - numSparkJobRowsInOutputOp, isFirstRow, sparkJob.sparkJobId) + numSparkJobRowsInOutputOp, isFirstRow, jobIdWithData.sparkJobId) } } @@ -94,15 +94,15 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { formattedOutputOpDuration: String, numSparkJobRowsInOutputOp: Int, isFirstRow: Boolean, - sparkJob: JobUIData): Seq[Node] = { + sparkJob: JobData): Seq[Node] = { val duration: Option[Long] = { sparkJob.submissionTime.map { start => - val end = sparkJob.completionTime.getOrElse(System.currentTimeMillis()) - end - start + val end = sparkJob.completionTime.map(_.getTime()).getOrElse(System.currentTimeMillis()) + end - start.getTime() } } val lastFailureReason = - sparkJob.stageIds.sorted.reverse.flatMap(sparkListener.stageIdToInfo.get). + sparkJob.stageIds.sorted.reverse.flatMap(getStageData). dropWhile(_.failureReason == None).take(1). 
// get the first info that contains failure flatMap(info => info.failureReason).headOption.getOrElse("") val formattedDuration = duration.map(d => SparkUIUtils.formatDuration(d)).getOrElse("-") @@ -135,7 +135,7 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { {formattedDuration} - {sparkJob.completedStageIndices.size}/{sparkJob.stageIds.size - sparkJob.numSkippedStages} + {sparkJob.numCompletedStages}/{sparkJob.stageIds.size - sparkJob.numSkippedStages} {if (sparkJob.numFailedStages > 0) s"(${sparkJob.numFailedStages} failed)"} {if (sparkJob.numSkippedStages > 0) s"(${sparkJob.numSkippedStages} skipped)"} @@ -146,7 +146,7 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { completed = sparkJob.numCompletedTasks, failed = sparkJob.numFailedTasks, skipped = sparkJob.numSkippedTasks, - reasonToNumKilled = sparkJob.reasonToNumKilled, + reasonToNumKilled = sparkJob.killedTasksSummary, total = sparkJob.numTasks - sparkJob.numSkippedTasks) } @@ -246,11 +246,19 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { } - private def getJobData(sparkJobId: SparkJobId): Option[JobUIData] = { - sparkListener.activeJobs.get(sparkJobId).orElse { - sparkListener.completedJobs.find(_.jobId == sparkJobId).orElse { - sparkListener.failedJobs.find(_.jobId == sparkJobId) - } + private def getJobData(sparkJobId: SparkJobId): Option[JobData] = { + try { + Some(store.job(sparkJobId)) + } catch { + case _: NoSuchElementException => None + } + } + + private def getStageData(stageId: Int): Option[StageData] = { + try { + Some(store.lastStageAttempt(stageId)) + } catch { + case _: NoSuchElementException => None } } @@ -282,25 +290,22 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { val sparkJobIds = outputOpIdToSparkJobIds.getOrElse(outputOpId, Seq.empty) (outputOperation, sparkJobIds) }.toSeq.sortBy(_._1.id) - sparkListener.synchronized { - val outputOpWithJobs = outputOps.map { case (outputOpData, sparkJobIds) => - (outputOpData, - sparkJobIds.map(sparkJobId => SparkJobIdWithUIData(sparkJobId, getJobData(sparkJobId)))) - } + val outputOpWithJobs = outputOps.map { case (outputOpData, sparkJobIds) => + (outputOpData, sparkJobIds.map { jobId => SparkJobIdWithUIData(jobId, getJobData(jobId)) }) + } - - - {columns} - - - { - outputOpWithJobs.map { case (outputOpData, sparkJobIds) => - generateOutputOpIdRow(outputOpData, sparkJobIds) - } +
+ + {columns} + + + { + outputOpWithJobs.map { case (outputOpData, sparkJobs) => + generateOutputOpIdRow(outputOpData, sparkJobs) } - -
- } + } + + }

   def render(request: HttpServletRequest): Seq[Node] = streamingListener.synchronized {

From ab6f60c4d6417cbb0240216a6b492aadcca3043e Mon Sep 17 00:00:00 2001
From: Jakub Dubovsky
Date: Thu, 30 Nov 2017 10:24:30 +0900
Subject: [PATCH 009/356] [SPARK-22585][CORE] Path in addJar is not url encoded

## What changes were proposed in this pull request?

This updates the behavior of the `addJar` method of the `SparkContext` class. If a path without any scheme is passed as input, it is used literally, without URL encoding or decoding it.

## How was this patch tested?

A unit test is added for this.

Author: Jakub Dubovsky

Closes #19834 from james64/SPARK-22585-encode-add-jar.
---
 .../main/scala/org/apache/spark/SparkContext.scala  |  6 +++++-
 .../scala/org/apache/spark/SparkContextSuite.scala  | 11 +++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 984dd0a6629a2..c174939ca2e54 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1837,7 +1837,11 @@ class SparkContext(config: SparkConf) extends Logging {
       Utils.validateURL(uri)
       uri.getScheme match {
         // A JAR file which exists only on the driver node
-        case null | "file" => addJarFile(new File(uri.getPath))
+        case null =>
+          // SPARK-22585 path without schema is not url encoded
+          addJarFile(new File(uri.getRawPath))
+        // A JAR file which exists only on the driver node
+        case "file" => addJarFile(new File(uri.getPath))
         // A JAR file which exists locally on every worker node
         case "local" => "file:" + uri.getPath
         case _ => path
diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
index 0ed5f26863dad..2bde8757dae5d 100644
--- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala
@@ -309,6 +309,17 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu
     assert(sc.listJars().head.contains(tmpJar.getName))
   }

+  test("SPARK-22585 addJar argument without scheme is interpreted literally without url decoding") {
+    val tmpDir = new File(Utils.createTempDir(), "host%3A443")
+    tmpDir.mkdirs()
+    val tmpJar = File.createTempFile("t%2F", ".jar", tmpDir)
+
+    sc = new SparkContext("local", "test")
+
+    sc.addJar(tmpJar.getAbsolutePath)
+    assert(sc.listJars().size === 1)
+  }
+
   test("Cancelling job group should not cause SparkContext to shutdown (SPARK-6414)") {
     try {
       sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local"))

From 92cfbeeb5ce9e2c618a76b3fe60ce84b9d38605b Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Thu, 30 Nov 2017 10:26:55 +0900
Subject: [PATCH 010/356] [SPARK-21866][ML][PYTHON][FOLLOWUP] Few cleanups and fix image test failure in Python 3.6.0 / NumPy 1.13.3

## What changes were proposed in this pull request?

The image test seems to fail in Python 3.6.0 / NumPy 1.13.3.
I manually tested as below: ``` ====================================================================== ERROR: test_read_images (pyspark.ml.tests.ImageReaderTest) ---------------------------------------------------------------------- Traceback (most recent call last): File "/.../spark/python/pyspark/ml/tests.py", line 1831, in test_read_images self.assertEqual(ImageSchema.toImage(array, origin=first_row[0]), first_row) File "/.../spark/python/pyspark/ml/image.py", line 149, in toImage data = bytearray(array.astype(dtype=np.uint8).ravel()) TypeError: only integer scalar arrays can be converted to a scalar index ---------------------------------------------------------------------- Ran 1 test in 7.606s ``` To be clear, I think the error seems from NumPy - https://github.com/numpy/numpy/blob/75b2d5d427afdb1392f2a0b2092e0767e4bab53d/numpy/core/src/multiarray/number.c#L947 For a smaller scope: ```python >>> import numpy as np >>> bytearray(np.array([1]).astype(dtype=np.uint8)) Traceback (most recent call last): File "", line 1, in TypeError: only integer scalar arrays can be converted to a scalar index ``` In Python 2.7 / NumPy 1.13.1, it prints: ``` bytearray(b'\x01') ``` So, here, I simply worked around it by converting it to bytes as below: ```python >>> bytearray(np.array([1]).astype(dtype=np.uint8).tobytes()) bytearray(b'\x01') ``` Also, while looking into it again, I realised few arguments could be quite confusing, for example, `Row` that needs some specific attributes and `numpy.ndarray`. I added few type checking and added some tests accordingly. So, it shows an error message as below: ``` TypeError: array argument should be numpy.ndarray; however, it got []. ``` ## How was this patch tested? Manually tested with `./python/run-tests`. And also: ``` PYSPARK_PYTHON=python3 SPARK_TESTING=1 bin/pyspark pyspark.ml.tests ImageReaderTest ``` Author: hyukjinkwon Closes #19835 from HyukjinKwon/SPARK-21866-followup. --- python/pyspark/ml/image.py | 27 ++++++++++++++++++++++++--- python/pyspark/ml/tests.py | 20 +++++++++++++++++++- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index 7d14f05295572..2b61aa9c0d9e9 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -108,12 +108,23 @@ def toNDArray(self, image): """ Converts an image to an array with metadata. - :param image: The image to be converted. + :param `Row` image: A row that contains the image to be converted. It should + have the attributes specified in `ImageSchema.imageSchema`. :return: a `numpy.ndarray` that is an image. .. versionadded:: 2.3.0 """ + if not isinstance(image, Row): + raise TypeError( + "image argument should be pyspark.sql.types.Row; however, " + "it got [%s]." % type(image)) + + if any(not hasattr(image, f) for f in self.imageFields): + raise ValueError( + "image argument should have attributes specified in " + "ImageSchema.imageSchema [%s]." % ", ".join(self.imageFields)) + height = image.height width = image.width nChannels = image.nChannels @@ -127,15 +138,20 @@ def toImage(self, array, origin=""): """ Converts an array with metadata to a two-dimensional image. - :param array array: The array to convert to image. + :param `numpy.ndarray` array: The array to convert to image. :param str origin: Path to the image, optional. :return: a :class:`Row` that is a two dimensional image. .. 
versionadded:: 2.3.0 """ + if not isinstance(array, np.ndarray): + raise TypeError( + "array argument should be numpy.ndarray; however, it got [%s]." % type(array)) + if array.ndim != 3: raise ValueError("Invalid array shape") + height, width, nChannels = array.shape ocvTypes = ImageSchema.ocvTypes if nChannels == 1: @@ -146,7 +162,12 @@ def toImage(self, array, origin=""): mode = ocvTypes["CV_8UC4"] else: raise ValueError("Invalid number of channels") - data = bytearray(array.astype(dtype=np.uint8).ravel()) + + # Running `bytearray(numpy.array([1]))` fails in specific Python versions + # with a specific Numpy version, for example in Python 3.6.0 and NumPy 1.13.3. + # Here, it avoids it by converting it to bytes. + data = bytearray(array.astype(dtype=np.uint8).ravel().tobytes()) + # Creating new Row with _create_row(), because Row(name = value, ... ) # orders fields by name, which conflicts with expected schema order # when the new DataFrame is created by UDF diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 2258d61c95333..89ef555cf3442 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -71,7 +71,7 @@ from pyspark.sql.functions import rand from pyspark.sql.types import DoubleType, IntegerType from pyspark.storagelevel import * -from pyspark.tests import ReusedPySparkTestCase as PySparkTestCase +from pyspark.tests import QuietTest, ReusedPySparkTestCase as PySparkTestCase ser = PickleSerializer() @@ -1836,6 +1836,24 @@ def test_read_images(self): self.assertEqual(ImageSchema.imageFields, expected) self.assertEqual(ImageSchema.undefinedImageType, "Undefined") + with QuietTest(self.sc): + self.assertRaisesRegexp( + TypeError, + "image argument should be pyspark.sql.types.Row; however", + lambda: ImageSchema.toNDArray("a")) + + with QuietTest(self.sc): + self.assertRaisesRegexp( + ValueError, + "image argument should have attributes specified in", + lambda: ImageSchema.toNDArray(Row(a=1))) + + with QuietTest(self.sc): + self.assertRaisesRegexp( + TypeError, + "array argument should be numpy.ndarray; however, it got", + lambda: ImageSchema.toImage("a")) + class ALSTest(SparkSessionTestCase): From 444a2bbb67c2548d121152bc922b4c3337ddc8e8 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 30 Nov 2017 18:28:58 +0800 Subject: [PATCH 011/356] [SPARK-22652][SQL] remove set methods in ColumnarRow ## What changes were proposed in this pull request? As a step to make `ColumnVector` public, the `ColumnarRow` returned by `ColumnVector#getStruct` should be immutable. However we do need the mutability of `ColumnaRow` for the fast vectorized hashmap in hash aggregate. To solve this, this PR introduces a `MutableColumnarRow` for this use case. ## How was this patch tested? existing test. Author: Wenchen Fan Closes #19847 from cloud-fan/mutable-row. 
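To make the intent of the new class concrete before the diff, here is a minimal, self-contained sketch of the split — purely illustrative, with hypothetical names and a single int column instead of the real `ColumnarRow`/`MutableColumnarRow`/`WritableColumnVector` classes:

```java
// Hypothetical sketch: the read-only row view rejects writes, while a separate final
// mutable row (used only by the vectorized hash map) repositions itself over the
// backing vector via a public rowId field instead of allocating a row per lookup.
public class RowViewsSketch {
  static final class IntVector {              // stand-in for a writable column vector
    final int[] data;
    IntVector(int capacity) { data = new int[capacity]; }
  }

  static final class ReadOnlyRow {            // analogous to the now-immutable row
    private final IntVector vector;
    private final int rowId;
    ReadOnlyRow(IntVector vector, int rowId) { this.vector = vector; this.rowId = rowId; }
    int getInt() { return vector.data[rowId]; }
    void setInt(int value) { throw new UnsupportedOperationException(); }
  }

  static final class MutableRow {             // analogous to the new mutable row
    private final IntVector vector;
    int rowId;                                // repositionable without new allocations
    MutableRow(IntVector vector) { this.vector = vector; }
    int getInt() { return vector.data[rowId]; }
    void setInt(int value) { vector.data[rowId] = value; }
  }

  public static void main(String[] args) {
    IntVector vector = new IntVector(4);
    MutableRow writer = new MutableRow(vector);
    for (int i = 0; i < 4; i++) { writer.rowId = i; writer.setInt(10 - i); }
    System.out.println(new ReadOnlyRow(vector, 0).getInt()); // prints 10; setInt would throw
  }
}
```

Keeping both row classes final, as the patch does, also avoids the virtual-call overhead that a shared mutable superclass would reintroduce.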
--- .../sql/execution/vectorized/ColumnarRow.java | 102 +------ .../vectorized/MutableColumnarRow.java | 278 ++++++++++++++++++ .../aggregate/HashAggregateExec.scala | 3 +- .../VectorizedHashMapGenerator.scala | 82 +++--- .../vectorized/ColumnVectorSuite.scala | 12 + .../vectorized/ColumnarBatchSuite.scala | 23 -- 6 files changed, 336 insertions(+), 164 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java index 98a907322713b..cabb7479525d9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java @@ -16,8 +16,6 @@ */ package org.apache.spark.sql.execution.vectorized; -import java.math.BigDecimal; - import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; import org.apache.spark.sql.catalyst.util.MapData; @@ -32,17 +30,10 @@ public final class ColumnarRow extends InternalRow { protected int rowId; private final ColumnVector[] columns; - private final WritableColumnVector[] writableColumns; // Ctor used if this is a struct. ColumnarRow(ColumnVector[] columns) { this.columns = columns; - this.writableColumns = new WritableColumnVector[this.columns.length]; - for (int i = 0; i < this.columns.length; i++) { - if (this.columns[i] instanceof WritableColumnVector) { - this.writableColumns[i] = (WritableColumnVector) this.columns[i]; - } - } } public ColumnVector[] columns() { return columns; } @@ -205,97 +196,8 @@ public Object get(int ordinal, DataType dataType) { } @Override - public void update(int ordinal, Object value) { - if (value == null) { - setNullAt(ordinal); - } else { - DataType dt = columns[ordinal].dataType(); - if (dt instanceof BooleanType) { - setBoolean(ordinal, (boolean) value); - } else if (dt instanceof IntegerType) { - setInt(ordinal, (int) value); - } else if (dt instanceof ShortType) { - setShort(ordinal, (short) value); - } else if (dt instanceof LongType) { - setLong(ordinal, (long) value); - } else if (dt instanceof FloatType) { - setFloat(ordinal, (float) value); - } else if (dt instanceof DoubleType) { - setDouble(ordinal, (double) value); - } else if (dt instanceof DecimalType) { - DecimalType t = (DecimalType) dt; - setDecimal(ordinal, Decimal.apply((BigDecimal) value, t.precision(), t.scale()), - t.precision()); - } else { - throw new UnsupportedOperationException("Datatype not supported " + dt); - } - } - } - - @Override - public void setNullAt(int ordinal) { - getWritableColumn(ordinal).putNull(rowId); - } - - @Override - public void setBoolean(int ordinal, boolean value) { - WritableColumnVector column = getWritableColumn(ordinal); - column.putNotNull(rowId); - column.putBoolean(rowId, value); - } + public void update(int ordinal, Object value) { throw new UnsupportedOperationException(); } @Override - public void setByte(int ordinal, byte value) { - WritableColumnVector column = getWritableColumn(ordinal); - column.putNotNull(rowId); - column.putByte(rowId, value); - } - - @Override - public void setShort(int ordinal, short value) { - WritableColumnVector column = getWritableColumn(ordinal); - column.putNotNull(rowId); - column.putShort(rowId, value); - } - - @Override - public void setInt(int ordinal, int value) { - 
WritableColumnVector column = getWritableColumn(ordinal); - column.putNotNull(rowId); - column.putInt(rowId, value); - } - - @Override - public void setLong(int ordinal, long value) { - WritableColumnVector column = getWritableColumn(ordinal); - column.putNotNull(rowId); - column.putLong(rowId, value); - } - - @Override - public void setFloat(int ordinal, float value) { - WritableColumnVector column = getWritableColumn(ordinal); - column.putNotNull(rowId); - column.putFloat(rowId, value); - } - - @Override - public void setDouble(int ordinal, double value) { - WritableColumnVector column = getWritableColumn(ordinal); - column.putNotNull(rowId); - column.putDouble(rowId, value); - } - - @Override - public void setDecimal(int ordinal, Decimal value, int precision) { - WritableColumnVector column = getWritableColumn(ordinal); - column.putNotNull(rowId); - column.putDecimal(rowId, value, precision); - } - - private WritableColumnVector getWritableColumn(int ordinal) { - WritableColumnVector column = writableColumns[ordinal]; - assert (!column.isConstant); - return column; - } + public void setNullAt(int ordinal) { throw new UnsupportedOperationException(); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java new file mode 100644 index 0000000000000..f272cc163611b --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java @@ -0,0 +1,278 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.vectorized; + +import java.math.BigDecimal; + +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; +import org.apache.spark.sql.catalyst.util.MapData; +import org.apache.spark.sql.types.*; +import org.apache.spark.unsafe.types.CalendarInterval; +import org.apache.spark.unsafe.types.UTF8String; + +/** + * A mutable version of {@link ColumnarRow}, which is used in the vectorized hash map for hash + * aggregate. + * + * Note that this class intentionally has a lot of duplicated code with {@link ColumnarRow}, to + * avoid java polymorphism overhead by keeping {@link ColumnarRow} and this class final classes. 
+ */ +public final class MutableColumnarRow extends InternalRow { + public int rowId; + private final WritableColumnVector[] columns; + + public MutableColumnarRow(WritableColumnVector[] columns) { + this.columns = columns; + } + + @Override + public int numFields() { return columns.length; } + + @Override + public InternalRow copy() { + GenericInternalRow row = new GenericInternalRow(columns.length); + for (int i = 0; i < numFields(); i++) { + if (isNullAt(i)) { + row.setNullAt(i); + } else { + DataType dt = columns[i].dataType(); + if (dt instanceof BooleanType) { + row.setBoolean(i, getBoolean(i)); + } else if (dt instanceof ByteType) { + row.setByte(i, getByte(i)); + } else if (dt instanceof ShortType) { + row.setShort(i, getShort(i)); + } else if (dt instanceof IntegerType) { + row.setInt(i, getInt(i)); + } else if (dt instanceof LongType) { + row.setLong(i, getLong(i)); + } else if (dt instanceof FloatType) { + row.setFloat(i, getFloat(i)); + } else if (dt instanceof DoubleType) { + row.setDouble(i, getDouble(i)); + } else if (dt instanceof StringType) { + row.update(i, getUTF8String(i).copy()); + } else if (dt instanceof BinaryType) { + row.update(i, getBinary(i)); + } else if (dt instanceof DecimalType) { + DecimalType t = (DecimalType)dt; + row.setDecimal(i, getDecimal(i, t.precision(), t.scale()), t.precision()); + } else if (dt instanceof DateType) { + row.setInt(i, getInt(i)); + } else if (dt instanceof TimestampType) { + row.setLong(i, getLong(i)); + } else { + throw new RuntimeException("Not implemented. " + dt); + } + } + } + return row; + } + + @Override + public boolean anyNull() { + throw new UnsupportedOperationException(); + } + + @Override + public boolean isNullAt(int ordinal) { return columns[ordinal].isNullAt(rowId); } + + @Override + public boolean getBoolean(int ordinal) { return columns[ordinal].getBoolean(rowId); } + + @Override + public byte getByte(int ordinal) { return columns[ordinal].getByte(rowId); } + + @Override + public short getShort(int ordinal) { return columns[ordinal].getShort(rowId); } + + @Override + public int getInt(int ordinal) { return columns[ordinal].getInt(rowId); } + + @Override + public long getLong(int ordinal) { return columns[ordinal].getLong(rowId); } + + @Override + public float getFloat(int ordinal) { return columns[ordinal].getFloat(rowId); } + + @Override + public double getDouble(int ordinal) { return columns[ordinal].getDouble(rowId); } + + @Override + public Decimal getDecimal(int ordinal, int precision, int scale) { + if (columns[ordinal].isNullAt(rowId)) return null; + return columns[ordinal].getDecimal(rowId, precision, scale); + } + + @Override + public UTF8String getUTF8String(int ordinal) { + if (columns[ordinal].isNullAt(rowId)) return null; + return columns[ordinal].getUTF8String(rowId); + } + + @Override + public byte[] getBinary(int ordinal) { + if (columns[ordinal].isNullAt(rowId)) return null; + return columns[ordinal].getBinary(rowId); + } + + @Override + public CalendarInterval getInterval(int ordinal) { + if (columns[ordinal].isNullAt(rowId)) return null; + final int months = columns[ordinal].getChildColumn(0).getInt(rowId); + final long microseconds = columns[ordinal].getChildColumn(1).getLong(rowId); + return new CalendarInterval(months, microseconds); + } + + @Override + public ColumnarRow getStruct(int ordinal, int numFields) { + if (columns[ordinal].isNullAt(rowId)) return null; + return columns[ordinal].getStruct(rowId); + } + + @Override + public ColumnarArray getArray(int ordinal) { + if 
(columns[ordinal].isNullAt(rowId)) return null; + return columns[ordinal].getArray(rowId); + } + + @Override + public MapData getMap(int ordinal) { + throw new UnsupportedOperationException(); + } + + @Override + public Object get(int ordinal, DataType dataType) { + if (dataType instanceof BooleanType) { + return getBoolean(ordinal); + } else if (dataType instanceof ByteType) { + return getByte(ordinal); + } else if (dataType instanceof ShortType) { + return getShort(ordinal); + } else if (dataType instanceof IntegerType) { + return getInt(ordinal); + } else if (dataType instanceof LongType) { + return getLong(ordinal); + } else if (dataType instanceof FloatType) { + return getFloat(ordinal); + } else if (dataType instanceof DoubleType) { + return getDouble(ordinal); + } else if (dataType instanceof StringType) { + return getUTF8String(ordinal); + } else if (dataType instanceof BinaryType) { + return getBinary(ordinal); + } else if (dataType instanceof DecimalType) { + DecimalType t = (DecimalType) dataType; + return getDecimal(ordinal, t.precision(), t.scale()); + } else if (dataType instanceof DateType) { + return getInt(ordinal); + } else if (dataType instanceof TimestampType) { + return getLong(ordinal); + } else if (dataType instanceof ArrayType) { + return getArray(ordinal); + } else if (dataType instanceof StructType) { + return getStruct(ordinal, ((StructType)dataType).fields().length); + } else if (dataType instanceof MapType) { + return getMap(ordinal); + } else { + throw new UnsupportedOperationException("Datatype not supported " + dataType); + } + } + + @Override + public void update(int ordinal, Object value) { + if (value == null) { + setNullAt(ordinal); + } else { + DataType dt = columns[ordinal].dataType(); + if (dt instanceof BooleanType) { + setBoolean(ordinal, (boolean) value); + } else if (dt instanceof IntegerType) { + setInt(ordinal, (int) value); + } else if (dt instanceof ShortType) { + setShort(ordinal, (short) value); + } else if (dt instanceof LongType) { + setLong(ordinal, (long) value); + } else if (dt instanceof FloatType) { + setFloat(ordinal, (float) value); + } else if (dt instanceof DoubleType) { + setDouble(ordinal, (double) value); + } else if (dt instanceof DecimalType) { + DecimalType t = (DecimalType) dt; + Decimal d = Decimal.apply((BigDecimal) value, t.precision(), t.scale()); + setDecimal(ordinal, d, t.precision()); + } else { + throw new UnsupportedOperationException("Datatype not supported " + dt); + } + } + } + + @Override + public void setNullAt(int ordinal) { + columns[ordinal].putNull(rowId); + } + + @Override + public void setBoolean(int ordinal, boolean value) { + columns[ordinal].putNotNull(rowId); + columns[ordinal].putBoolean(rowId, value); + } + + @Override + public void setByte(int ordinal, byte value) { + columns[ordinal].putNotNull(rowId); + columns[ordinal].putByte(rowId, value); + } + + @Override + public void setShort(int ordinal, short value) { + columns[ordinal].putNotNull(rowId); + columns[ordinal].putShort(rowId, value); + } + + @Override + public void setInt(int ordinal, int value) { + columns[ordinal].putNotNull(rowId); + columns[ordinal].putInt(rowId, value); + } + + @Override + public void setLong(int ordinal, long value) { + columns[ordinal].putNotNull(rowId); + columns[ordinal].putLong(rowId, value); + } + + @Override + public void setFloat(int ordinal, float value) { + columns[ordinal].putNotNull(rowId); + columns[ordinal].putFloat(rowId, value); + } + + @Override + public void setDouble(int ordinal, double value) { + 
columns[ordinal].putNotNull(rowId); + columns[ordinal].putDouble(rowId, value); + } + + @Override + public void setDecimal(int ordinal, Decimal value, int precision) { + columns[ordinal].putNotNull(rowId); + columns[ordinal].putDecimal(rowId, value, precision); + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index dc8aecf185a96..913978892cd8c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.execution.vectorized.MutableColumnarRow import org.apache.spark.sql.types.{DecimalType, StringType, StructType} import org.apache.spark.unsafe.KVIterator import org.apache.spark.util.Utils @@ -894,7 +895,7 @@ case class HashAggregateExec( ${ if (isVectorizedHashMapEnabled) { s""" - | org.apache.spark.sql.execution.vectorized.ColumnarRow $fastRowBuffer = null; + | ${classOf[MutableColumnarRow].getName} $fastRowBuffer = null; """.stripMargin } else { s""" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala index fd783d905b776..44ba539ebf7c2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext +import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, ColumnarRow, MutableColumnarRow, OnHeapColumnVector} import org.apache.spark.sql.types._ /** @@ -76,10 +77,9 @@ class VectorizedHashMapGenerator( }.mkString("\n").concat(";") s""" - | private org.apache.spark.sql.execution.vectorized.OnHeapColumnVector[] batchVectors; - | private org.apache.spark.sql.execution.vectorized.OnHeapColumnVector[] bufferVectors; - | private org.apache.spark.sql.execution.vectorized.ColumnarBatch batch; - | private org.apache.spark.sql.execution.vectorized.ColumnarBatch aggregateBufferBatch; + | private ${classOf[OnHeapColumnVector].getName}[] vectors; + | private ${classOf[ColumnarBatch].getName} batch; + | private ${classOf[MutableColumnarRow].getName} aggBufferRow; | private int[] buckets; | private int capacity = 1 << 16; | private double loadFactor = 0.5; @@ -91,19 +91,16 @@ class VectorizedHashMapGenerator( | $generatedAggBufferSchema | | public $generatedClassName() { - | batchVectors = org.apache.spark.sql.execution.vectorized - | .OnHeapColumnVector.allocateColumns(capacity, schema); - | batch = new org.apache.spark.sql.execution.vectorized.ColumnarBatch( - | schema, batchVectors, capacity); + | vectors = ${classOf[OnHeapColumnVector].getName}.allocateColumns(capacity, schema); + | batch = new ${classOf[ColumnarBatch].getName}(schema, vectors, capacity); | - | bufferVectors = new org.apache.spark.sql.execution.vectorized - | 
.OnHeapColumnVector[aggregateBufferSchema.fields().length]; + | // Generates a projection to return the aggregate buffer only. + | ${classOf[OnHeapColumnVector].getName}[] aggBufferVectors = + | new ${classOf[OnHeapColumnVector].getName}[aggregateBufferSchema.fields().length]; | for (int i = 0; i < aggregateBufferSchema.fields().length; i++) { - | bufferVectors[i] = batchVectors[i + ${groupingKeys.length}]; + | aggBufferVectors[i] = vectors[i + ${groupingKeys.length}]; | } - | // TODO: Possibly generate this projection in HashAggregate directly - | aggregateBufferBatch = new org.apache.spark.sql.execution.vectorized.ColumnarBatch( - | aggregateBufferSchema, bufferVectors, capacity); + | aggBufferRow = new ${classOf[MutableColumnarRow].getName}(aggBufferVectors); | | buckets = new int[numBuckets]; | java.util.Arrays.fill(buckets, -1); @@ -114,13 +111,13 @@ class VectorizedHashMapGenerator( /** * Generates a method that returns true if the group-by keys exist at a given index in the - * associated [[org.apache.spark.sql.execution.vectorized.ColumnarBatch]]. For instance, if we - * have 2 long group-by keys, the generated function would be of the form: + * associated [[org.apache.spark.sql.execution.vectorized.OnHeapColumnVector]]. For instance, + * if we have 2 long group-by keys, the generated function would be of the form: * * {{{ * private boolean equals(int idx, long agg_key, long agg_key1) { - * return batchVectors[0].getLong(buckets[idx]) == agg_key && - * batchVectors[1].getLong(buckets[idx]) == agg_key1; + * return vectors[0].getLong(buckets[idx]) == agg_key && + * vectors[1].getLong(buckets[idx]) == agg_key1; * } * }}} */ @@ -128,7 +125,7 @@ class VectorizedHashMapGenerator( def genEqualsForKeys(groupingKeys: Seq[Buffer]): String = { groupingKeys.zipWithIndex.map { case (key: Buffer, ordinal: Int) => - s"""(${ctx.genEqual(key.dataType, ctx.getValue(s"batchVectors[$ordinal]", "buckets[idx]", + s"""(${ctx.genEqual(key.dataType, ctx.getValue(s"vectors[$ordinal]", "buckets[idx]", key.dataType), key.name)})""" }.mkString(" && ") } @@ -141,29 +138,35 @@ class VectorizedHashMapGenerator( } /** - * Generates a method that returns a mutable - * [[org.apache.spark.sql.execution.vectorized.ColumnarRow]] which keeps track of the + * Generates a method that returns a + * [[org.apache.spark.sql.execution.vectorized.MutableColumnarRow]] which keeps track of the * aggregate value(s) for a given set of keys. If the corresponding row doesn't exist, the * generated method adds the corresponding row in the associated - * [[org.apache.spark.sql.execution.vectorized.ColumnarBatch]]. For instance, if we + * [[org.apache.spark.sql.execution.vectorized.OnHeapColumnVector]]. 
For instance, if we * have 2 long group-by keys, the generated function would be of the form: * * {{{ - * public org.apache.spark.sql.execution.vectorized.ColumnarRow findOrInsert( - * long agg_key, long agg_key1) { + * public MutableColumnarRow findOrInsert(long agg_key, long agg_key1) { * long h = hash(agg_key, agg_key1); * int step = 0; * int idx = (int) h & (numBuckets - 1); * while (step < maxSteps) { * // Return bucket index if it's either an empty slot or already contains the key * if (buckets[idx] == -1) { - * batchVectors[0].putLong(numRows, agg_key); - * batchVectors[1].putLong(numRows, agg_key1); - * batchVectors[2].putLong(numRows, 0); - * buckets[idx] = numRows++; - * return batch.getRow(buckets[idx]); + * if (numRows < capacity) { + * vectors[0].putLong(numRows, agg_key); + * vectors[1].putLong(numRows, agg_key1); + * vectors[2].putLong(numRows, 0); + * buckets[idx] = numRows++; + * aggBufferRow.rowId = numRows; + * return aggBufferRow; + * } else { + * // No more space + * return null; + * } * } else if (equals(idx, agg_key, agg_key1)) { - * return batch.getRow(buckets[idx]); + * aggBufferRow.rowId = buckets[idx]; + * return aggBufferRow; * } * idx = (idx + 1) & (numBuckets - 1); * step++; @@ -177,20 +180,19 @@ class VectorizedHashMapGenerator( def genCodeToSetKeys(groupingKeys: Seq[Buffer]): Seq[String] = { groupingKeys.zipWithIndex.map { case (key: Buffer, ordinal: Int) => - ctx.setValue(s"batchVectors[$ordinal]", "numRows", key.dataType, key.name) + ctx.setValue(s"vectors[$ordinal]", "numRows", key.dataType, key.name) } } def genCodeToSetAggBuffers(bufferValues: Seq[Buffer]): Seq[String] = { bufferValues.zipWithIndex.map { case (key: Buffer, ordinal: Int) => - ctx.updateColumn(s"batchVectors[${groupingKeys.length + ordinal}]", "numRows", key.dataType, + ctx.updateColumn(s"vectors[${groupingKeys.length + ordinal}]", "numRows", key.dataType, buffVars(ordinal), nullable = true) } } s""" - |public org.apache.spark.sql.execution.vectorized.ColumnarRow findOrInsert(${ - groupingKeySignature}) { + |public ${classOf[MutableColumnarRow].getName} findOrInsert($groupingKeySignature) { | long h = hash(${groupingKeys.map(_.name).mkString(", ")}); | int step = 0; | int idx = (int) h & (numBuckets - 1); @@ -208,15 +210,15 @@ class VectorizedHashMapGenerator( | ${genCodeToSetAggBuffers(bufferValues).mkString("\n")} | | buckets[idx] = numRows++; - | batch.setNumRows(numRows); - | aggregateBufferBatch.setNumRows(numRows); - | return aggregateBufferBatch.getRow(buckets[idx]); + | aggBufferRow.rowId = buckets[idx]; + | return aggBufferRow; | } else { | // No more space | return null; | } | } else if (equals(idx, ${groupingKeys.map(_.name).mkString(", ")})) { - | return aggregateBufferBatch.getRow(buckets[idx]); + | aggBufferRow.rowId = buckets[idx]; + | return aggBufferRow; | } | idx = (idx + 1) & (numBuckets - 1); | step++; @@ -229,8 +231,8 @@ class VectorizedHashMapGenerator( protected def generateRowIterator(): String = { s""" - |public java.util.Iterator - | rowIterator() { + |public java.util.Iterator<${classOf[ColumnarRow].getName}> rowIterator() { + | batch.setNumRows(numRows); | return batch.rowIterator(); |} """.stripMargin diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index 3c76ca79f5dda..e28ab710f5a99 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -163,6 +163,18 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { } } + testVectors("mutable ColumnarRow", 10, IntegerType) { testVector => + val mutableRow = new MutableColumnarRow(Array(testVector)) + (0 until 10).foreach { i => + mutableRow.rowId = i + mutableRow.setInt(0, 10 - i) + } + (0 until 10).foreach { i => + mutableRow.rowId = i + assert(mutableRow.getInt(0) === (10 - i)) + } + } + val arrayType: ArrayType = ArrayType(IntegerType, containsNull = true) testVectors("array", 10, arrayType) { testVector => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 80a50866aa504..1b4e2bad09a20 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -1129,29 +1129,6 @@ class ColumnarBatchSuite extends SparkFunSuite { testRandomRows(false, 30) } - test("mutable ColumnarBatch rows") { - val NUM_ITERS = 10 - val types = Array( - BooleanType, FloatType, DoubleType, IntegerType, LongType, ShortType, - DecimalType.ShortDecimal, DecimalType.IntDecimal, DecimalType.ByteDecimal, - DecimalType.FloatDecimal, DecimalType.LongDecimal, new DecimalType(5, 2), - new DecimalType(12, 2), new DecimalType(30, 10)) - for (i <- 0 to NUM_ITERS) { - val random = new Random(System.nanoTime()) - val schema = RandomDataGenerator.randomSchema(random, numFields = 20, types) - val oldRow = RandomDataGenerator.randomRow(random, schema) - val newRow = RandomDataGenerator.randomRow(random, schema) - - (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => - val batch = ColumnVectorUtils.toBatch(schema, memMode, (oldRow :: Nil).iterator.asJava) - val columnarBatchRow = batch.getRow(0) - newRow.toSeq.zipWithIndex.foreach(i => columnarBatchRow.update(i._2, i._1)) - compareStruct(schema, columnarBatchRow, newRow, 0) - batch.close() - } - } - } - test("exceeding maximum capacity should throw an error") { (MemoryMode.ON_HEAP :: MemoryMode.OFF_HEAP :: Nil).foreach { memMode => val column = allocate(1, ByteType, memMode) From 9c29c557635caf739fde942f53255273aac0d7b1 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 30 Nov 2017 18:34:38 +0800 Subject: [PATCH 012/356] [SPARK-22643][SQL] ColumnarArray should be an immutable view ## What changes were proposed in this pull request? To make `ColumnVector` public, `ColumnarArray` need to be public too, and we should not have mutable public fields in a public class. This PR proposes to make `ColumnarArray` an immutable view of the data, and always create a new instance of `ColumnarArray` in `ColumnVector#getArray` ## How was this patch tested? new benchmark in `ColumnarBatchBenchmark` Author: Wenchen Fan Closes #19842 from cloud-fan/column-vector. 
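As a rough illustration of the change (a hypothetical, single-int-column sketch rather than the real `ColumnVector`/`ColumnarArray` API): every `getArray` call now returns a fresh view whose backing data, offset and length are all final, instead of mutating a shared holder with public `offset`/`length` fields.

```java
// Hypothetical sketch: an immutable per-call array view over a columnar buffer.
public class ImmutableArrayViewSketch {
  static final class IntArrayView {
    private final int[] data;
    private final int offset;
    private final int length;
    IntArrayView(int[] data, int offset, int length) {
      this.data = data; this.offset = offset; this.length = length;
    }
    int numElements() { return length; }
    int getInt(int i) { return data[offset + i]; }
  }

  static final class IntVector {
    private final int[] elements = {0, 1, 2, 3, 4, 5}; // flattened child data
    private final int[] offsets = {0, 1, 3};           // per-row start positions
    private final int[] lengths = {1, 2, 3};           // per-row element counts
    IntArrayView getArray(int rowId) {                 // new immutable view per call
      return new IntArrayView(elements, offsets[rowId], lengths[rowId]);
    }
  }

  public static void main(String[] args) {
    IntArrayView a = new IntVector().getArray(1);
    System.out.println(a.numElements() + " " + a.getInt(0)); // prints "2 1"
  }
}
```

The trade-off is one small allocation per `getArray` call, which is what the new `Array Vector Read` benchmark below is meant to measure.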
--- .../parquet/VectorizedColumnReader.java | 2 +- .../vectorized/ArrowColumnVector.java | 1 - .../execution/vectorized/ColumnVector.java | 14 +- .../vectorized/ColumnVectorUtils.java | 18 +-- .../execution/vectorized/ColumnarArray.java | 10 +- .../vectorized/OffHeapColumnVector.java | 2 +- .../vectorized/OnHeapColumnVector.java | 2 +- .../vectorized/WritableColumnVector.java | 13 +- .../vectorized/ColumnVectorSuite.scala | 41 +++-- .../vectorized/ColumnarBatchBenchmark.scala | 142 ++++++++++++++---- .../vectorized/ColumnarBatchSuite.scala | 18 +-- 11 files changed, 164 insertions(+), 99 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 0f1f470dc597e..71ca8b1b96a98 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -425,7 +425,7 @@ private void readBinaryBatch(int rowId, int num, WritableColumnVector column) { // This is where we implement support for the valid type conversions. // TODO: implement remaining type conversions VectorizedValuesReader data = (VectorizedValuesReader) dataColumn; - if (column.isArray()) { + if (column.dataType() == DataTypes.StringType || column.dataType() == DataTypes.BinaryType) { defColumn.readBinarys(num, column, rowId, maxDefLevel, data); } else if (column.dataType() == DataTypes.TimestampType) { for (int i = 0; i < num; i++) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java index 5c502c9d91be4..0071bd66760be 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java @@ -315,7 +315,6 @@ public ArrowColumnVector(ValueVector vector) { childColumns = new ArrowColumnVector[1]; childColumns[0] = new ArrowColumnVector(listVector.getDataVector()); - resultArray = new ColumnarArray(childColumns[0]); } else if (vector instanceof MapVector) { MapVector mapVector = (MapVector) vector; accessor = new StructAccessor(mapVector); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java index 940457f2e3363..cca14911fbb28 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java @@ -175,9 +175,7 @@ public ColumnarRow getStruct(int rowId, int size) { * Returns the array at rowid. */ public final ColumnarArray getArray(int rowId) { - resultArray.length = getArrayLength(rowId); - resultArray.offset = getArrayOffset(rowId); - return resultArray; + return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId)); } /** @@ -213,21 +211,11 @@ public MapData getMap(int ordinal) { */ public abstract ColumnVector getChildColumn(int ordinal); - /** - * Returns true if this column is an array. - */ - public final boolean isArray() { return resultArray != null; } - /** * Data type for this column. */ protected DataType type; - /** - * Reusable Array holder for getArray(). 
- */ - protected ColumnarArray resultArray; - /** * Reusable Struct holder for getStruct(). */ diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java index b4b5f0a265934..bc62bc43484e5 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java @@ -98,21 +98,13 @@ public static void populate(WritableColumnVector col, InternalRow row, int field * For example, an array of IntegerType will return an int[]. * Throws exceptions for unhandled schemas. */ - public static Object toPrimitiveJavaArray(ColumnarArray array) { - DataType dt = array.data.dataType(); - if (dt instanceof IntegerType) { - int[] result = new int[array.length]; - ColumnVector data = array.data; - for (int i = 0; i < result.length; i++) { - if (data.isNullAt(array.offset + i)) { - throw new RuntimeException("Cannot handle NULL values."); - } - result[i] = data.getInt(array.offset + i); + public static int[] toJavaIntArray(ColumnarArray array) { + for (int i = 0; i < array.numElements(); i++) { + if (array.isNullAt(i)) { + throw new RuntimeException("Cannot handle NULL values."); } - return result; - } else { - throw new UnsupportedOperationException(); } + return array.toIntArray(); } private static void appendValue(WritableColumnVector dst, DataType t, Object o) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java index b9da641fc66c8..cbc39d1d0aec2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java @@ -29,12 +29,14 @@ public final class ColumnarArray extends ArrayData { // The data for this array. This array contains elements from // data[offset] to data[offset + length). - public final ColumnVector data; - public int length; - public int offset; + private final ColumnVector data; + private final int offset; + private final int length; - ColumnarArray(ColumnVector data) { + ColumnarArray(ColumnVector data, int offset, int length) { this.data = data; + this.offset = offset; + this.length = length; } @Override diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 1cbaf08569334..806d0291a6c49 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -532,7 +532,7 @@ public int putByteArray(int rowId, byte[] value, int offset, int length) { @Override protected void reserveInternal(int newCapacity) { int oldCapacity = (nulls == 0L) ? 
0 : capacity; - if (this.resultArray != null) { + if (isArray()) { this.lengthData = Platform.reallocateMemory(lengthData, oldCapacity * 4, newCapacity * 4); this.offsetData = diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 85d72295ab9b8..6e7f74ce12f16 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -505,7 +505,7 @@ public int putByteArray(int rowId, byte[] value, int offset, int length) { // Spilt this function out since it is the slow path. @Override protected void reserveInternal(int newCapacity) { - if (this.resultArray != null || DecimalType.isByteArrayDecimalType(type)) { + if (isArray()) { int[] newLengths = new int[newCapacity]; int[] newOffsets = new int[newCapacity]; if (this.arrayLengths != null) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index e7653f0c00b9a..0bea4cc97142d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -75,7 +75,6 @@ public void close() { } dictionary = null; resultStruct = null; - resultArray = null; } public void reserve(int requiredCapacity) { @@ -650,6 +649,11 @@ public WritableColumnVector getDictionaryIds() { */ protected abstract WritableColumnVector reserveNewColumn(int capacity, DataType type); + protected boolean isArray() { + return type instanceof ArrayType || type instanceof BinaryType || type instanceof StringType || + DecimalType.isByteArrayDecimalType(type); + } + /** * Sets up the common state and also handles creating the child columns if this is a nested * type. @@ -658,8 +662,7 @@ protected WritableColumnVector(int capacity, DataType type) { super(type); this.capacity = capacity; - if (type instanceof ArrayType || type instanceof BinaryType || type instanceof StringType - || DecimalType.isByteArrayDecimalType(type)) { + if (isArray()) { DataType childType; int childCapacity = capacity; if (type instanceof ArrayType) { @@ -670,7 +673,6 @@ protected WritableColumnVector(int capacity, DataType type) { } this.childColumns = new WritableColumnVector[1]; this.childColumns[0] = reserveNewColumn(childCapacity, childType); - this.resultArray = new ColumnarArray(this.childColumns[0]); this.resultStruct = null; } else if (type instanceof StructType) { StructType st = (StructType)type; @@ -678,18 +680,15 @@ protected WritableColumnVector(int capacity, DataType type) { for (int i = 0; i < childColumns.length; ++i) { this.childColumns[i] = reserveNewColumn(capacity, st.fields()[i].dataType()); } - this.resultArray = null; this.resultStruct = new ColumnarRow(this.childColumns); } else if (type instanceof CalendarIntervalType) { // Two columns. Months as int. Microseconds as Long. 
this.childColumns = new WritableColumnVector[2]; this.childColumns[0] = reserveNewColumn(capacity, DataTypes.IntegerType); this.childColumns[1] = reserveNewColumn(capacity, DataTypes.LongType); - this.resultArray = null; this.resultStruct = new ColumnarRow(this.childColumns); } else { this.childColumns = null; - this.resultArray = null; this.resultStruct = null; } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index e28ab710f5a99..54b31cee031f6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -57,7 +57,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { testVector.appendBoolean(i % 2 == 0) } - val array = new ColumnarArray(testVector) + val array = new ColumnarArray(testVector, 0, 10) (0 until 10).foreach { i => assert(array.get(i, BooleanType) === (i % 2 == 0)) @@ -69,7 +69,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { testVector.appendByte(i.toByte) } - val array = new ColumnarArray(testVector) + val array = new ColumnarArray(testVector, 0, 10) (0 until 10).foreach { i => assert(array.get(i, ByteType) === i.toByte) @@ -81,7 +81,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { testVector.appendShort(i.toShort) } - val array = new ColumnarArray(testVector) + val array = new ColumnarArray(testVector, 0, 10) (0 until 10).foreach { i => assert(array.get(i, ShortType) === i.toShort) @@ -93,7 +93,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { testVector.appendInt(i) } - val array = new ColumnarArray(testVector) + val array = new ColumnarArray(testVector, 0, 10) (0 until 10).foreach { i => assert(array.get(i, IntegerType) === i) @@ -105,7 +105,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { testVector.appendLong(i) } - val array = new ColumnarArray(testVector) + val array = new ColumnarArray(testVector, 0, 10) (0 until 10).foreach { i => assert(array.get(i, LongType) === i) @@ -117,7 +117,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { testVector.appendFloat(i.toFloat) } - val array = new ColumnarArray(testVector) + val array = new ColumnarArray(testVector, 0, 10) (0 until 10).foreach { i => assert(array.get(i, FloatType) === i.toFloat) @@ -129,7 +129,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { testVector.appendDouble(i.toDouble) } - val array = new ColumnarArray(testVector) + val array = new ColumnarArray(testVector, 0, 10) (0 until 10).foreach { i => assert(array.get(i, DoubleType) === i.toDouble) @@ -142,7 +142,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { testVector.appendByteArray(utf8, 0, utf8.length) } - val array = new ColumnarArray(testVector) + val array = new ColumnarArray(testVector, 0, 10) (0 until 10).foreach { i => assert(array.get(i, StringType) === UTF8String.fromString(s"str$i")) @@ -155,7 +155,7 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { testVector.appendByteArray(utf8, 0, utf8.length) } - val array = new ColumnarArray(testVector) + val array = new ColumnarArray(testVector, 0, 10) (0 until 10).foreach { i => val utf8 = s"str$i".getBytes("utf8") @@ -191,12 +191,10 @@ class ColumnVectorSuite extends SparkFunSuite with 
BeforeAndAfterEach { testVector.putArray(2, 3, 0) testVector.putArray(3, 3, 3) - val array = new ColumnarArray(testVector) - - assert(array.getArray(0).toIntArray() === Array(0)) - assert(array.getArray(1).toIntArray() === Array(1, 2)) - assert(array.getArray(2).toIntArray() === Array.empty[Int]) - assert(array.getArray(3).toIntArray() === Array(3, 4, 5)) + assert(testVector.getArray(0).toIntArray() === Array(0)) + assert(testVector.getArray(1).toIntArray() === Array(1, 2)) + assert(testVector.getArray(2).toIntArray() === Array.empty[Int]) + assert(testVector.getArray(3).toIntArray() === Array(3, 4, 5)) } val structType: StructType = new StructType().add("int", IntegerType).add("double", DoubleType) @@ -208,12 +206,10 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { c1.putInt(1, 456) c2.putDouble(1, 5.67) - val array = new ColumnarArray(testVector) - - assert(array.getStruct(0, structType.length).get(0, IntegerType) === 123) - assert(array.getStruct(0, structType.length).get(1, DoubleType) === 3.45) - assert(array.getStruct(1, structType.length).get(0, IntegerType) === 456) - assert(array.getStruct(1, structType.length).get(1, DoubleType) === 5.67) + assert(testVector.getStruct(0, structType.length).get(0, IntegerType) === 123) + assert(testVector.getStruct(0, structType.length).get(1, DoubleType) === 3.45) + assert(testVector.getStruct(1, structType.length).get(0, IntegerType) === 456) + assert(testVector.getStruct(1, structType.length).get(1, DoubleType) === 5.67) } test("[SPARK-22092] off-heap column vector reallocation corrupts array data") { @@ -226,9 +222,8 @@ class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { testVector.reserve(16) // Check that none of the values got lost/overwritten. - val array = new ColumnarArray(testVector) (0 until 8).foreach { i => - assert(array.get(i, arrayType).asInstanceOf[ArrayData].toIntArray() === Array(i)) + assert(testVector.getArray(i).toIntArray() === Array(i)) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala index 705b26b8c91e6..38ea2e47fdef8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchBenchmark.scala @@ -23,11 +23,9 @@ import scala.util.Random import org.apache.spark.memory.MemoryMode import org.apache.spark.sql.catalyst.expressions.UnsafeRow -import org.apache.spark.sql.execution.vectorized.ColumnVector import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector -import org.apache.spark.sql.execution.vectorized.WritableColumnVector -import org.apache.spark.sql.types.{BinaryType, DataType, IntegerType} +import org.apache.spark.sql.types.{ArrayType, BinaryType, IntegerType} import org.apache.spark.unsafe.Platform import org.apache.spark.util.Benchmark import org.apache.spark.util.collection.BitSet @@ -265,20 +263,22 @@ object ColumnarBatchBenchmark { } /* - Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz - Int Read/Write: Avg Time(ms) Avg Rate(M/s) Relative Rate - ------------------------------------------------------------------------- - Java Array 248.8 1317.04 1.00 X - ByteBuffer Unsafe 435.6 752.25 0.57 X - ByteBuffer API 1752.0 187.03 0.14 X - DirectByteBuffer 595.4 550.35 0.42 X - Unsafe Buffer 235.2 1393.20 1.06 X - 
Column(on heap) 189.8 1726.45 1.31 X - Column(off heap) 408.4 802.35 0.61 X - Column(off heap direct) 237.6 1379.12 1.05 X - UnsafeRow (on heap) 414.6 790.35 0.60 X - UnsafeRow (off heap) 487.2 672.58 0.51 X - Column On Heap Append 530.1 618.14 0.59 X + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + + Int Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Java Array 177 / 181 1856.4 0.5 1.0X + ByteBuffer Unsafe 318 / 322 1032.0 1.0 0.6X + ByteBuffer API 1411 / 1418 232.2 4.3 0.1X + DirectByteBuffer 467 / 474 701.8 1.4 0.4X + Unsafe Buffer 178 / 185 1843.6 0.5 1.0X + Column(on heap) 178 / 184 1840.8 0.5 1.0X + Column(off heap) 341 / 344 961.8 1.0 0.5X + Column(off heap direct) 178 / 184 1845.4 0.5 1.0X + UnsafeRow (on heap) 378 / 389 866.3 1.2 0.5X + UnsafeRow (off heap) 393 / 402 834.0 1.2 0.4X + Column On Heap Append 309 / 318 1059.1 0.9 0.6X */ val benchmark = new Benchmark("Int Read/Write", count * iters) benchmark.addCase("Java Array")(javaArray) @@ -332,11 +332,13 @@ object ColumnarBatchBenchmark { } }} /* - Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz - Boolean Read/Write: Avg Time(ms) Avg Rate(M/s) Relative Rate - ------------------------------------------------------------------------- - Bitset 895.88 374.54 1.00 X - Byte Array 578.96 579.56 1.55 X + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + + Boolean Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Bitset 726 / 727 462.4 2.2 1.0X + Byte Array 530 / 542 632.7 1.6 1.4X */ benchmark.run() } @@ -387,10 +389,13 @@ object ColumnarBatchBenchmark { } /* - String Read/Write: Avg Time(ms) Avg Rate(M/s) Relative Rate - ------------------------------------------------------------------------------------- - On Heap 457.0 35.85 1.00 X - Off Heap 1206.0 13.59 0.38 X + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + + String Read/Write: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + On Heap 332 / 338 49.3 20.3 1.0X + Off Heap 466 / 467 35.2 28.4 0.7X */ val benchmark = new Benchmark("String Read/Write", count * iters) benchmark.addCase("On Heap")(column(MemoryMode.ON_HEAP)) @@ -398,9 +403,94 @@ object ColumnarBatchBenchmark { benchmark.run } + def arrayAccess(iters: Int): Unit = { + val random = new Random(0) + val count = 4 * 1000 + + val onHeapVector = new OnHeapColumnVector(count, ArrayType(IntegerType)) + val offHeapVector = new OffHeapColumnVector(count, ArrayType(IntegerType)) + + val minSize = 3 + val maxSize = 32 + var arraysCount = 0 + var elementsCount = 0 + while (arraysCount < count) { + val size = random.nextInt(maxSize - minSize) + minSize + val onHeapArrayData = onHeapVector.arrayData() + val offHeapArrayData = offHeapVector.arrayData() + + var i = 0 + while (i < size) { + val value = random.nextInt() + onHeapArrayData.appendInt(value) + offHeapArrayData.appendInt(value) + i += 1 + } + + onHeapVector.putArray(arraysCount, elementsCount, size) + offHeapVector.putArray(arraysCount, elementsCount, size) + elementsCount += size + arraysCount += 1 + } + + def readArrays(onHeap: Boolean): Unit = { + System.gc() + val 
vector = if (onHeap) onHeapVector else offHeapVector + + var sum = 0L + for (_ <- 0 until iters) { + var i = 0 + while (i < count) { + sum += vector.getArray(i).numElements() + i += 1 + } + } + } + + def readArrayElements(onHeap: Boolean): Unit = { + System.gc() + val vector = if (onHeap) onHeapVector else offHeapVector + + var sum = 0L + for (_ <- 0 until iters) { + var i = 0 + while (i < count) { + val array = vector.getArray(i) + val size = array.numElements() + var j = 0 + while (j < size) { + sum += array.getInt(j) + j += 1 + } + i += 1 + } + } + } + + val benchmark = new Benchmark("Array Vector Read", count * iters) + benchmark.addCase("On Heap Read Size Only") { _ => readArrays(true) } + benchmark.addCase("Off Heap Read Size Only") { _ => readArrays(false) } + benchmark.addCase("On Heap Read Elements") { _ => readArrayElements(true) } + benchmark.addCase("Off Heap Read Elements") { _ => readArrayElements(false) } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz + + Array Vector Read: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + On Heap Read Size Only 415 / 422 394.7 2.5 1.0X + Off Heap Read Size Only 394 / 402 415.9 2.4 1.1X + On Heap Read Elements 2558 / 2593 64.0 15.6 0.2X + Off Heap Read Elements 3316 / 3317 49.4 20.2 0.1X + */ + benchmark.run + } + def main(args: Array[String]): Unit = { intAccess(1024 * 40) booleanAccess(1024 * 40) stringAccess(1024 * 4) + arrayAccess(1024 * 40) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 1b4e2bad09a20..0ae4f2d117609 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -645,26 +645,26 @@ class ColumnarBatchSuite extends SparkFunSuite { column.putArray(2, 2, 0) column.putArray(3, 3, 3) - val a1 = ColumnVectorUtils.toPrimitiveJavaArray(column.getArray(0)).asInstanceOf[Array[Int]] - val a2 = ColumnVectorUtils.toPrimitiveJavaArray(column.getArray(1)).asInstanceOf[Array[Int]] - val a3 = ColumnVectorUtils.toPrimitiveJavaArray(column.getArray(2)).asInstanceOf[Array[Int]] - val a4 = ColumnVectorUtils.toPrimitiveJavaArray(column.getArray(3)).asInstanceOf[Array[Int]] + val a1 = ColumnVectorUtils.toJavaIntArray(column.getArray(0)) + val a2 = ColumnVectorUtils.toJavaIntArray(column.getArray(1)) + val a3 = ColumnVectorUtils.toJavaIntArray(column.getArray(2)) + val a4 = ColumnVectorUtils.toJavaIntArray(column.getArray(3)) assert(a1 === Array(0)) assert(a2 === Array(1, 2)) assert(a3 === Array.empty[Int]) assert(a4 === Array(3, 4, 5)) // Verify the ArrayData APIs - assert(column.getArray(0).length == 1) + assert(column.getArray(0).numElements() == 1) assert(column.getArray(0).getInt(0) == 0) - assert(column.getArray(1).length == 2) + assert(column.getArray(1).numElements() == 2) assert(column.getArray(1).getInt(0) == 1) assert(column.getArray(1).getInt(1) == 2) - assert(column.getArray(2).length == 0) + assert(column.getArray(2).numElements() == 0) - assert(column.getArray(3).length == 3) + assert(column.getArray(3).numElements() == 3) assert(column.getArray(3).getInt(0) == 3) assert(column.getArray(3).getInt(1) == 4) assert(column.getArray(3).getInt(2) == 5) @@ -677,7 +677,7 @@ 
class ColumnarBatchSuite extends SparkFunSuite { assert(data.capacity == array.length * 2) data.putInts(0, array.length, array, 0) column.putArray(0, 0, array.length) - assert(ColumnVectorUtils.toPrimitiveJavaArray(column.getArray(0)).asInstanceOf[Array[Int]] + assert(ColumnVectorUtils.toJavaIntArray(column.getArray(0)) === array) } From 6eb203fae7bbc9940710da40f314b89ffb4dd324 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Fri, 1 Dec 2017 01:21:52 +0900 Subject: [PATCH 013/356] [SPARK-22654][TESTS] Retry Spark tarball download if failed in HiveExternalCatalogVersionsSuite ## What changes were proposed in this pull request? Adds a simple loop to retry download of Spark tarballs from different mirrors if the download fails. ## How was this patch tested? Existing tests Author: Sean Owen Closes #19851 from srowen/SPARK-22654. --- .../HiveExternalCatalogVersionsSuite.scala | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index 6859432c406a9..a3d5b941a6761 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -20,6 +20,8 @@ package org.apache.spark.sql.hive import java.io.File import java.nio.file.Files +import scala.sys.process._ + import org.apache.spark.TestUtils import org.apache.spark.sql.{QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier @@ -50,14 +52,24 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { super.afterAll() } - private def downloadSpark(version: String): Unit = { - import scala.sys.process._ + private def tryDownloadSpark(version: String, path: String): Unit = { + // Try mirrors a few times until one succeeds + for (i <- 0 until 3) { + val preferredMirror = + Seq("wget", "https://www.apache.org/dyn/closer.lua?preferred=true", "-q", "-O", "-").!!.trim + val url = s"$preferredMirror/spark/spark-$version/spark-$version-bin-hadoop2.7.tgz" + logInfo(s"Downloading Spark $version from $url") + if (Seq("wget", url, "-q", "-P", path).! == 0) { + return + } + logWarning(s"Failed to download Spark $version from $url") + } + fail(s"Unable to download Spark $version") + } - val preferredMirror = - Seq("wget", "https://www.apache.org/dyn/closer.lua?preferred=true", "-q", "-O", "-").!!.trim - val url = s"$preferredMirror/spark/spark-$version/spark-$version-bin-hadoop2.7.tgz" - Seq("wget", url, "-q", "-P", sparkTestingDir.getCanonicalPath).! + private def downloadSpark(version: String): Unit = { + tryDownloadSpark(version, sparkTestingDir.getCanonicalPath) val downloaded = new File(sparkTestingDir, s"spark-$version-bin-hadoop2.7.tgz").getCanonicalPath val targetDir = new File(sparkTestingDir, s"spark-$version").getCanonicalPath From 932bd09c80dc2dc113e94f59f4dcb77e77de7c58 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 1 Dec 2017 01:24:15 +0900 Subject: [PATCH 014/356] [SPARK-22635][SQL][ORC] FileNotFoundException while reading ORC files containing special characters ## What changes were proposed in this pull request? SPARK-22146 fix the FileNotFoundException issue only for the `inferSchema` method, ie. only for the schema inference, but it doesn't fix the problem when actually reading the data. Thus nearly the same exception happens when someone tries to use the data. 
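(For illustration only — the sample path below is made up, not taken from the JIRA: the paths handed to the reader are URI-encoded strings, so special characters have to be decoded through `java.net.URI` before they are used as on-disk paths, which is what the `new Path(new URI(file.filePath))` change in the diff below does.)

```java
// Minimal sketch of the decoding step; the sample path is hypothetical.
import java.net.URI;

public class DecodePathSketch {
  public static void main(String[] args) {
    String encoded = "file:/tmp/table/col%3D1/part-0.orc"; // '=' percent-encoded as %3D
    String decoded = URI.create(encoded).getPath();        // "/tmp/table/col=1/part-0.orc"
    System.out.println(decoded);
  }
}
```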
This PR covers fixing the problem also there. ## How was this patch tested? enhanced UT Author: Marco Gaido Closes #19844 from mgaido91/SPARK-22635. --- .../org/apache/spark/sql/hive/orc/OrcFileFormat.scala | 11 +++++------ .../spark/sql/hive/MetastoreDataSourcesSuite.scala | 3 ++- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 3b33a9ff082f3..95741c7b30289 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -133,10 +133,12 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable (file: PartitionedFile) => { val conf = broadcastedHadoopConf.value.value + val filePath = new Path(new URI(file.filePath)) + // SPARK-8501: Empty ORC files always have an empty schema stored in their footer. In this // case, `OrcFileOperator.readSchema` returns `None`, and we can't read the underlying file // using the given physical schema. Instead, we simply return an empty iterator. - val isEmptyFile = OrcFileOperator.readSchema(Seq(file.filePath), Some(conf)).isEmpty + val isEmptyFile = OrcFileOperator.readSchema(Seq(filePath.toString), Some(conf)).isEmpty if (isEmptyFile) { Iterator.empty } else { @@ -146,15 +148,12 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable val job = Job.getInstance(conf) FileInputFormat.setInputPaths(job, file.filePath) - val fileSplit = new FileSplit( - new Path(new URI(file.filePath)), file.start, file.length, Array.empty - ) + val fileSplit = new FileSplit(filePath, file.start, file.length, Array.empty) // Custom OrcRecordReader is used to get // ObjectInspector during recordReader creation itself and can // avoid NameNode call in unwrapOrcStructs per file. // Specifically would be helpful for partitioned datasets. - val orcReader = OrcFile.createReader( - new Path(new URI(file.filePath)), OrcFile.readerOptions(conf)) + val orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf)) new SparkOrcNewRecordReader(orcReader, conf, fileSplit.getStart, fileSplit.getLength) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index a1060476f2211..c8caba83bf365 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -1350,7 +1350,8 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv withTempDir { dir => val tmpFile = s"$dir/$nameWithSpecialChars" spark.createDataset(Seq("a", "b")).write.format(format).save(tmpFile) - spark.read.format(format).load(tmpFile) + val fileContent = spark.read.format(format).load(tmpFile) + checkAnswer(fileContent, Seq(Row("a"), Row("b"))) } } } From 999ec137a97844abbbd483dd98c7ded2f8ff356c Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Fri, 1 Dec 2017 02:28:24 +0800 Subject: [PATCH 015/356] [SPARK-22570][SQL] Avoid to create a lot of global variables by using a local variable with allocation of an object in generated code ## What changes were proposed in this pull request? 
This PR reduces # of global variables in generated code by replacing a global variable with a local variable with an allocation of an object every time. When a lot of global variables were generated, the generated code may meet 64K constant pool limit. This PR reduces # of generated global variables in the following three operations: * `Cast` with String to primitive byte/short/int/long * `RegExpReplace` * `CreateArray` I intentionally leave [this part](https://github.com/apache/spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala#L595-L603). This is because this variable keeps a class that is dynamically generated. In other word, it is not possible to reuse one class. ## How was this patch tested? Added test cases Author: Kazuaki Ishizaki Closes #19797 from kiszk/SPARK-22570. --- .../spark/sql/catalyst/expressions/Cast.scala | 24 ++++++------- .../expressions/complexTypeCreator.scala | 36 +++++++++++-------- .../expressions/regexpExpressions.scala | 8 ++--- .../sql/catalyst/expressions/CastSuite.scala | 8 +++++ .../expressions/RegexpExpressionsSuite.scala | 11 +++++- .../optimizer/complexTypesSuite.scala | 7 ++++ 6 files changed, 61 insertions(+), 33 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 8cafaef61c7d1..f4ecbdb8393ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -799,16 +799,16 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String private[this] def castToByteCode(from: DataType, ctx: CodegenContext): CastFunction = from match { case StringType => - val wrapper = ctx.freshName("wrapper") - ctx.addMutableState("UTF8String.IntWrapper", wrapper, - s"$wrapper = new UTF8String.IntWrapper();") + val wrapper = ctx.freshName("intWrapper") (c, evPrim, evNull) => s""" + UTF8String.IntWrapper $wrapper = new UTF8String.IntWrapper(); if ($c.toByte($wrapper)) { $evPrim = (byte) $wrapper.value; } else { $evNull = true; } + $wrapper = null; """ case BooleanType => (c, evPrim, evNull) => s"$evPrim = $c ? (byte) 1 : (byte) 0;" @@ -826,16 +826,16 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String from: DataType, ctx: CodegenContext): CastFunction = from match { case StringType => - val wrapper = ctx.freshName("wrapper") - ctx.addMutableState("UTF8String.IntWrapper", wrapper, - s"$wrapper = new UTF8String.IntWrapper();") + val wrapper = ctx.freshName("intWrapper") (c, evPrim, evNull) => s""" + UTF8String.IntWrapper $wrapper = new UTF8String.IntWrapper(); if ($c.toShort($wrapper)) { $evPrim = (short) $wrapper.value; } else { $evNull = true; } + $wrapper = null; """ case BooleanType => (c, evPrim, evNull) => s"$evPrim = $c ? 
(short) 1 : (short) 0;" @@ -851,16 +851,16 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String private[this] def castToIntCode(from: DataType, ctx: CodegenContext): CastFunction = from match { case StringType => - val wrapper = ctx.freshName("wrapper") - ctx.addMutableState("UTF8String.IntWrapper", wrapper, - s"$wrapper = new UTF8String.IntWrapper();") + val wrapper = ctx.freshName("intWrapper") (c, evPrim, evNull) => s""" + UTF8String.IntWrapper $wrapper = new UTF8String.IntWrapper(); if ($c.toInt($wrapper)) { $evPrim = $wrapper.value; } else { $evNull = true; } + $wrapper = null; """ case BooleanType => (c, evPrim, evNull) => s"$evPrim = $c ? 1 : 0;" @@ -876,17 +876,17 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String private[this] def castToLongCode(from: DataType, ctx: CodegenContext): CastFunction = from match { case StringType => - val wrapper = ctx.freshName("wrapper") - ctx.addMutableState("UTF8String.LongWrapper", wrapper, - s"$wrapper = new UTF8String.LongWrapper();") + val wrapper = ctx.freshName("longWrapper") (c, evPrim, evNull) => s""" + UTF8String.LongWrapper $wrapper = new UTF8String.LongWrapper(); if ($c.toLong($wrapper)) { $evPrim = $wrapper.value; } else { $evNull = true; } + $wrapper = null; """ case BooleanType => (c, evPrim, evNull) => s"$evPrim = $c ? 1L : 0L;" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 57a7f2e207738..fc68bf478e1c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -63,7 +63,7 @@ case class CreateArray(children: Seq[Expression]) extends Expression { val (preprocess, assigns, postprocess, arrayData) = GenArrayData.genCodeToCreateArrayData(ctx, et, evals, false) ev.copy( - code = preprocess + ctx.splitExpressions(assigns) + postprocess, + code = preprocess + assigns + postprocess, value = arrayData, isNull = "false") } @@ -77,24 +77,22 @@ private [sql] object GenArrayData { * * @param ctx a [[CodegenContext]] * @param elementType data type of underlying array elements - * @param elementsCode a set of [[ExprCode]] for each element of an underlying array + * @param elementsCode concatenated set of [[ExprCode]] for each element of an underlying array * @param isMapKey if true, throw an exception when the element is null - * @return (code pre-assignments, assignments to each array elements, code post-assignments, - * arrayData name) + * @return (code pre-assignments, concatenated assignments to each array elements, + * code post-assignments, arrayData name) */ def genCodeToCreateArrayData( ctx: CodegenContext, elementType: DataType, elementsCode: Seq[ExprCode], - isMapKey: Boolean): (String, Seq[String], String, String) = { - val arrayName = ctx.freshName("array") + isMapKey: Boolean): (String, String, String, String) = { val arrayDataName = ctx.freshName("arrayData") val numElements = elementsCode.length if (!ctx.isPrimitiveType(elementType)) { + val arrayName = ctx.freshName("arrayObject") val genericArrayClass = classOf[GenericArrayData].getName - ctx.addMutableState("Object[]", arrayName, - s"$arrayName = new Object[$numElements];") val assignments = elementsCode.zipWithIndex.map { case (eval, i) => val isNullAssignment = if (!isMapKey) { @@ -110,17 +108,21 @@ 
private [sql] object GenArrayData { } """ } + val assignmentString = ctx.splitExpressions( + expressions = assignments, + funcName = "apply", + extraArguments = ("Object[]", arrayDataName) :: Nil) - ("", - assignments, + (s"Object[] $arrayName = new Object[$numElements];", + assignmentString, s"final ArrayData $arrayDataName = new $genericArrayClass($arrayName);", arrayDataName) } else { + val arrayName = ctx.freshName("array") val unsafeArraySizeInBytes = UnsafeArrayData.calculateHeaderPortionInBytes(numElements) + ByteArrayMethods.roundNumberOfBytesToNearestWord(elementType.defaultSize * numElements) val baseOffset = Platform.BYTE_ARRAY_OFFSET - ctx.addMutableState("UnsafeArrayData", arrayDataName) val primitiveValueTypeName = ctx.primitiveTypeName(elementType) val assignments = elementsCode.zipWithIndex.map { case (eval, i) => @@ -137,14 +139,18 @@ private [sql] object GenArrayData { } """ } + val assignmentString = ctx.splitExpressions( + expressions = assignments, + funcName = "apply", + extraArguments = ("UnsafeArrayData", arrayDataName) :: Nil) (s""" byte[] $arrayName = new byte[$unsafeArraySizeInBytes]; - $arrayDataName = new UnsafeArrayData(); + UnsafeArrayData $arrayDataName = new UnsafeArrayData(); Platform.putLong($arrayName, $baseOffset, $numElements); $arrayDataName.pointTo($arrayName, $baseOffset, $unsafeArraySizeInBytes); """, - assignments, + assignmentString, "", arrayDataName) } @@ -216,10 +222,10 @@ case class CreateMap(children: Seq[Expression]) extends Expression { s""" final boolean ${ev.isNull} = false; $preprocessKeyData - ${ctx.splitExpressions(assignKeys)} + $assignKeys $postprocessKeyData $preprocessValueData - ${ctx.splitExpressions(assignValues)} + $assignValues $postprocessValueData final MapData ${ev.value} = new $mapClass($keyArrayData, $valueArrayData); """ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index d0d663f63f5db..53d7096dd87d3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -321,8 +321,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio val termLastReplacement = ctx.freshName("lastReplacement") val termLastReplacementInUTF8 = ctx.freshName("lastReplacementInUTF8") - - val termResult = ctx.freshName("result") + val termResult = ctx.freshName("termResult") val classNamePattern = classOf[Pattern].getCanonicalName val classNameStringBuffer = classOf[java.lang.StringBuffer].getCanonicalName @@ -334,8 +333,6 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio ctx.addMutableState("String", termLastReplacement, s"${termLastReplacement} = null;") ctx.addMutableState("UTF8String", termLastReplacementInUTF8, s"${termLastReplacementInUTF8} = null;") - ctx.addMutableState(classNameStringBuffer, - termResult, s"${termResult} = new $classNameStringBuffer();") val setEvNotNull = if (nullable) { s"${ev.isNull} = false;" @@ -355,7 +352,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio ${termLastReplacementInUTF8} = $rep.clone(); ${termLastReplacement} = ${termLastReplacementInUTF8}.toString(); } - ${termResult}.delete(0, ${termResult}.length()); + $classNameStringBuffer ${termResult} = new $classNameStringBuffer(); java.util.regex.Matcher ${matcher} = 
${termPattern}.matcher($subject.toString()); while (${matcher}.find()) { @@ -363,6 +360,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio } ${matcher}.appendTail(${termResult}); ${ev.value} = UTF8String.fromString(${termResult}.toString()); + ${termResult} = null; $setEvNotNull """ }) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 7837d6529d12b..65617be05a434 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -23,6 +23,7 @@ import java.util.{Calendar, Locale, TimeZone} import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils.TimeZoneGMT @@ -845,4 +846,11 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { val outputOuter = Row.fromSeq((1 to N).map(_ => outputInner)) checkEvaluation(cast(Literal.create(inputOuter, fromOuter), toOuter), outputOuter) } + + test("SPARK-22570: Cast should not create a lot of global variables") { + val ctx = new CodegenContext + cast("1", IntegerType).genCode(ctx) + cast("2", LongType).genCode(ctx) + assert(ctx.mutableStates.length == 0) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 1ce150e091981..4fa61fbaf66c2 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -20,7 +20,8 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.types.{IntegerType, StringType} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext +import org.apache.spark.sql.types.StringType /** * Unit tests for regular expression (regexp) related SQL expressions. 
@@ -178,6 +179,14 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(nonNullExpr, "num-num", row1) } + test("SPARK-22570: RegExpReplace should not create a lot of global variables") { + val ctx = new CodegenContext + RegExpReplace(Literal("100"), Literal("(\\d+)"), Literal("num")).genCode(ctx) + // four global variables (lastRegex, pattern, lastReplacement, and lastReplacementInUTF8) + // are always required + assert(ctx.mutableStates.length == 4) + } + test("RegexExtract") { val row1 = create_row("100-200", "(\\d+)-(\\d+)", 1) val row2 = create_row("100-200", "(\\d+)-(\\d+)", 2) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala index 3634accf1ec21..e3675367d78e4 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Range} import org.apache.spark.sql.catalyst.rules.RuleExecutor @@ -164,6 +165,12 @@ class ComplexTypesSuite extends PlanTest{ comparePlans(Optimizer execute query, expected) } + test("SPARK-22570: CreateArray should not create a lot of global variables") { + val ctx = new CodegenContext + CreateArray(Seq(Literal(1))).genCode(ctx) + assert(ctx.mutableStates.length == 0) + } + test("simplify map ops") { val rel = relation .select( From 6ac57fd0d1c82b834eb4bf0dd57596b92a99d6de Mon Sep 17 00:00:00 2001 From: aokolnychyi Date: Thu, 30 Nov 2017 14:25:10 -0800 Subject: [PATCH 016/356] [SPARK-21417][SQL] Infer join conditions using propagated constraints ## What changes were proposed in this pull request? This PR adds an optimization rule that infers join conditions using propagated constraints. For instance, if there is a join, where the left relation has 'a = 1' and the right relation has 'b = 1', then the rule infers 'a = b' as a join predicate. Only semantically new predicates are appended to the existing join condition. Refer to the corresponding ticket and tests for more details. ## How was this patch tested? This patch comes with a new test suite to cover the implemented logic. Author: aokolnychyi Closes #18692 from aokolnychyi/spark-21417. 
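As a rough sketch of the intended effect (illustrative DataFrame code, not part of this patch), a cross join whose children both carry an equality constraint on the same literal should now be planned as an inner join on the inferred condition:
```scala
// Illustrative only. With constraint propagation enabled (the default), the left side
// carries the constraint a = 1 and the right side carries b = 1, so EliminateCrossJoin
// can infer a = b and rewrite the Cross join into an Inner join.
val left = spark.range(100).selectExpr("id AS a").where("a = 1")
val right = spark.range(100).selectExpr("id AS b").where("b = 1")
left.crossJoin(right).explain(true)
// The optimized plan is expected to show an Inner join on (a = b) instead of a cross join.
```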
--- .../expressions/EquivalentExpressionMap.scala | 66 +++++ .../catalyst/expressions/ExpressionSet.scala | 2 + .../sql/catalyst/optimizer/Optimizer.scala | 1 + .../spark/sql/catalyst/optimizer/joins.scala | 60 +++++ .../EquivalentExpressionMapSuite.scala | 56 +++++ .../optimizer/EliminateCrossJoinSuite.scala | 238 ++++++++++++++++++ 6 files changed, 423 insertions(+) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMap.scala create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMapSuite.scala create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateCrossJoinSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMap.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMap.scala new file mode 100644 index 0000000000000..cf1614afb1a76 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMap.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import scala.collection.mutable + +import org.apache.spark.sql.catalyst.expressions.EquivalentExpressionMap.SemanticallyEqualExpr + +/** + * A class that allows you to map an expression into a set of equivalent expressions. The keys are + * handled based on their semantic meaning and ignoring cosmetic differences. The values are + * represented as [[ExpressionSet]]s. + * + * The underlying representation of keys depends on the [[Expression.semanticHash]] and + * [[Expression.semanticEquals]] methods. 
+ * + * {{{ + * val map = new EquivalentExpressionMap() + * + * map.put(1 + 2, a) + * map.put(rand(), b) + * + * map.get(2 + 1) => Set(a) // 1 + 2 and 2 + 1 are semantically equivalent + * map.get(1 + 2) => Set(a) // 1 + 2 and 2 + 1 are semantically equivalent + * map.get(rand()) => Set() // non-deterministic expressions are not equivalent + * }}} + */ +class EquivalentExpressionMap { + + private val equivalenceMap = mutable.HashMap.empty[SemanticallyEqualExpr, ExpressionSet] + + def put(expression: Expression, equivalentExpression: Expression): Unit = { + val equivalentExpressions = equivalenceMap.getOrElseUpdate(expression, ExpressionSet.empty) + equivalenceMap(expression) = equivalentExpressions + equivalentExpression + } + + def get(expression: Expression): Set[Expression] = + equivalenceMap.getOrElse(expression, ExpressionSet.empty) +} + +object EquivalentExpressionMap { + + private implicit class SemanticallyEqualExpr(val expr: Expression) { + override def equals(obj: Any): Boolean = obj match { + case other: SemanticallyEqualExpr => expr.semanticEquals(other.expr) + case _ => false + } + + override def hashCode: Int = expr.semanticHash() + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala index 7e8e7b8cd5f18..e9890837af07d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala @@ -27,6 +27,8 @@ object ExpressionSet { expressions.foreach(set.add) set } + + val empty: ExpressionSet = ExpressionSet(Nil) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 0d961bf2e6e5e..8a5c486912abf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -87,6 +87,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) PushProjectionThroughUnion, ReorderJoin, EliminateOuterJoin, + EliminateCrossJoin, InferFiltersFromConstraints, BooleanSimplification, PushPredicateThroughJoin, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala index edbeaf273fd6f..29a3a7f109b80 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.optimizer import scala.annotation.tailrec +import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.ExtractFiltersAndInnerJoins @@ -152,3 +153,62 @@ object EliminateOuterJoin extends Rule[LogicalPlan] with PredicateHelper { if (j.joinType == newJoinType) f else Filter(condition, j.copy(joinType = newJoinType)) } } + +/** + * A rule that eliminates CROSS joins by inferring join conditions from propagated constraints. + * + * The optimization is applicable only to CROSS joins. 
For other join types, adding inferred join + * conditions would potentially shuffle children as child node's partitioning won't satisfy the JOIN + * node's requirements which otherwise could have. + * + * For instance, given a CROSS join with the constraint 'a = 1' from the left child and the + * constraint 'b = 1' from the right child, this rule infers a new join predicate 'a = b' and + * converts it to an Inner join. + */ +object EliminateCrossJoin extends Rule[LogicalPlan] with PredicateHelper { + + def apply(plan: LogicalPlan): LogicalPlan = { + if (SQLConf.get.constraintPropagationEnabled) { + eliminateCrossJoin(plan) + } else { + plan + } + } + + private def eliminateCrossJoin(plan: LogicalPlan): LogicalPlan = plan transform { + case join @ Join(leftPlan, rightPlan, Cross, None) => + val leftConstraints = join.constraints.filter(_.references.subsetOf(leftPlan.outputSet)) + val rightConstraints = join.constraints.filter(_.references.subsetOf(rightPlan.outputSet)) + val inferredJoinPredicates = inferJoinPredicates(leftConstraints, rightConstraints) + val joinConditionOpt = inferredJoinPredicates.reduceOption(And) + if (joinConditionOpt.isDefined) Join(leftPlan, rightPlan, Inner, joinConditionOpt) else join + } + + private def inferJoinPredicates( + leftConstraints: Set[Expression], + rightConstraints: Set[Expression]): mutable.Set[EqualTo] = { + + val equivalentExpressionMap = new EquivalentExpressionMap() + + leftConstraints.foreach { + case EqualTo(attr: Attribute, expr: Expression) => + equivalentExpressionMap.put(expr, attr) + case EqualTo(expr: Expression, attr: Attribute) => + equivalentExpressionMap.put(expr, attr) + case _ => + } + + val joinConditions = mutable.Set.empty[EqualTo] + + rightConstraints.foreach { + case EqualTo(attr: Attribute, expr: Expression) => + joinConditions ++= equivalentExpressionMap.get(expr).map(EqualTo(attr, _)) + case EqualTo(expr: Expression, attr: Attribute) => + joinConditions ++= equivalentExpressionMap.get(expr).map(EqualTo(attr, _)) + case _ => + } + + joinConditions + } + +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMapSuite.scala new file mode 100644 index 0000000000000..bad7e17bb6cf2 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMapSuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.dsl.expressions._ + +class EquivalentExpressionMapSuite extends SparkFunSuite { + + private val onePlusTwo = Literal(1) + Literal(2) + private val twoPlusOne = Literal(2) + Literal(1) + private val rand = Rand(10) + + test("behaviour of the equivalent expression map") { + val equivalentExpressionMap = new EquivalentExpressionMap() + equivalentExpressionMap.put(onePlusTwo, 'a) + equivalentExpressionMap.put(Literal(1) + Literal(3), 'b) + equivalentExpressionMap.put(rand, 'c) + + // 1 + 2 should be equivalent to 2 + 1 + assertResult(ExpressionSet(Seq('a)))(equivalentExpressionMap.get(twoPlusOne)) + // non-deterministic expressions should not be equivalent + assertResult(ExpressionSet.empty)(equivalentExpressionMap.get(rand)) + + // if the same (key, value) is added several times, the map still returns only one entry + equivalentExpressionMap.put(onePlusTwo, 'a) + equivalentExpressionMap.put(twoPlusOne, 'a) + assertResult(ExpressionSet(Seq('a)))(equivalentExpressionMap.get(twoPlusOne)) + + // get several equivalent attributes + equivalentExpressionMap.put(onePlusTwo, 'e) + assertResult(ExpressionSet(Seq('a, 'e)))(equivalentExpressionMap.get(onePlusTwo)) + assertResult(2)(equivalentExpressionMap.get(onePlusTwo).size) + + // several non-deterministic expressions should not be equivalent + equivalentExpressionMap.put(rand, 'd) + assertResult(ExpressionSet.empty)(equivalentExpressionMap.get(rand)) + assertResult(0)(equivalentExpressionMap.get(rand).size) + } + +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateCrossJoinSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateCrossJoinSuite.scala new file mode 100644 index 0000000000000..e04dd28ee36a0 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateCrossJoinSuite.scala @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, Literal, Not, Rand} +import org.apache.spark.sql.catalyst.plans.{Cross, Inner, JoinType, PlanTest} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.internal.SQLConf.CONSTRAINT_PROPAGATION_ENABLED +import org.apache.spark.sql.types.IntegerType + +class EliminateCrossJoinSuite extends PlanTest { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = + Batch("Eliminate cross joins", FixedPoint(10), + EliminateCrossJoin, + PushPredicateThroughJoin) :: Nil + } + + val testRelation1 = LocalRelation('a.int, 'b.int) + val testRelation2 = LocalRelation('c.int, 'd.int) + + test("successful elimination of cross joins (1)") { + checkJoinOptimization( + originalFilter = 'a === 1 && 'c === 1 && 'd === 1, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = 'a === 1, + expectedRightRelationFilter = 'c === 1 && 'd === 1, + expectedJoinType = Inner, + expectedJoinCondition = Some('a === 'c && 'a === 'd)) + } + + test("successful elimination of cross joins (2)") { + checkJoinOptimization( + originalFilter = 'a === 1 && 'b === 2 && 'd === 1, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = 'a === 1 && 'b === 2, + expectedRightRelationFilter = 'd === 1, + expectedJoinType = Inner, + expectedJoinCondition = Some('a === 'd)) + } + + test("successful elimination of cross joins (3)") { + // PushPredicateThroughJoin will push 'd === 'a into the join condition + // EliminateCrossJoin will NOT apply because the condition will be already present + // therefore, the join type will stay the same (i.e., CROSS) + checkJoinOptimization( + originalFilter = 'a === 1 && Literal(1) === 'd && 'd === 'a, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = 'a === 1, + expectedRightRelationFilter = Literal(1) === 'd, + expectedJoinType = Cross, + expectedJoinCondition = Some('a === 'd)) + } + + test("successful elimination of cross joins (4)") { + // Literal(1) * Literal(2) and Literal(2) * Literal(1) are semantically equal + checkJoinOptimization( + originalFilter = 'a === Literal(1) * Literal(2) && Literal(2) * Literal(1) === 'c, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = 'a === Literal(1) * Literal(2), + expectedRightRelationFilter = Literal(2) * Literal(1) === 'c, + expectedJoinType = Inner, + expectedJoinCondition = Some('a === 'c)) + } + + test("successful elimination of cross joins (5)") { + checkJoinOptimization( + originalFilter = 'a === 1 && Literal(1) === 'a && 'c === 1, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = 'a === 1 && Literal(1) === 'a, + expectedRightRelationFilter = 'c === 1, + expectedJoinType = Inner, + expectedJoinCondition = Some('a === 'c)) + } + + test("successful elimination of cross joins (6)") { + checkJoinOptimization( + originalFilter = 'a === Cast("1", IntegerType) && 'c === Cast("1", IntegerType) && 'd === 1, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + 
expectedLeftRelationFilter = 'a === Cast("1", IntegerType), + expectedRightRelationFilter = 'c === Cast("1", IntegerType) && 'd === 1, + expectedJoinType = Inner, + expectedJoinCondition = Some('a === 'c)) + } + + test("successful elimination of cross joins (7)") { + // The join condition appears due to PushPredicateThroughJoin + checkJoinOptimization( + originalFilter = (('a >= 1 && 'c === 1) || 'd === 10) && 'b === 10 && 'c === 1, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = 'b === 10, + expectedRightRelationFilter = 'c === 1, + expectedJoinType = Cross, + expectedJoinCondition = Some(('a >= 1 && 'c === 1) || 'd === 10)) + } + + test("successful elimination of cross joins (8)") { + checkJoinOptimization( + originalFilter = 'a === 1 && 'c === 1 && Literal(1) === 'a && Literal(1) === 'c, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = 'a === 1 && Literal(1) === 'a, + expectedRightRelationFilter = 'c === 1 && Literal(1) === 'c, + expectedJoinType = Inner, + expectedJoinCondition = Some('a === 'c)) + } + + test("inability to detect join conditions when constant propagation is disabled") { + withSQLConf(CONSTRAINT_PROPAGATION_ENABLED.key -> "false") { + checkJoinOptimization( + originalFilter = 'a === 1 && 'c === 1 && 'd === 1, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = 'a === 1, + expectedRightRelationFilter = 'c === 1 && 'd === 1, + expectedJoinType = Cross, + expectedJoinCondition = None) + } + } + + test("inability to detect join conditions (1)") { + checkJoinOptimization( + originalFilter = 'a >= 1 && 'c === 1 && 'd >= 1, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = 'a >= 1, + expectedRightRelationFilter = 'c === 1 && 'd >= 1, + expectedJoinType = Cross, + expectedJoinCondition = None) + } + + test("inability to detect join conditions (2)") { + checkJoinOptimization( + originalFilter = Literal(1) === 'b && ('c === 1 || 'd === 1), + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = Literal(1) === 'b, + expectedRightRelationFilter = 'c === 1 || 'd === 1, + expectedJoinType = Cross, + expectedJoinCondition = None) + } + + test("inability to detect join conditions (3)") { + checkJoinOptimization( + originalFilter = Literal(1) === 'b && 'c === 1, + originalJoinType = Cross, + originalJoinCondition = Some('c === 'b), + expectedFilter = None, + expectedLeftRelationFilter = Literal(1) === 'b, + expectedRightRelationFilter = 'c === 1, + expectedJoinType = Cross, + expectedJoinCondition = Some('c === 'b)) + } + + test("inability to detect join conditions (4)") { + checkJoinOptimization( + originalFilter = Not('a === 1) && 'd === 1, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = None, + expectedLeftRelationFilter = Not('a === 1), + expectedRightRelationFilter = 'd === 1, + expectedJoinType = Cross, + expectedJoinCondition = None) + } + + test("inability to detect join conditions (5)") { + checkJoinOptimization( + originalFilter = 'a === Rand(10) && 'b === 1 && 'd === Rand(10) && 'c === 3, + originalJoinType = Cross, + originalJoinCondition = None, + expectedFilter = Some('a === Rand(10) && 'd === Rand(10)), + expectedLeftRelationFilter = 'b === 1, + expectedRightRelationFilter = 'c === 3, + expectedJoinType = Cross, + 
expectedJoinCondition = None) + } + + private def checkJoinOptimization( + originalFilter: Expression, + originalJoinType: JoinType, + originalJoinCondition: Option[Expression], + expectedFilter: Option[Expression], + expectedLeftRelationFilter: Expression, + expectedRightRelationFilter: Expression, + expectedJoinType: JoinType, + expectedJoinCondition: Option[Expression]): Unit = { + + val originalQuery = testRelation1 + .join(testRelation2, originalJoinType, originalJoinCondition) + .where(originalFilter) + val optimizedQuery = Optimize.execute(originalQuery.analyze) + + val left = testRelation1.where(expectedLeftRelationFilter) + val right = testRelation2.where(expectedRightRelationFilter) + val join = left.join(right, expectedJoinType, expectedJoinCondition) + val expectedQuery = expectedFilter.fold(join)(join.where(_)).analyze + + comparePlans(optimizedQuery, expectedQuery) + } +} From bcceab649510a45f4c4b8e44b157c9987adff6f4 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Thu, 30 Nov 2017 15:36:26 -0800 Subject: [PATCH 017/356] [SPARK-22489][SQL] Shouldn't change broadcast join buildSide if user clearly specified ## What changes were proposed in this pull request? How to reproduce: ```scala import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value").createTempView("table1") spark.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value").createTempView("table2") val bl = sql("SELECT /*+ MAPJOIN(t1) */ * FROM table1 t1 JOIN table2 t2 ON t1.key = t2.key").queryExecution.executedPlan println(bl.children.head.asInstanceOf[BroadcastHashJoinExec].buildSide) ``` The result is `BuildRight`, but should be `BuildLeft`. This PR fix this issue. ## How was this patch tested? unit tests Author: Yuming Wang Closes #19714 from wangyum/SPARK-22489. --- docs/sql-programming-guide.md | 58 ++++++++++++++++ .../spark/sql/execution/SparkStrategies.scala | 67 +++++++++++++----- .../execution/joins/BroadcastJoinSuite.scala | 69 ++++++++++++++++++- 3 files changed, 177 insertions(+), 17 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 983770d506836..a1b9c3bbfd059 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1492,6 +1492,64 @@ that these options will be deprecated in future release as more optimizations ar +## Broadcast Hint for SQL Queries + +The `BROADCAST` hint guides Spark to broadcast each specified table when joining them with another table or view. +When Spark deciding the join methods, the broadcast hash join (i.e., BHJ) is preferred, +even if the statistics is above the configuration `spark.sql.autoBroadcastJoinThreshold`. +When both sides of a join are specified, Spark broadcasts the one having the lower statistics. +Note Spark does not guarantee BHJ is always chosen, since not all cases (e.g. full outer join) +support BHJ. When the broadcast nested loop join is selected, we still respect the hint. + +
+<div class="codetabs">
+
+<div data-lang="scala" markdown="1">
+
+{% highlight scala %}
+import org.apache.spark.sql.functions.broadcast
+broadcast(spark.table("src")).join(spark.table("records"), "key").show()
+{% endhighlight %}
+
+</div>
+
+<div data-lang="java" markdown="1">
+
+{% highlight java %}
+import static org.apache.spark.sql.functions.broadcast;
+broadcast(spark.table("src")).join(spark.table("records"), "key").show();
+{% endhighlight %}
+
+</div>
+
+<div data-lang="python" markdown="1">
+
+{% highlight python %}
+from pyspark.sql.functions import broadcast
+broadcast(spark.table("src")).join(spark.table("records"), "key").show()
+{% endhighlight %}
+
+</div>
+
+<div data-lang="r" markdown="1">
+
+{% highlight r %}
+src <- sql("SELECT * FROM src")
+records <- sql("SELECT * FROM records")
+head(join(broadcast(src), records, src$key == records$key))
+{% endhighlight %}
+
+</div>
+
+<div data-lang="sql" markdown="1">
+
+{% highlight sql %}
+-- We accept BROADCAST, BROADCASTJOIN and MAPJOIN for broadcast hint
+SELECT /*+ BROADCAST(r) */ * FROM records r JOIN src s ON r.key = s.key
+{% endhighlight %}
+
+</div>
+</div>
+ # Distributed SQL Engine Spark SQL can also act as a distributed query engine using its JDBC/ODBC or command-line interface. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 19b858faba6ea..1fe3cb1c8750a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.columnar.{InMemoryRelation, InMemoryTableScanExec} import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec -import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight} +import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.StreamingQuery @@ -91,12 +91,12 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { * predicates can be evaluated by matching join keys. If found, Join implementations are chosen * with the following precedence: * - * - Broadcast: if one side of the join has an estimated physical size that is smaller than the - * user-configurable [[SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]] threshold - * or if that side has an explicit broadcast hint (e.g. the user applied the - * [[org.apache.spark.sql.functions.broadcast()]] function to a DataFrame), then that side - * of the join will be broadcasted and the other side will be streamed, with no shuffling - * performed. If both sides of the join are eligible to be broadcasted then the + * - Broadcast: We prefer to broadcast the join side with an explicit broadcast hint(e.g. the + * user applied the [[org.apache.spark.sql.functions.broadcast()]] function to a DataFrame). + * If both sides have the broadcast hint, we prefer to broadcast the side with a smaller + * estimated physical size. If neither one of the sides has the broadcast hint, + * we only broadcast the join side if its estimated physical size that is smaller than + * the user-configurable [[SQLConf.AUTO_BROADCASTJOIN_THRESHOLD]] threshold. * - Shuffle hash join: if the average size of a single partition is small enough to build a hash * table. * - Sort merge: if the matching join keys are sortable. @@ -112,9 +112,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { * Matches a plan whose output should be small enough to be used in broadcast join. 
*/ private def canBroadcast(plan: LogicalPlan): Boolean = { - plan.stats.hints.broadcast || - (plan.stats.sizeInBytes >= 0 && - plan.stats.sizeInBytes <= conf.autoBroadcastJoinThreshold) + plan.stats.sizeInBytes >= 0 && plan.stats.sizeInBytes <= conf.autoBroadcastJoinThreshold } /** @@ -149,10 +147,45 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case _ => false } + private def broadcastSide( + canBuildLeft: Boolean, + canBuildRight: Boolean, + left: LogicalPlan, + right: LogicalPlan): BuildSide = { + + def smallerSide = + if (right.stats.sizeInBytes <= left.stats.sizeInBytes) BuildRight else BuildLeft + + val buildRight = canBuildRight && right.stats.hints.broadcast + val buildLeft = canBuildLeft && left.stats.hints.broadcast + + if (buildRight && buildLeft) { + // Broadcast smaller side base on its estimated physical size + // if both sides have broadcast hint + smallerSide + } else if (buildRight) { + BuildRight + } else if (buildLeft) { + BuildLeft + } else if (canBuildRight && canBuildLeft) { + // for the last default broadcast nested loop join + smallerSide + } else { + throw new AnalysisException("Can not decide which side to broadcast for this join") + } + } + def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { // --- BroadcastHashJoin -------------------------------------------------------------------- + case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right) + if (canBuildRight(joinType) && right.stats.hints.broadcast) || + (canBuildLeft(joinType) && left.stats.hints.broadcast) => + val buildSide = broadcastSide(canBuildLeft(joinType), canBuildRight(joinType), left, right) + Seq(joins.BroadcastHashJoinExec( + leftKeys, rightKeys, joinType, buildSide, condition, planLater(left), planLater(right))) + case ExtractEquiJoinKeys(joinType, leftKeys, rightKeys, condition, left, right) if canBuildRight(joinType) && canBroadcast(right) => Seq(joins.BroadcastHashJoinExec( @@ -189,6 +222,13 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { // --- Without joining keys ------------------------------------------------------------ // Pick BroadcastNestedLoopJoin if one side could be broadcasted + case j @ logical.Join(left, right, joinType, condition) + if (canBuildRight(joinType) && right.stats.hints.broadcast) || + (canBuildLeft(joinType) && left.stats.hints.broadcast) => + val buildSide = broadcastSide(canBuildLeft(joinType), canBuildRight(joinType), left, right) + joins.BroadcastNestedLoopJoinExec( + planLater(left), planLater(right), buildSide, joinType, condition) :: Nil + case j @ logical.Join(left, right, joinType, condition) if canBuildRight(joinType) && canBroadcast(right) => joins.BroadcastNestedLoopJoinExec( @@ -203,12 +243,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { joins.CartesianProductExec(planLater(left), planLater(right), condition) :: Nil case logical.Join(left, right, joinType, condition) => - val buildSide = - if (right.stats.sizeInBytes <= left.stats.sizeInBytes) { - BuildRight - } else { - BuildLeft - } + val buildSide = broadcastSide(canBuildLeft = true, canBuildRight = true, left, right) // This join could be very slow or OOM joins.BroadcastNestedLoopJoinExec( planLater(left), planLater(right), buildSide, joinType, condition) :: Nil diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index a0fad862b44c7..67e2cdc7394bb 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -22,7 +22,7 @@ import scala.reflect.ClassTag import org.apache.spark.AccumulatorSuite import org.apache.spark.sql.{Dataset, QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{BitwiseAnd, BitwiseOr, Cast, Literal, ShiftLeft} -import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan, WholeStageCodegenExec} import org.apache.spark.sql.execution.exchange.EnsureRequirements import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -223,4 +223,71 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { assert(HashJoin.rewriteKeyExpr(l :: ss :: Nil) === l :: ss :: Nil) assert(HashJoin.rewriteKeyExpr(i :: ss :: Nil) === i :: ss :: Nil) } + + test("Shouldn't change broadcast join buildSide if user clearly specified") { + def assertJoinBuildSide(sqlStr: String, joinMethod: String, buildSide: BuildSide): Any = { + val executedPlan = sql(sqlStr).queryExecution.executedPlan + executedPlan match { + case b: BroadcastNestedLoopJoinExec => + assert(b.getClass.getSimpleName === joinMethod) + assert(b.buildSide === buildSide) + case w: WholeStageCodegenExec => + assert(w.children.head.getClass.getSimpleName === joinMethod) + assert(w.children.head.asInstanceOf[BroadcastHashJoinExec].buildSide === buildSide) + } + } + + withTempView("t1", "t2") { + spark.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value").createTempView("t1") + spark.createDataFrame(Seq((1, "1"), (2, "12.3"), (2, "123"))).toDF("key", "value") + .createTempView("t2") + + val t1Size = spark.table("t1").queryExecution.analyzed.children.head.stats.sizeInBytes + val t2Size = spark.table("t2").queryExecution.analyzed.children.head.stats.sizeInBytes + assert(t1Size < t2Size) + + val bh = BroadcastHashJoinExec.toString + val bl = BroadcastNestedLoopJoinExec.toString + + // INNER JOIN && t1Size < t2Size => BuildLeft + assertJoinBuildSide( + "SELECT /*+ MAPJOIN(t1, t2) */ * FROM t1 JOIN t2 ON t1.key = t2.key", bh, BuildLeft) + // LEFT JOIN => BuildRight + assertJoinBuildSide( + "SELECT /*+ MAPJOIN(t1, t2) */ * FROM t1 LEFT JOIN t2 ON t1.key = t2.key", bh, BuildRight) + // RIGHT JOIN => BuildLeft + assertJoinBuildSide( + "SELECT /*+ MAPJOIN(t1, t2) */ * FROM t1 RIGHT JOIN t2 ON t1.key = t2.key", bh, BuildLeft) + // INNER JOIN && broadcast(t1) => BuildLeft + assertJoinBuildSide( + "SELECT /*+ MAPJOIN(t1) */ * FROM t1 JOIN t2 ON t1.key = t2.key", bh, BuildLeft) + // INNER JOIN && broadcast(t2) => BuildRight + assertJoinBuildSide( + "SELECT /*+ MAPJOIN(t2) */ * FROM t1 JOIN t2 ON t1.key = t2.key", bh, BuildRight) + + + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0", + SQLConf.CROSS_JOINS_ENABLED.key -> "true") { + // INNER JOIN && t1Size < t2Size => BuildLeft + assertJoinBuildSide("SELECT /*+ MAPJOIN(t1, t2) */ * FROM t1 JOIN t2", bl, BuildLeft) + // FULL JOIN && t1Size < t2Size => BuildLeft + assertJoinBuildSide("SELECT /*+ MAPJOIN(t1, t2) */ * FROM t1 FULL JOIN t2", bl, BuildLeft) + // LEFT JOIN => BuildRight + assertJoinBuildSide("SELECT /*+ MAPJOIN(t1, t2) */ * FROM t1 LEFT JOIN t2", bl, BuildRight) + // RIGHT JOIN => BuildLeft + assertJoinBuildSide("SELECT /*+ MAPJOIN(t1, t2) */ * FROM t1 RIGHT JOIN t2", bl, BuildLeft) + // INNER JOIN && broadcast(t1) => BuildLeft + assertJoinBuildSide("SELECT /*+ MAPJOIN(t1) */ * FROM t1 
JOIN t2", bl, BuildLeft) + // INNER JOIN && broadcast(t2) => BuildRight + assertJoinBuildSide("SELECT /*+ MAPJOIN(t2) */ * FROM t1 JOIN t2", bl, BuildRight) + // FULL OUTER && broadcast(t1) => BuildLeft + assertJoinBuildSide("SELECT /*+ MAPJOIN(t1) */ * FROM t1 FULL OUTER JOIN t2", bl, BuildLeft) + // FULL OUTER && broadcast(t2) => BuildRight + assertJoinBuildSide( + "SELECT /*+ MAPJOIN(t2) */ * FROM t1 FULL OUTER JOIN t2", bl, BuildRight) + // FULL OUTER && t1Size < t2Size => BuildLeft + assertJoinBuildSide("SELECT * FROM t1 FULL OUTER JOIN t2", bl, BuildLeft) + } + } + } } From f5f8e84d9d35751dad51490b6ae22931aa88db7b Mon Sep 17 00:00:00 2001 From: Adrian Ionescu Date: Thu, 30 Nov 2017 15:41:34 -0800 Subject: [PATCH 018/356] [SPARK-22614] Dataset API: repartitionByRange(...) ## What changes were proposed in this pull request? This PR introduces a way to explicitly range-partition a Dataset. So far, only round-robin and hash partitioning were possible via `df.repartition(...)`, but sometimes range partitioning might be desirable: e.g. when writing to disk, for better compression without the cost of global sort. The current implementation piggybacks on the existing `RepartitionByExpression` `LogicalPlan` and simply adds the following logic: If its expressions are of type `SortOrder`, then it will do `RangePartitioning`; otherwise `HashPartitioning`. This was by far the least intrusive solution I could come up with. ## How was this patch tested? Unit test for `RepartitionByExpression` changes, a test to ensure we're not changing the behavior of existing `.repartition()` and a few end-to-end tests in `DataFrameSuite`. Author: Adrian Ionescu Closes #19828 from adrian-ionescu/repartitionByRange. --- .../plans/logical/basicLogicalOperators.scala | 20 +++++++ .../sql/catalyst/analysis/AnalysisSuite.scala | 26 +++++++++ .../scala/org/apache/spark/sql/Dataset.scala | 57 +++++++++++++++++-- .../spark/sql/execution/SparkStrategies.scala | 5 +- .../org/apache/spark/sql/DataFrameSuite.scala | 57 +++++++++++++++++++ 5 files changed, 157 insertions(+), 8 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index c2750c3079814..93de7c1daf5c2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.statsEstimation._ +import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils import org.apache.spark.util.random.RandomSampler @@ -838,6 +839,25 @@ case class RepartitionByExpression( require(numPartitions > 0, s"Number of partitions ($numPartitions) must be positive.") + val partitioning: Partitioning = { + val (sortOrder, nonSortOrder) = partitionExpressions.partition(_.isInstanceOf[SortOrder]) + + require(sortOrder.isEmpty || nonSortOrder.isEmpty, + s"${getClass.getSimpleName} expects that either all its `partitionExpressions` are of type " + + "`SortOrder`, which means `RangePartitioning`, or none of them are `SortOrder`, 
which " + + "means `HashPartitioning`. In this case we have:" + + s""" + |SortOrder: ${sortOrder} + |NonSortOrder: ${nonSortOrder} + """.stripMargin) + + if (sortOrder.nonEmpty) { + RangePartitioning(sortOrder.map(_.asInstanceOf[SortOrder]), numPartitions) + } else { + HashPartitioning(nonSortOrder, numPartitions) + } + } + override def maxRows: Option[Long] = child.maxRows override def shuffle: Boolean = true } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index e56a5d6368318..0e2e706a31a05 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.{Cross, Inner} import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning} import org.apache.spark.sql.types._ @@ -514,4 +515,29 @@ class AnalysisSuite extends AnalysisTest with Matchers { Seq("Number of column aliases does not match number of columns. " + "Number of column aliases: 5; number of columns: 4.")) } + + test("SPARK-22614 RepartitionByExpression partitioning") { + def checkPartitioning[T <: Partitioning](numPartitions: Int, exprs: Expression*): Unit = { + val partitioning = RepartitionByExpression(exprs, testRelation2, numPartitions).partitioning + assert(partitioning.isInstanceOf[T]) + } + + checkPartitioning[HashPartitioning](numPartitions = 10, exprs = Literal(20)) + checkPartitioning[HashPartitioning](numPartitions = 10, exprs = 'a.attr, 'b.attr) + + checkPartitioning[RangePartitioning](numPartitions = 10, + exprs = SortOrder(Literal(10), Ascending)) + checkPartitioning[RangePartitioning](numPartitions = 10, + exprs = SortOrder('a.attr, Ascending), SortOrder('b.attr, Descending)) + + intercept[IllegalArgumentException] { + checkPartitioning(numPartitions = 0, exprs = Literal(20)) + } + intercept[IllegalArgumentException] { + checkPartitioning(numPartitions = -1, exprs = Literal(20)) + } + intercept[IllegalArgumentException] { + checkPartitioning(numPartitions = 10, exprs = SortOrder('a.attr, Ascending), 'b.attr) + } + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 1620ab3aa2094..167c9d050c3c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -2732,8 +2732,18 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ @scala.annotation.varargs - def repartition(numPartitions: Int, partitionExprs: Column*): Dataset[T] = withTypedPlan { - RepartitionByExpression(partitionExprs.map(_.expr), logicalPlan, numPartitions) + def repartition(numPartitions: Int, partitionExprs: Column*): Dataset[T] = { + // The underlying `LogicalPlan` operator special-cases all-`SortOrder` arguments. + // However, we don't want to complicate the semantics of this API method. + // Instead, let's give users a friendly error message, pointing them to the new method. 
+ val sortOrders = partitionExprs.filter(_.expr.isInstanceOf[SortOrder]) + if (sortOrders.nonEmpty) throw new IllegalArgumentException( + s"""Invalid partitionExprs specified: $sortOrders + |For range partitioning use repartitionByRange(...) instead. + """.stripMargin) + withTypedPlan { + RepartitionByExpression(partitionExprs.map(_.expr), logicalPlan, numPartitions) + } } /** @@ -2747,9 +2757,46 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ @scala.annotation.varargs - def repartition(partitionExprs: Column*): Dataset[T] = withTypedPlan { - RepartitionByExpression( - partitionExprs.map(_.expr), logicalPlan, sparkSession.sessionState.conf.numShufflePartitions) + def repartition(partitionExprs: Column*): Dataset[T] = { + repartition(sparkSession.sessionState.conf.numShufflePartitions, partitionExprs: _*) + } + + /** + * Returns a new Dataset partitioned by the given partitioning expressions into + * `numPartitions`. The resulting Dataset is range partitioned. + * + * At least one partition-by expression must be specified. + * When no explicit sort order is specified, "ascending nulls first" is assumed. + * + * @group typedrel + * @since 2.3.0 + */ + @scala.annotation.varargs + def repartitionByRange(numPartitions: Int, partitionExprs: Column*): Dataset[T] = { + require(partitionExprs.nonEmpty, "At least one partition-by expression must be specified.") + val sortOrder: Seq[SortOrder] = partitionExprs.map(_.expr match { + case expr: SortOrder => expr + case expr: Expression => SortOrder(expr, Ascending) + }) + withTypedPlan { + RepartitionByExpression(sortOrder, logicalPlan, numPartitions) + } + } + + /** + * Returns a new Dataset partitioned by the given partitioning expressions, using + * `spark.sql.shuffle.partitions` as number of partitions. + * The resulting Dataset is range partitioned. + * + * At least one partition-by expression must be specified. + * When no explicit sort order is specified, "ascending nulls first" is assumed. 
+ * + * @group typedrel + * @since 2.3.0 + */ + @scala.annotation.varargs + def repartitionByRange(partitionExprs: Column*): Dataset[T] = { + repartitionByRange(sparkSession.sessionState.conf.numShufflePartitions, partitionExprs: _*) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 1fe3cb1c8750a..9e713cd7bbe2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -482,9 +482,8 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { execution.RDDScanExec(Nil, singleRowRdd, "OneRowRelation") :: Nil case r: logical.Range => execution.RangeExec(r) :: Nil - case logical.RepartitionByExpression(expressions, child, numPartitions) => - exchange.ShuffleExchangeExec(HashPartitioning( - expressions, numPartitions), planLater(child)) :: Nil + case r: logical.RepartitionByExpression => + exchange.ShuffleExchangeExec(r.partitioning, planLater(r.child)) :: Nil case ExternalRDD(outputObjAttr, rdd) => ExternalRDDScanExec(outputObjAttr, rdd) :: Nil case r: LogicalRDD => RDDScanExec(r.output, r.rdd, "ExistingRDD", r.outputPartitioning, r.outputOrdering) :: Nil diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 72a5cc98fbec3..5e4c1a6a484fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -358,6 +358,63 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { testData.select('key).collect().toSeq) } + test("repartition with SortOrder") { + // passing SortOrder expressions to .repartition() should result in an informative error + + def checkSortOrderErrorMsg[T](data: => Dataset[T]): Unit = { + val ex = intercept[IllegalArgumentException](data) + assert(ex.getMessage.contains("repartitionByRange")) + } + + checkSortOrderErrorMsg { + Seq(0).toDF("a").repartition(2, $"a".asc) + } + + checkSortOrderErrorMsg { + Seq((0, 0)).toDF("a", "b").repartition(2, $"a".asc, $"b") + } + } + + test("repartitionByRange") { + val data1d = Random.shuffle(0.to(9)) + val data2d = data1d.map(i => (i, data1d.size - i)) + + checkAnswer( + data1d.toDF("val").repartitionByRange(data1d.size, $"val".asc) + .select(spark_partition_id().as("id"), $"val"), + data1d.map(i => Row(i, i))) + + checkAnswer( + data1d.toDF("val").repartitionByRange(data1d.size, $"val".desc) + .select(spark_partition_id().as("id"), $"val"), + data1d.map(i => Row(i, data1d.size - 1 - i))) + + checkAnswer( + data1d.toDF("val").repartitionByRange(data1d.size, lit(42)) + .select(spark_partition_id().as("id"), $"val"), + data1d.map(i => Row(0, i))) + + checkAnswer( + data1d.toDF("val").repartitionByRange(data1d.size, lit(null), $"val".asc, rand()) + .select(spark_partition_id().as("id"), $"val"), + data1d.map(i => Row(i, i))) + + // .repartitionByRange() assumes .asc by default if no explicit sort order is specified + checkAnswer( + data2d.toDF("a", "b").repartitionByRange(data2d.size, $"a".desc, $"b") + .select(spark_partition_id().as("id"), $"a", $"b"), + data2d.toDF("a", "b").repartitionByRange(data2d.size, $"a".desc, $"b".asc) + .select(spark_partition_id().as("id"), $"a", $"b")) + + // at least one partition-by expression must be specified + intercept[IllegalArgumentException] { + 
data1d.toDF("val").repartitionByRange(data1d.size) + } + intercept[IllegalArgumentException] { + data1d.toDF("val").repartitionByRange(data1d.size, Seq.empty: _*) + } + } + test("coalesce") { intercept[IllegalArgumentException] { testData.select('key).coalesce(0) From 7e5f669eb684629c88218f8ec26c01a41a6fef32 Mon Sep 17 00:00:00 2001 From: gaborgsomogyi Date: Thu, 30 Nov 2017 19:20:32 -0600 Subject: [PATCH 019/356] =?UTF-8?q?[SPARK-22428][DOC]=20Add=20spark=20appl?= =?UTF-8?q?ication=20garbage=20collector=20configurat=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? The spark properties for configuring the ContextCleaner are not documented in the official documentation at https://spark.apache.org/docs/latest/configuration.html#available-properties. This PR adds the doc. ## How was this patch tested? Manual. ``` cd docs jekyll build open _site/configuration.html ``` Author: gaborgsomogyi Closes #19826 from gaborgsomogyi/SPARK-22428. --- docs/configuration.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/docs/configuration.md b/docs/configuration.md index e42f866c40566..ef061dd39dcba 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1132,6 +1132,46 @@ Apart from these, the following properties are also available, and may be useful to get the replication level of the block to the initial number. + + spark.cleaner.periodicGC.interval + 30min + + Controls how often to trigger a garbage collection.

+ This context cleaner triggers cleanups only when weak references are garbage collected. + In long-running applications with large driver JVMs, where there is little memory pressure + on the driver, this may happen very occasionally or not at all. Not cleaning at all may + lead to executors running out of disk space after a while. + + + + spark.cleaner.referenceTracking + true + + Enables or disables context cleaning. + + + + spark.cleaner.referenceTracking.blocking + true + + Controls whether the cleaning thread should block on cleanup tasks (other than shuffle, which is controlled by + spark.cleaner.referenceTracking.blocking.shuffle Spark property). + + + + spark.cleaner.referenceTracking.blocking.shuffle + false + + Controls whether the cleaning thread should block on shuffle cleanup tasks. + + + + spark.cleaner.referenceTracking.cleanCheckpoints + false + + Controls whether to clean checkpoint files if the reference is out of scope. + + ### Execution Behavior From 7da1f5708cc96c18ddb3acd09542621275e71d83 Mon Sep 17 00:00:00 2001 From: Min Shen Date: Thu, 30 Nov 2017 19:24:44 -0600 Subject: [PATCH 020/356] =?UTF-8?q?[SPARK-22373]=20Bump=20Janino=20depende?= =?UTF-8?q?ncy=20version=20to=20fix=20thread=20safety=20issue=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … with Janino when compiling generated code. ## What changes were proposed in this pull request? Bump up Janino dependency version to fix thread safety issue during compiling generated code ## How was this patch tested? Check https://issues.apache.org/jira/browse/SPARK-22373 for details. Converted part of the code in CodeGenerator into a standalone application, so the issue can be consistently reproduced locally. Verified that changing Janino dependency version resolved this issue. Author: Min Shen Closes #19839 from Victsm/SPARK-22373. 
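Illustrative only, not part of this patch: a rough sketch of the kind of standalone reproducer described above, i.e. many threads compiling small generated class bodies through Janino concurrently. It assumes Janino's `ClassBodyEvaluator` API; the object name and the generated method body are made up.

```scala
import java.util.concurrent.{Executors, TimeUnit}

import org.codehaus.janino.ClassBodyEvaluator

object ConcurrentCompileSketch {
  def main(args: Array[String]): Unit = {
    val pool = Executors.newFixedThreadPool(8)
    (1 to 64).foreach { i =>
      pool.submit(new Runnable {
        override def run(): Unit = {
          // Each task compiles its own tiny class body, mimicking codegen under load.
          val evaluator = new ClassBodyEvaluator()
          evaluator.cook(s"public int apply() { return $i; }")
          evaluator.getClazz()
        }
      })
    }
    pool.shutdown()
    pool.awaitTermination(1, TimeUnit.MINUTES)
  }
}
```

The dependency bump below (3.0.0 to 3.0.7) picks up Janino's fix for this thread-safety issue.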
--- dev/deps/spark-deps-hadoop-2.6 | 4 ++-- dev/deps/spark-deps-hadoop-2.7 | 4 ++-- pom.xml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 50ac6d139bbd4..8f508219c2ded 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -35,7 +35,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-3.0.0.jar +commons-compiler-3.0.7.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar @@ -96,7 +96,7 @@ jackson-mapper-asl-1.9.13.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar -janino-3.0.0.jar +janino-3.0.7.jar java-xmlbuilder-1.0.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 1b1e3166d53db..68e937f50b391 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -35,7 +35,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-3.0.0.jar +commons-compiler-3.0.7.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar @@ -96,7 +96,7 @@ jackson-mapper-asl-1.9.13.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar -janino-3.0.0.jar +janino-3.0.7.jar java-xmlbuilder-1.0.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar diff --git a/pom.xml b/pom.xml index 7bc66e7d19540..731ee86439eff 100644 --- a/pom.xml +++ b/pom.xml @@ -170,7 +170,7 @@ 3.5 3.2.10 - 3.0.0 + 3.0.7 2.22.2 2.9.3 3.5.2 From dc365422bb337d19ef39739c7c3cf9e53ec85d09 Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Fri, 1 Dec 2017 10:53:16 +0800 Subject: [PATCH 021/356] =?UTF-8?q?[SPARK-22653]=20executorAddress=20regis?= =?UTF-8?q?tered=20in=20CoarseGrainedSchedulerBac=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://issues.apache.org/jira/browse/SPARK-22653 executorRef.address can be null, pass the executorAddress which accounts for it being null a few lines above the fix. Manually tested this patch. You can reproduce the issue by running a simple spark-shell in yarn client mode with dynamic allocation and request some executors up front. Let those executors idle timeout. Get a heap dump. Without this fix, you will see that addressToExecutorId still contains the ids, with the fix addressToExecutorId is properly cleaned up. Author: Thomas Graves Closes #19850 from tgravescs/SPARK-22653. 
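Illustrative only, not part of this patch: a minimal, self-contained sketch of the leak pattern being fixed, with made-up names. The registration map is keyed by a fallback address, but cleanup later looks up the address stored with the executor data; if the stored address is the (possibly null) `executorRef.address`, the map entry is never removed.

```scala
import scala.collection.mutable

object AddressLeakSketch {
  // Stand-in for ExecutorData: remembers the address recorded at registration time.
  final case class ExecutorInfo(storedAddress: String)

  def main(args: Array[String]): Unit = {
    val addressToExecutorId = mutable.HashMap.empty[String, String]
    val executorDataMap = mutable.HashMap.empty[String, ExecutorInfo]

    val reportedAddress: String = null   // executorRef.address may be null
    val senderAddress = "host-1:34567"   // fallback derived from the RPC sender
    val executorAddress = Option(reportedAddress).getOrElse(senderAddress)

    // Registration keys the map by the fallback address...
    addressToExecutorId(executorAddress) = "exec-1"
    // ...but (before the fix) stores the reported, possibly-null address.
    executorDataMap("exec-1") = ExecutorInfo(reportedAddress)

    // Removal cleans up using the stored address and misses the real key.
    executorDataMap.remove("exec-1").foreach { info =>
      addressToExecutorId.remove(info.storedAddress)
    }
    println(s"leaked entries: ${addressToExecutorId.size}")  // prints 1
  }
}
```

The one-line fix stores `executorAddress` in `ExecutorData`, so later cleanup removes the same key that registration inserted.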
--- .../spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 22d9c4cf81c55..7bfb4d53c1834 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -182,7 +182,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp addressToExecutorId(executorAddress) = executorId totalCoreCount.addAndGet(cores) totalRegisteredExecutors.addAndGet(1) - val data = new ExecutorData(executorRef, executorRef.address, hostname, + val data = new ExecutorData(executorRef, executorAddress, hostname, cores, cores, logUrls) // This must be synchronized because variables mutated // in this block are read when requesting executors From 16adaf634bcca3074b448d95e72177eefdf50069 Mon Sep 17 00:00:00 2001 From: sujith71955 Date: Thu, 30 Nov 2017 20:45:30 -0800 Subject: [PATCH 022/356] [SPARK-22601][SQL] Data load is getting displayed successful on providing non existing nonlocal file path ## What changes were proposed in this pull request? When a user loads data from a non-existent HDFS file path, the system does not validate the path and the LOAD command reports success, which is misleading. A validation already exists for non-existent local file paths; this PR adds the same validation for non-existent HDFS file paths. ## How was this patch tested? A unit test has been added to verify the issue, and screenshots were taken after verifying the fix on a Spark YARN cluster. Author: sujith71955 Closes #19823 from sujith71955/master_LoadComand_Issue. 
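Illustrative only, not part of this patch: a minimal sketch of the behavior change, mirroring the new `HiveDDLSuite` test below; the app name is made up.

```scala
import org.apache.spark.sql.{AnalysisException, SparkSession}

object LoadDataValidationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("load-data-validation-sketch")
      .enableHiveSupport()   // LOAD DATA targets a Hive table
      .getOrCreate()

    spark.sql("CREATE TABLE tbl(i INT, j STRING)")
    try {
      // The path does not exist in HDFS: before this patch the command reported success.
      spark.sql("LOAD DATA INPATH '/doesnotexist.csv' INTO TABLE tbl")
    } catch {
      case e: AnalysisException =>
        // With the fix: "LOAD DATA input path does not exist: /doesnotexist.csv"
        println(e.getMessage)
    }
  }
}
```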
--- .../org/apache/spark/sql/execution/command/tables.scala | 9 ++++++++- .../apache/spark/sql/hive/execution/HiveDDLSuite.scala | 9 +++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index c9f6e571ddab3..c42e6c3257fad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -340,7 +340,7 @@ case class LoadDataCommand( uri } else { val uri = new URI(path) - if (uri.getScheme() != null && uri.getAuthority() != null) { + val hdfsUri = if (uri.getScheme() != null && uri.getAuthority() != null) { uri } else { // Follow Hive's behavior: @@ -380,6 +380,13 @@ case class LoadDataCommand( } new URI(scheme, authority, absolutePath, uri.getQuery(), uri.getFragment()) } + val hadoopConf = sparkSession.sessionState.newHadoopConf() + val srcPath = new Path(hdfsUri) + val fs = srcPath.getFileSystem(hadoopConf) + if (!fs.exists(srcPath)) { + throw new AnalysisException(s"LOAD DATA input path does not exist: $path") + } + hdfsUri } if (partition.nonEmpty) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 9063ef066aa84..6c11905ba8904 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -2141,4 +2141,13 @@ class HiveDDLSuite } } } + + test("load command for non local invalid path validation") { + withTable("tbl") { + sql("CREATE TABLE tbl(i INT, j STRING)") + val e = intercept[AnalysisException]( + sql("load data inpath '/doesnotexist.csv' into table tbl")) + assert(e.message.contains("LOAD DATA input path does not exist")) + } + } } From 9d06a9e0cf05af99ba210fabae1e77eccfce7986 Mon Sep 17 00:00:00 2001 From: Mark Petruska Date: Fri, 1 Dec 2017 05:14:12 -0600 Subject: [PATCH 023/356] [SPARK-22393][SPARK-SHELL] spark-shell can't find imported types in class constructors, extends clause ## What changes were proposed in this pull request? [SPARK-22393](https://issues.apache.org/jira/browse/SPARK-22393) ## How was this patch tested? With a new test case in `RepSuite` ---- This code is a retrofit of the Scala [SI-9881](https://github.com/scala/bug/issues/9881) bug fix, which never made it into the Scala 2.11 branches. Pushing these changes directly to the Scala repo is not practical (see: https://github.com/scala/scala/pull/6195). Author: Mark Petruska Closes #19846 from mpetruska/SPARK-22393. 
--- .../apache/spark/repl/SparkExprTyper.scala | 74 +++++++++++++ .../org/apache/spark/repl/SparkILoop.scala | 4 + .../spark/repl/SparkILoopInterpreter.scala | 103 ++++++++++++++++++ .../org/apache/spark/repl/ReplSuite.scala | 10 ++ 4 files changed, 191 insertions(+) create mode 100644 repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala create mode 100644 repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala new file mode 100644 index 0000000000000..724ce9af49f77 --- /dev/null +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkExprTyper.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.repl + +import scala.tools.nsc.interpreter.{ExprTyper, IR} + +trait SparkExprTyper extends ExprTyper { + + import repl._ + import global.{reporter => _, Import => _, _} + import naming.freshInternalVarName + + def doInterpret(code: String): IR.Result = { + // interpret/interpretSynthetic may change the phase, + // which would have unintended effects on types. + val savedPhase = phase + try interpretSynthetic(code) finally phase = savedPhase + } + + override def symbolOfLine(code: String): Symbol = { + def asExpr(): Symbol = { + val name = freshInternalVarName() + // Typing it with a lazy val would give us the right type, but runs + // into compiler bugs with things like existentials, so we compile it + // behind a def and strip the NullaryMethodType which wraps the expr. 
+ val line = "def " + name + " = " + code + + doInterpret(line) match { + case IR.Success => + val sym0 = symbolOfTerm(name) + // drop NullaryMethodType + sym0.cloneSymbol setInfo exitingTyper(sym0.tpe_*.finalResultType) + case _ => NoSymbol + } + } + + def asDefn(): Symbol = { + val old = repl.definedSymbolList.toSet + + doInterpret(code) match { + case IR.Success => + repl.definedSymbolList filterNot old match { + case Nil => NoSymbol + case sym :: Nil => sym + case syms => NoSymbol.newOverloaded(NoPrefix, syms) + } + case _ => NoSymbol + } + } + + def asError(): Symbol = { + doInterpret(code) + NoSymbol + } + + beSilentDuring(asExpr()) orElse beSilentDuring(asDefn()) orElse asError() + } + +} diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala index 3ce7cc7c85f74..e69441a475e9a 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoop.scala @@ -35,6 +35,10 @@ class SparkILoop(in0: Option[BufferedReader], out: JPrintWriter) def this(in0: BufferedReader, out: JPrintWriter) = this(Some(in0), out) def this() = this(None, new JPrintWriter(Console.out, true)) + override def createInterpreter(): Unit = { + intp = new SparkILoopInterpreter(settings, out) + } + val initializationCommands: Seq[String] = Seq( """ @transient val spark = if (org.apache.spark.repl.Main.sparkSession != null) { diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala new file mode 100644 index 0000000000000..0803426403af5 --- /dev/null +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.repl + +import scala.tools.nsc.Settings +import scala.tools.nsc.interpreter._ + +class SparkILoopInterpreter(settings: Settings, out: JPrintWriter) extends IMain(settings, out) { + self => + + override lazy val memberHandlers = new { + val intp: self.type = self + } with MemberHandlers { + import intp.global._ + + override def chooseHandler(member: intp.global.Tree): MemberHandler = member match { + case member: Import => new SparkImportHandler(member) + case _ => super.chooseHandler (member) + } + + class SparkImportHandler(imp: Import) extends ImportHandler(imp: Import) { + + override def targetType: Type = intp.global.rootMirror.getModuleIfDefined("" + expr) match { + case NoSymbol => intp.typeOfExpression("" + expr) + case sym => sym.tpe + } + + private def safeIndexOf(name: Name, s: String): Int = fixIndexOf(name, pos(name, s)) + private def fixIndexOf(name: Name, idx: Int): Int = if (idx == name.length) -1 else idx + private def pos(name: Name, s: String): Int = { + var i = name.pos(s.charAt(0), 0) + val sLen = s.length() + if (sLen == 1) return i + while (i + sLen <= name.length) { + var j = 1 + while (s.charAt(j) == name.charAt(i + j)) { + j += 1 + if (j == sLen) return i + } + i = name.pos(s.charAt(0), i + 1) + } + name.length + } + + private def isFlattenedSymbol(sym: Symbol): Boolean = + sym.owner.isPackageClass && + sym.name.containsName(nme.NAME_JOIN_STRING) && + sym.owner.info.member(sym.name.take( + safeIndexOf(sym.name, nme.NAME_JOIN_STRING))) != NoSymbol + + private def importableTargetMembers = + importableMembers(exitingTyper(targetType)).filterNot(isFlattenedSymbol).toList + + def isIndividualImport(s: ImportSelector): Boolean = + s.name != nme.WILDCARD && s.rename != nme.WILDCARD + def isWildcardImport(s: ImportSelector): Boolean = + s.name == nme.WILDCARD + + // non-wildcard imports + private def individualSelectors = selectors filter isIndividualImport + + override val importsWildcard: Boolean = selectors exists isWildcardImport + + lazy val importableSymbolsWithRenames: List[(Symbol, Name)] = { + val selectorRenameMap = + individualSelectors.flatMap(x => x.name.bothNames zip x.rename.bothNames).toMap + importableTargetMembers flatMap (m => selectorRenameMap.get(m.name) map (m -> _)) + } + + override lazy val individualSymbols: List[Symbol] = importableSymbolsWithRenames map (_._1) + override lazy val wildcardSymbols: List[Symbol] = + if (importsWildcard) importableTargetMembers else Nil + + } + + } + + object expressionTyper extends { + val repl: SparkILoopInterpreter.this.type = self + } with SparkExprTyper { } + + override def symbolOfLine(code: String): global.Symbol = + expressionTyper.symbolOfLine(code) + + override def typeOfExpression(expr: String, silent: Boolean): global.Type = + expressionTyper.typeOfExpression(expr, silent) + +} diff --git a/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 905b41cdc1594..a5053521f8e31 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -227,4 +227,14 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("error: not found: value sc", output) } + test("spark-shell should find imported types in class constructors and extends clause") { + val output = runInterpreter("local", + """ + |import org.apache.spark.Partition + |class P(p: Partition) + |class P(val index: Int) extends Partition + """.stripMargin) + 
assertDoesNotContain("error: not found: type Partition", output) + } + } From ee10ca7ec6cf7fbaab3f95a097b46936d97d0835 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Fri, 1 Dec 2017 13:02:03 -0800 Subject: [PATCH 024/356] [SPARK-22638][SS] Use a separate queue for StreamingQueryListenerBus ## What changes were proposed in this pull request? Use a separate Spark event queue for StreamingQueryListenerBus so that if there are many non-streaming events, streaming query listeners don't need to wait for other Spark listeners and can catch up. ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #19838 from zsxwing/SPARK-22638. --- .../scala/org/apache/spark/scheduler/LiveListenerBus.scala | 4 +++- .../sql/execution/streaming/StreamingQueryListenerBus.scala | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala index 2f93c497c5771..23121402b1025 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala @@ -87,7 +87,9 @@ private[spark] class LiveListenerBus(conf: SparkConf) { * of each other (each one uses a separate thread for delivering events), allowing slower * listeners to be somewhat isolated from others. */ - private def addToQueue(listener: SparkListenerInterface, queue: String): Unit = synchronized { + private[spark] def addToQueue( + listener: SparkListenerInterface, + queue: String): Unit = synchronized { if (stopped.get()) { throw new IllegalStateException("LiveListenerBus is stopped.") } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala index 07e39023c8366..7dd491ede9d05 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryListenerBus.scala @@ -40,7 +40,7 @@ class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) import StreamingQueryListener._ - sparkListenerBus.addToSharedQueue(this) + sparkListenerBus.addToQueue(this, StreamingQueryListenerBus.STREAM_EVENT_QUERY) /** * RunIds of active queries whose events are supposed to be forwarded by this ListenerBus @@ -130,3 +130,7 @@ class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus) } } } + +object StreamingQueryListenerBus { + val STREAM_EVENT_QUERY = "streams" +} From aa4cf2b19e4cf5588af7e2192e0e9f687cd84bc5 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 2 Dec 2017 11:55:43 +0900 Subject: [PATCH 025/356] [SPARK-22651][PYTHON][ML] Prevent initiating multiple Hive clients for ImageSchema.readImages ## What changes were proposed in this pull request? Calling `ImageSchema.readImages` multiple times as below in PySpark shell: ```python from pyspark.ml.image import ImageSchema data_path = 'data/mllib/images/kittens' _ = ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True).collect() _ = ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True).collect() ``` throws an error as below: ``` ... org.datanucleus.exceptions.NucleusDataStoreException: Unable to open a test connection to the given database. JDBC url = jdbc:derby:;databaseName=metastore_db;create=true, username = APP. 
Terminating connection pool (set lazyInit to true if you expect to start your database after your app). Original Exception: ------ java.sql.SQLException: Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1742f639f, see the next exception for details. ... at org.apache.derby.jdbc.AutoloadedDriver.connect(Unknown Source) ... at org.apache.hadoop.hive.metastore.HiveMetaStore.newRetryingHMSHandler(HiveMetaStore.java:5762) ... at org.apache.spark.sql.hive.client.HiveClientImpl.newState(HiveClientImpl.scala:180) ... at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply$mcZ$sp(HiveExternalCatalog.scala:195) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply(HiveExternalCatalog.scala:195) at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$databaseExists$1.apply(HiveExternalCatalog.scala:195) at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:97) at org.apache.spark.sql.hive.HiveExternalCatalog.databaseExists(HiveExternalCatalog.scala:194) at org.apache.spark.sql.internal.SharedState.externalCatalog$lzycompute(SharedState.scala:100) at org.apache.spark.sql.internal.SharedState.externalCatalog(SharedState.scala:88) at org.apache.spark.sql.hive.HiveSessionStateBuilder.externalCatalog(HiveSessionStateBuilder.scala:39) at org.apache.spark.sql.hive.HiveSessionStateBuilder.catalog$lzycompute(HiveSessionStateBuilder.scala:54) at org.apache.spark.sql.hive.HiveSessionStateBuilder.catalog(HiveSessionStateBuilder.scala:52) at org.apache.spark.sql.hive.HiveSessionStateBuilder$$anon$1.(HiveSessionStateBuilder.scala:69) at org.apache.spark.sql.hive.HiveSessionStateBuilder.analyzer(HiveSessionStateBuilder.scala:69) at org.apache.spark.sql.internal.BaseSessionStateBuilder$$anonfun$build$2.apply(BaseSessionStateBuilder.scala:293) at org.apache.spark.sql.internal.BaseSessionStateBuilder$$anonfun$build$2.apply(BaseSessionStateBuilder.scala:293) at org.apache.spark.sql.internal.SessionState.analyzer$lzycompute(SessionState.scala:79) at org.apache.spark.sql.internal.SessionState.analyzer(SessionState.scala:79) at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:70) at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:68) at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:51) at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:70) at org.apache.spark.sql.SparkSession.internalCreateDataFrame(SparkSession.scala:574) at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:593) at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:348) at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:348) at org.apache.spark.ml.image.ImageSchema$$anonfun$readImages$2$$anonfun$apply$1.apply(ImageSchema.scala:253) ... Caused by: ERROR XJ040: Failed to start database 'metastore_db' with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1742f639f, see the next exception for details. at org.apache.derby.iapi.error.StandardException.newException(Unknown Source) at org.apache.derby.impl.jdbc.SQLExceptionFactory.wrapArgsForTransportAcrossDRDA(Unknown Source) ... 121 more Caused by: ERROR XSDB6: Another instance of Derby may have already booted the database /.../spark/metastore_db. ... 
Traceback (most recent call last): File "", line 1, in File "/.../spark/python/pyspark/ml/image.py", line 190, in readImages dropImageFailures, float(sampleRatio), seed) File "/.../spark/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py", line 1160, in __call__ File "/.../spark/python/pyspark/sql/utils.py", line 69, in deco raise AnalysisException(s.split(': ', 1)[1], stackTrace) pyspark.sql.utils.AnalysisException: u'java.lang.RuntimeException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient;' ``` Seems we better stick to `SparkSession.builder.getOrCreate()` like: https://github.com/apache/spark/blob/51620e288b5e0a7fffc3899c9deadabace28e6d7/python/pyspark/sql/streaming.py#L329 https://github.com/apache/spark/blob/dc5d34d8dcd6526d1dfdac8606661561c7576a62/python/pyspark/sql/column.py#L541 https://github.com/apache/spark/blob/33d43bf1b6f55594187066f0e38ba3985fa2542b/python/pyspark/sql/readwriter.py#L105 ## How was this patch tested? This was tested as below in PySpark shell: ```python from pyspark.ml.image import ImageSchema data_path = 'data/mllib/images/kittens' _ = ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True).collect() _ = ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True).collect() ``` Author: hyukjinkwon Closes #19845 from HyukjinKwon/SPARK-22651. --- python/pyspark/ml/image.py | 5 ++--- python/pyspark/ml/tests.py | 31 ++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index 2b61aa9c0d9e9..384599dc0c532 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -201,9 +201,8 @@ def readImages(self, path, recursive=False, numPartitions=-1, .. versionadded:: 2.3.0 """ - ctx = SparkContext._active_spark_context - spark = SparkSession(ctx) - image_schema = ctx._jvm.org.apache.spark.ml.image.ImageSchema + spark = SparkSession.builder.getOrCreate() + image_schema = spark._jvm.org.apache.spark.ml.image.ImageSchema jsession = spark._jsparkSession jresult = image_schema.readImages(path, jsession, recursive, numPartitions, dropImageFailures, float(sampleRatio), seed) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 89ef555cf3442..3a0b816c367ec 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -67,7 +67,7 @@ from pyspark.ml.util import * from pyspark.ml.wrapper import JavaParams, JavaWrapper from pyspark.serializers import PickleSerializer -from pyspark.sql import DataFrame, Row, SparkSession +from pyspark.sql import DataFrame, Row, SparkSession, HiveContext from pyspark.sql.functions import rand from pyspark.sql.types import DoubleType, IntegerType from pyspark.storagelevel import * @@ -1855,6 +1855,35 @@ def test_read_images(self): lambda: ImageSchema.toImage("a")) +class ImageReaderTest2(PySparkTestCase): + + @classmethod + def setUpClass(cls): + PySparkTestCase.setUpClass() + # Note that here we enable Hive's support. 
+ try: + cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf() + except py4j.protocol.Py4JError: + cls.tearDownClass() + raise unittest.SkipTest("Hive is not available") + except TypeError: + cls.tearDownClass() + raise unittest.SkipTest("Hive is not available") + cls.spark = HiveContext._createForTesting(cls.sc) + + @classmethod + def tearDownClass(cls): + PySparkTestCase.tearDownClass() + cls.spark.sparkSession.stop() + + def test_read_images_multiple_times(self): + # This test case is to check if `ImageSchema.readImages` tries to + # initiate Hive client multiple times. See SPARK-22651. + data_path = 'data/mllib/images/kittens' + ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True) + ImageSchema.readImages(data_path, recursive=True, dropImageFailures=True) + + class ALSTest(SparkSessionTestCase): def test_storage_levels(self): From d2cf95aa63f5f5c9423f0455c2bfbee7833c9982 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sat, 2 Dec 2017 07:37:02 -0600 Subject: [PATCH 026/356] [SPARK-22634][BUILD] Update Bouncy Castle to 1.58 ## What changes were proposed in this pull request? Update Bouncy Castle to 1.58, and jets3t to 0.9.4 to (sort of) match. ## How was this patch tested? Existing tests Author: Sean Owen Closes #19859 from srowen/SPARK-22634. --- dev/deps/spark-deps-hadoop-2.6 | 8 +++----- dev/deps/spark-deps-hadoop-2.7 | 8 +++----- pom.xml | 8 +++++++- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 8f508219c2ded..2c68b73095c4d 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -21,7 +21,7 @@ avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar base64-2.3.8.jar -bcprov-jdk15on-1.51.jar +bcprov-jdk15on-1.58.jar bonecp-0.8.0.RELEASE.jar breeze-macros_2.11-0.13.2.jar breeze_2.11-0.13.2.jar @@ -97,7 +97,7 @@ jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar janino-3.0.7.jar -java-xmlbuilder-1.0.jar +java-xmlbuilder-1.1.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar javax.inject-1.jar @@ -115,7 +115,7 @@ jersey-container-servlet-core-2.22.2.jar jersey-guava-2.22.2.jar jersey-media-jaxb-2.22.2.jar jersey-server-2.22.2.jar -jets3t-0.9.3.jar +jets3t-0.9.4.jar jetty-6.1.26.jar jetty-util-6.1.26.jar jline-2.12.1.jar @@ -137,14 +137,12 @@ log4j-1.2.17.jar lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar -mail-1.4.7.jar mesos-1.4.0-shaded-protobuf.jar metrics-core-3.1.5.jar metrics-graphite-3.1.5.jar metrics-json-3.1.5.jar metrics-jvm-3.1.5.jar minlog-1.3.0.jar -mx4j-3.0.2.jar netty-3.9.9.Final.jar netty-all-4.0.47.Final.jar objenesis-2.1.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 68e937f50b391..2aaac600b3ec3 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -21,7 +21,7 @@ avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar base64-2.3.8.jar -bcprov-jdk15on-1.51.jar +bcprov-jdk15on-1.58.jar bonecp-0.8.0.RELEASE.jar breeze-macros_2.11-0.13.2.jar breeze_2.11-0.13.2.jar @@ -97,7 +97,7 @@ jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar janino-3.0.7.jar -java-xmlbuilder-1.0.jar +java-xmlbuilder-1.1.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar javax.inject-1.jar @@ -115,7 +115,7 @@ jersey-container-servlet-core-2.22.2.jar jersey-guava-2.22.2.jar jersey-media-jaxb-2.22.2.jar jersey-server-2.22.2.jar -jets3t-0.9.3.jar +jets3t-0.9.4.jar 
jetty-6.1.26.jar jetty-util-6.1.26.jar jline-2.12.1.jar @@ -138,14 +138,12 @@ log4j-1.2.17.jar lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar -mail-1.4.7.jar mesos-1.4.0-shaded-protobuf.jar metrics-core-3.1.5.jar metrics-graphite-3.1.5.jar metrics-json-3.1.5.jar metrics-jvm-3.1.5.jar minlog-1.3.0.jar -mx4j-3.0.2.jar netty-3.9.9.Final.jar netty-all-4.0.47.Final.jar objenesis-2.1.jar diff --git a/pom.xml b/pom.xml index 731ee86439eff..07bca9d267da0 100644 --- a/pom.xml +++ b/pom.xml @@ -141,7 +141,7 @@ 3.1.5 1.7.7 hadoop2 - 0.9.3 + 0.9.4 1.7.3 1.11.76 @@ -985,6 +985,12 @@ + + org.bouncycastle + bcprov-jdk15on + + 1.58 + org.apache.hadoop hadoop-yarn-api From f23dddf105aef88531b3572ad70889cf2fc300c9 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 3 Dec 2017 22:21:44 +0800 Subject: [PATCH 027/356] [SPARK-20682][SPARK-15474][SPARK-21791] Add new ORCFileFormat based on ORC 1.4.1 ## What changes were proposed in this pull request? Since [SPARK-2883](https://issues.apache.org/jira/browse/SPARK-2883), Apache Spark supports Apache ORC inside `sql/hive` module with Hive dependency. This PR aims to add a new ORC data source inside `sql/core` and to replace the old ORC data source eventually. This PR resolves the following three issues. - [SPARK-20682](https://issues.apache.org/jira/browse/SPARK-20682): Add new ORCFileFormat based on Apache ORC 1.4.1 - [SPARK-15474](https://issues.apache.org/jira/browse/SPARK-15474): ORC data source fails to write and read back empty dataframe - [SPARK-21791](https://issues.apache.org/jira/browse/SPARK-21791): ORC should support column names with dot ## How was this patch tested? Pass the Jenkins with the existing all tests and new tests for SPARK-15474 and SPARK-21791. Author: Dongjoon Hyun Author: Wenchen Fan Closes #19651 from dongjoon-hyun/SPARK-20682. --- .../datasources/orc/OrcDeserializer.scala | 243 ++++++++++++++++++ .../datasources/orc/OrcFileFormat.scala | 139 +++++++++- .../datasources/orc/OrcFilters.scala | 210 +++++++++++++++ .../datasources/orc/OrcOutputWriter.scala | 53 ++++ .../datasources/orc/OrcSerializer.scala | 228 ++++++++++++++++ .../execution/datasources/orc/OrcUtils.scala | 113 ++++++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 25 ++ 7 files changed, 1009 insertions(+), 2 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOutputWriter.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala new file mode 100644 index 0000000000000..4ecc54bd2fd96 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcDeserializer.scala @@ -0,0 +1,243 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.orc + +import org.apache.hadoop.io._ +import org.apache.orc.mapred.{OrcList, OrcMap, OrcStruct, OrcTimestamp} +import org.apache.orc.storage.serde2.io.{DateWritable, HiveDecimalWritable} + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArrayData} +import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +/** + * A deserializer to deserialize ORC structs to Spark rows. + */ +class OrcDeserializer( + dataSchema: StructType, + requiredSchema: StructType, + requestedColIds: Array[Int]) { + + private val resultRow = new SpecificInternalRow(requiredSchema.map(_.dataType)) + + private val fieldWriters: Array[WritableComparable[_] => Unit] = { + requiredSchema.zipWithIndex + // The value of missing columns are always null, do not need writers. + .filterNot { case (_, index) => requestedColIds(index) == -1 } + .map { case (f, index) => + val writer = newWriter(f.dataType, new RowUpdater(resultRow)) + (value: WritableComparable[_]) => writer(index, value) + }.toArray + } + + private val validColIds = requestedColIds.filterNot(_ == -1) + + def deserialize(orcStruct: OrcStruct): InternalRow = { + var i = 0 + while (i < validColIds.length) { + val value = orcStruct.getFieldValue(validColIds(i)) + if (value == null) { + resultRow.setNullAt(i) + } else { + fieldWriters(i)(value) + } + i += 1 + } + resultRow + } + + /** + * Creates a writer to write ORC values to Catalyst data structure at the given ordinal. 
+ */ + private def newWriter( + dataType: DataType, updater: CatalystDataUpdater): (Int, WritableComparable[_]) => Unit = + dataType match { + case NullType => (ordinal, _) => + updater.setNullAt(ordinal) + + case BooleanType => (ordinal, value) => + updater.setBoolean(ordinal, value.asInstanceOf[BooleanWritable].get) + + case ByteType => (ordinal, value) => + updater.setByte(ordinal, value.asInstanceOf[ByteWritable].get) + + case ShortType => (ordinal, value) => + updater.setShort(ordinal, value.asInstanceOf[ShortWritable].get) + + case IntegerType => (ordinal, value) => + updater.setInt(ordinal, value.asInstanceOf[IntWritable].get) + + case LongType => (ordinal, value) => + updater.setLong(ordinal, value.asInstanceOf[LongWritable].get) + + case FloatType => (ordinal, value) => + updater.setFloat(ordinal, value.asInstanceOf[FloatWritable].get) + + case DoubleType => (ordinal, value) => + updater.setDouble(ordinal, value.asInstanceOf[DoubleWritable].get) + + case StringType => (ordinal, value) => + updater.set(ordinal, UTF8String.fromBytes(value.asInstanceOf[Text].copyBytes)) + + case BinaryType => (ordinal, value) => + val binary = value.asInstanceOf[BytesWritable] + val bytes = new Array[Byte](binary.getLength) + System.arraycopy(binary.getBytes, 0, bytes, 0, binary.getLength) + updater.set(ordinal, bytes) + + case DateType => (ordinal, value) => + updater.setInt(ordinal, DateTimeUtils.fromJavaDate(value.asInstanceOf[DateWritable].get)) + + case TimestampType => (ordinal, value) => + updater.setLong(ordinal, DateTimeUtils.fromJavaTimestamp(value.asInstanceOf[OrcTimestamp])) + + case DecimalType.Fixed(precision, scale) => (ordinal, value) => + val decimal = value.asInstanceOf[HiveDecimalWritable].getHiveDecimal() + val v = Decimal(decimal.bigDecimalValue, decimal.precision(), decimal.scale()) + v.changePrecision(precision, scale) + updater.set(ordinal, v) + + case st: StructType => (ordinal, value) => + val result = new SpecificInternalRow(st) + val fieldUpdater = new RowUpdater(result) + val fieldConverters = st.map(_.dataType).map { dt => + newWriter(dt, fieldUpdater) + }.toArray + val orcStruct = value.asInstanceOf[OrcStruct] + + var i = 0 + while (i < st.length) { + val value = orcStruct.getFieldValue(i) + if (value == null) { + result.setNullAt(i) + } else { + fieldConverters(i)(i, value) + } + i += 1 + } + + updater.set(ordinal, result) + + case ArrayType(elementType, _) => (ordinal, value) => + val orcArray = value.asInstanceOf[OrcList[WritableComparable[_]]] + val length = orcArray.size() + val result = createArrayData(elementType, length) + val elementUpdater = new ArrayDataUpdater(result) + val elementConverter = newWriter(elementType, elementUpdater) + + var i = 0 + while (i < length) { + val value = orcArray.get(i) + if (value == null) { + result.setNullAt(i) + } else { + elementConverter(i, value) + } + i += 1 + } + + updater.set(ordinal, result) + + case MapType(keyType, valueType, _) => (ordinal, value) => + val orcMap = value.asInstanceOf[OrcMap[WritableComparable[_], WritableComparable[_]]] + val length = orcMap.size() + val keyArray = createArrayData(keyType, length) + val keyUpdater = new ArrayDataUpdater(keyArray) + val keyConverter = newWriter(keyType, keyUpdater) + val valueArray = createArrayData(valueType, length) + val valueUpdater = new ArrayDataUpdater(valueArray) + val valueConverter = newWriter(valueType, valueUpdater) + + var i = 0 + val it = orcMap.entrySet().iterator() + while (it.hasNext) { + val entry = it.next() + keyConverter(i, entry.getKey) + val 
value = entry.getValue + if (value == null) { + valueArray.setNullAt(i) + } else { + valueConverter(i, value) + } + i += 1 + } + + updater.set(ordinal, new ArrayBasedMapData(keyArray, valueArray)) + + case udt: UserDefinedType[_] => newWriter(udt.sqlType, updater) + + case _ => + throw new UnsupportedOperationException(s"$dataType is not supported yet.") + } + + private def createArrayData(elementType: DataType, length: Int): ArrayData = elementType match { + case BooleanType => UnsafeArrayData.fromPrimitiveArray(new Array[Boolean](length)) + case ByteType => UnsafeArrayData.fromPrimitiveArray(new Array[Byte](length)) + case ShortType => UnsafeArrayData.fromPrimitiveArray(new Array[Short](length)) + case IntegerType => UnsafeArrayData.fromPrimitiveArray(new Array[Int](length)) + case LongType => UnsafeArrayData.fromPrimitiveArray(new Array[Long](length)) + case FloatType => UnsafeArrayData.fromPrimitiveArray(new Array[Float](length)) + case DoubleType => UnsafeArrayData.fromPrimitiveArray(new Array[Double](length)) + case _ => new GenericArrayData(new Array[Any](length)) + } + + /** + * A base interface for updating values inside catalyst data structure like `InternalRow` and + * `ArrayData`. + */ + sealed trait CatalystDataUpdater { + def set(ordinal: Int, value: Any): Unit + + def setNullAt(ordinal: Int): Unit = set(ordinal, null) + def setBoolean(ordinal: Int, value: Boolean): Unit = set(ordinal, value) + def setByte(ordinal: Int, value: Byte): Unit = set(ordinal, value) + def setShort(ordinal: Int, value: Short): Unit = set(ordinal, value) + def setInt(ordinal: Int, value: Int): Unit = set(ordinal, value) + def setLong(ordinal: Int, value: Long): Unit = set(ordinal, value) + def setDouble(ordinal: Int, value: Double): Unit = set(ordinal, value) + def setFloat(ordinal: Int, value: Float): Unit = set(ordinal, value) + } + + final class RowUpdater(row: InternalRow) extends CatalystDataUpdater { + override def setNullAt(ordinal: Int): Unit = row.setNullAt(ordinal) + override def set(ordinal: Int, value: Any): Unit = row.update(ordinal, value) + + override def setBoolean(ordinal: Int, value: Boolean): Unit = row.setBoolean(ordinal, value) + override def setByte(ordinal: Int, value: Byte): Unit = row.setByte(ordinal, value) + override def setShort(ordinal: Int, value: Short): Unit = row.setShort(ordinal, value) + override def setInt(ordinal: Int, value: Int): Unit = row.setInt(ordinal, value) + override def setLong(ordinal: Int, value: Long): Unit = row.setLong(ordinal, value) + override def setDouble(ordinal: Int, value: Double): Unit = row.setDouble(ordinal, value) + override def setFloat(ordinal: Int, value: Float): Unit = row.setFloat(ordinal, value) + } + + final class ArrayDataUpdater(array: ArrayData) extends CatalystDataUpdater { + override def setNullAt(ordinal: Int): Unit = array.setNullAt(ordinal) + override def set(ordinal: Int, value: Any): Unit = array.update(ordinal, value) + + override def setBoolean(ordinal: Int, value: Boolean): Unit = array.setBoolean(ordinal, value) + override def setByte(ordinal: Int, value: Byte): Unit = array.setByte(ordinal, value) + override def setShort(ordinal: Int, value: Short): Unit = array.setShort(ordinal, value) + override def setInt(ordinal: Int, value: Int): Unit = array.setInt(ordinal, value) + override def setLong(ordinal: Int, value: Long): Unit = array.setLong(ordinal, value) + override def setDouble(ordinal: Int, value: Double): Unit = array.setDouble(ordinal, value) + override def setFloat(ordinal: Int, value: Float): Unit = 
array.setFloat(ordinal, value) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index 215740e90fe84..75c42213db3c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -17,10 +17,29 @@ package org.apache.spark.sql.execution.datasources.orc -import org.apache.orc.TypeDescription +import java.io._ +import java.net.URI +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.mapred.JobConf +import org.apache.hadoop.mapreduce._ +import org.apache.hadoop.mapreduce.lib.input.FileSplit +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl +import org.apache.orc._ +import org.apache.orc.OrcConf.{COMPRESS, MAPRED_OUTPUT_SCHEMA} +import org.apache.orc.mapred.OrcStruct +import org.apache.orc.mapreduce._ + +import org.apache.spark.TaskContext import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.sources._ +import org.apache.spark.sql.types._ +import org.apache.spark.util.SerializableConfiguration private[sql] object OrcFileFormat { private def checkFieldName(name: String): Unit = { @@ -39,3 +58,119 @@ private[sql] object OrcFileFormat { names.foreach(checkFieldName) } } + +/** + * New ORC File Format based on Apache ORC. + */ +class OrcFileFormat + extends FileFormat + with DataSourceRegister + with Serializable { + + override def shortName(): String = "orc" + + override def toString: String = "ORC" + + override def hashCode(): Int = getClass.hashCode() + + override def equals(other: Any): Boolean = other.isInstanceOf[OrcFileFormat] + + override def inferSchema( + sparkSession: SparkSession, + options: Map[String, String], + files: Seq[FileStatus]): Option[StructType] = { + OrcUtils.readSchema(sparkSession, files) + } + + override def prepareWrite( + sparkSession: SparkSession, + job: Job, + options: Map[String, String], + dataSchema: StructType): OutputWriterFactory = { + val orcOptions = new OrcOptions(options, sparkSession.sessionState.conf) + + val conf = job.getConfiguration + + conf.set(MAPRED_OUTPUT_SCHEMA.getAttribute, dataSchema.catalogString) + + conf.set(COMPRESS.getAttribute, orcOptions.compressionCodec) + + conf.asInstanceOf[JobConf] + .setOutputFormat(classOf[org.apache.orc.mapred.OrcOutputFormat[OrcStruct]]) + + new OutputWriterFactory { + override def newInstance( + path: String, + dataSchema: StructType, + context: TaskAttemptContext): OutputWriter = { + new OrcOutputWriter(path, dataSchema, context) + } + + override def getFileExtension(context: TaskAttemptContext): String = { + val compressionExtension: String = { + val name = context.getConfiguration.get(COMPRESS.getAttribute) + OrcUtils.extensionsForCompressionCodecNames.getOrElse(name, "") + } + + compressionExtension + ".orc" + } + } + } + + override def isSplitable( + sparkSession: SparkSession, + options: Map[String, String], + path: Path): Boolean = { + true + } + + override def buildReader( + sparkSession: SparkSession, + dataSchema: StructType, + partitionSchema: StructType, + requiredSchema: 
StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration): (PartitionedFile) => Iterator[InternalRow] = { + if (sparkSession.sessionState.conf.orcFilterPushDown) { + OrcFilters.createFilter(dataSchema, filters).foreach { f => + OrcInputFormat.setSearchArgument(hadoopConf, f, dataSchema.fieldNames) + } + } + + val broadcastedConf = + sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + val isCaseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis + + (file: PartitionedFile) => { + val conf = broadcastedConf.value.value + + val requestedColIdsOrEmptyFile = OrcUtils.requestedColumnIds( + isCaseSensitive, dataSchema, requiredSchema, new Path(new URI(file.filePath)), conf) + + if (requestedColIdsOrEmptyFile.isEmpty) { + Iterator.empty + } else { + val requestedColIds = requestedColIdsOrEmptyFile.get + assert(requestedColIds.length == requiredSchema.length, + "[BUG] requested column IDs do not match required schema") + conf.set(OrcConf.INCLUDE_COLUMNS.getAttribute, + requestedColIds.filter(_ != -1).sorted.mkString(",")) + + val fileSplit = + new FileSplit(new Path(new URI(file.filePath)), file.start, file.length, Array.empty) + val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) + val taskAttemptContext = new TaskAttemptContextImpl(conf, attemptId) + + val orcRecordReader = new OrcInputFormat[OrcStruct] + .createRecordReader(fileSplit, taskAttemptContext) + val iter = new RecordReaderIterator[OrcStruct](orcRecordReader) + Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) + + val unsafeProjection = UnsafeProjection.create(requiredSchema) + val deserializer = new OrcDeserializer(dataSchema, requiredSchema, requestedColIds) + iter.map(value => unsafeProjection(deserializer.deserialize(value))) + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala new file mode 100644 index 0000000000000..cec256cc1b498 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala @@ -0,0 +1,210 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.orc + +import org.apache.orc.storage.ql.io.sarg.{PredicateLeaf, SearchArgument, SearchArgumentFactory} +import org.apache.orc.storage.ql.io.sarg.SearchArgument.Builder +import org.apache.orc.storage.serde2.io.HiveDecimalWritable + +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types._ + +/** + * Helper object for building ORC `SearchArgument`s, which are used for ORC predicate push-down. 
+ * + * Due to limitation of ORC `SearchArgument` builder, we had to end up with a pretty weird double- + * checking pattern when converting `And`/`Or`/`Not` filters. + * + * An ORC `SearchArgument` must be built in one pass using a single builder. For example, you can't + * build `a = 1` and `b = 2` first, and then combine them into `a = 1 AND b = 2`. This is quite + * different from the cases in Spark SQL or Parquet, where complex filters can be easily built using + * existing simpler ones. + * + * The annoying part is that, `SearchArgument` builder methods like `startAnd()`, `startOr()`, and + * `startNot()` mutate internal state of the builder instance. This forces us to translate all + * convertible filters with a single builder instance. However, before actually converting a filter, + * we've no idea whether it can be recognized by ORC or not. Thus, when an inconvertible filter is + * found, we may already end up with a builder whose internal state is inconsistent. + * + * For example, to convert an `And` filter with builder `b`, we call `b.startAnd()` first, and then + * try to convert its children. Say we convert `left` child successfully, but find that `right` + * child is inconvertible. Alas, `b.startAnd()` call can't be rolled back, and `b` is inconsistent + * now. + * + * The workaround employed here is that, for `And`/`Or`/`Not`, we first try to convert their + * children with brand new builders, and only do the actual conversion with the right builder + * instance when the children are proven to be convertible. + * + * P.S.: Hive seems to use `SearchArgument` together with `ExprNodeGenericFuncDesc` only. Usage of + * builder methods mentioned above can only be found in test code, where all tested filters are + * known to be convertible. + */ +private[orc] object OrcFilters { + + /** + * Create ORC filter as a SearchArgument instance. + */ + def createFilter(schema: StructType, filters: Seq[Filter]): Option[SearchArgument] = { + val dataTypeMap = schema.map(f => f.name -> f.dataType).toMap + + // First, tries to convert each filter individually to see whether it's convertible, and then + // collect all convertible ones to build the final `SearchArgument`. + val convertibleFilters = for { + filter <- filters + _ <- buildSearchArgument(dataTypeMap, filter, SearchArgumentFactory.newBuilder()) + } yield filter + + for { + // Combines all convertible filters using `And` to produce a single conjunction + conjunction <- convertibleFilters.reduceOption(org.apache.spark.sql.sources.And) + // Then tries to build a single ORC `SearchArgument` for the conjunction predicate + builder <- buildSearchArgument(dataTypeMap, conjunction, SearchArgumentFactory.newBuilder()) + } yield builder.build() + } + + /** + * Return true if this is a searchable type in ORC. + * Both CharType and VarcharType are cleaned at AstBuilder. + */ + private def isSearchableType(dataType: DataType) = dataType match { + // TODO: SPARK-21787 Support for pushing down filters for DateType in ORC + case BinaryType | DateType => false + case _: AtomicType => true + case _ => false + } + + /** + * Get PredicateLeafType which is corresponding to the given DataType. 
+ */ + private def getPredicateLeafType(dataType: DataType) = dataType match { + case BooleanType => PredicateLeaf.Type.BOOLEAN + case ByteType | ShortType | IntegerType | LongType => PredicateLeaf.Type.LONG + case FloatType | DoubleType => PredicateLeaf.Type.FLOAT + case StringType => PredicateLeaf.Type.STRING + case DateType => PredicateLeaf.Type.DATE + case TimestampType => PredicateLeaf.Type.TIMESTAMP + case _: DecimalType => PredicateLeaf.Type.DECIMAL + case _ => throw new UnsupportedOperationException(s"DataType: $dataType") + } + + /** + * Cast literal values for filters. + * + * We need to cast to long because ORC raises exceptions + * at 'checkLiteralType' of SearchArgumentImpl.java. + */ + private def castLiteralValue(value: Any, dataType: DataType): Any = dataType match { + case ByteType | ShortType | IntegerType | LongType => + value.asInstanceOf[Number].longValue + case FloatType | DoubleType => + value.asInstanceOf[Number].doubleValue() + case _: DecimalType => + val decimal = value.asInstanceOf[java.math.BigDecimal] + val decimalWritable = new HiveDecimalWritable(decimal.longValue) + decimalWritable.mutateEnforcePrecisionScale(decimal.precision, decimal.scale) + decimalWritable + case _ => value + } + + /** + * Build a SearchArgument and return the builder so far. + */ + private def buildSearchArgument( + dataTypeMap: Map[String, DataType], + expression: Filter, + builder: Builder): Option[Builder] = { + def newBuilder = SearchArgumentFactory.newBuilder() + + def getType(attribute: String): PredicateLeaf.Type = + getPredicateLeafType(dataTypeMap(attribute)) + + import org.apache.spark.sql.sources._ + + expression match { + case And(left, right) => + // At here, it is not safe to just convert one side if we do not understand the + // other side. Here is an example used to explain the reason. + // Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to + // convert b in ('1'). If we only convert a = 2, we will end up with a filter + // NOT(a = 2), which will generate wrong results. + // Pushing one side of AND down is only safe to do at the top level. + // You can see ParquetRelation's initializeLocalJobFunc method as an example. + for { + _ <- buildSearchArgument(dataTypeMap, left, newBuilder) + _ <- buildSearchArgument(dataTypeMap, right, newBuilder) + lhs <- buildSearchArgument(dataTypeMap, left, builder.startAnd()) + rhs <- buildSearchArgument(dataTypeMap, right, lhs) + } yield rhs.end() + + case Or(left, right) => + for { + _ <- buildSearchArgument(dataTypeMap, left, newBuilder) + _ <- buildSearchArgument(dataTypeMap, right, newBuilder) + lhs <- buildSearchArgument(dataTypeMap, left, builder.startOr()) + rhs <- buildSearchArgument(dataTypeMap, right, lhs) + } yield rhs.end() + + case Not(child) => + for { + _ <- buildSearchArgument(dataTypeMap, child, newBuilder) + negate <- buildSearchArgument(dataTypeMap, child, builder.startNot()) + } yield negate.end() + + // NOTE: For all case branches dealing with leaf predicates below, the additional `startAnd()` + // call is mandatory. ORC `SearchArgument` builder requires that all leaf predicates must be + // wrapped by a "parent" predicate (`And`, `Or`, or `Not`). 
+ + case EqualTo(attribute, value) if isSearchableType(dataTypeMap(attribute)) => + val castedValue = castLiteralValue(value, dataTypeMap(attribute)) + Some(builder.startAnd().equals(attribute, getType(attribute), castedValue).end()) + + case EqualNullSafe(attribute, value) if isSearchableType(dataTypeMap(attribute)) => + val castedValue = castLiteralValue(value, dataTypeMap(attribute)) + Some(builder.startAnd().nullSafeEquals(attribute, getType(attribute), castedValue).end()) + + case LessThan(attribute, value) if isSearchableType(dataTypeMap(attribute)) => + val castedValue = castLiteralValue(value, dataTypeMap(attribute)) + Some(builder.startAnd().lessThan(attribute, getType(attribute), castedValue).end()) + + case LessThanOrEqual(attribute, value) if isSearchableType(dataTypeMap(attribute)) => + val castedValue = castLiteralValue(value, dataTypeMap(attribute)) + Some(builder.startAnd().lessThanEquals(attribute, getType(attribute), castedValue).end()) + + case GreaterThan(attribute, value) if isSearchableType(dataTypeMap(attribute)) => + val castedValue = castLiteralValue(value, dataTypeMap(attribute)) + Some(builder.startNot().lessThanEquals(attribute, getType(attribute), castedValue).end()) + + case GreaterThanOrEqual(attribute, value) if isSearchableType(dataTypeMap(attribute)) => + val castedValue = castLiteralValue(value, dataTypeMap(attribute)) + Some(builder.startNot().lessThan(attribute, getType(attribute), castedValue).end()) + + case IsNull(attribute) if isSearchableType(dataTypeMap(attribute)) => + Some(builder.startAnd().isNull(attribute, getType(attribute)).end()) + + case IsNotNull(attribute) if isSearchableType(dataTypeMap(attribute)) => + Some(builder.startNot().isNull(attribute, getType(attribute)).end()) + + case In(attribute, values) if isSearchableType(dataTypeMap(attribute)) => + val castedValues = values.map(v => castLiteralValue(v, dataTypeMap(attribute))) + Some(builder.startAnd().in(attribute, getType(attribute), + castedValues.map(_.asInstanceOf[AnyRef]): _*).end()) + + case _ => None + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOutputWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOutputWriter.scala new file mode 100644 index 0000000000000..84755bfa301f0 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcOutputWriter.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.orc + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.io.NullWritable +import org.apache.hadoop.mapreduce.TaskAttemptContext +import org.apache.orc.mapred.OrcStruct +import org.apache.orc.mapreduce.OrcOutputFormat + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.datasources.OutputWriter +import org.apache.spark.sql.types._ + +private[orc] class OrcOutputWriter( + path: String, + dataSchema: StructType, + context: TaskAttemptContext) + extends OutputWriter { + + private[this] val serializer = new OrcSerializer(dataSchema) + + private val recordWriter = { + new OrcOutputFormat[OrcStruct]() { + override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { + new Path(path) + } + }.getRecordWriter(context) + } + + override def write(row: InternalRow): Unit = { + recordWriter.write(NullWritable.get(), serializer.serialize(row)) + } + + override def close(): Unit = { + recordWriter.close(context) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala new file mode 100644 index 0000000000000..899af0750cadf --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcSerializer.scala @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.orc + +import org.apache.hadoop.io._ +import org.apache.orc.TypeDescription +import org.apache.orc.mapred.{OrcList, OrcMap, OrcStruct, OrcTimestamp} +import org.apache.orc.storage.common.`type`.HiveDecimal +import org.apache.orc.storage.serde2.io.{DateWritable, HiveDecimalWritable} + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.SpecializedGetters +import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.types._ + +/** + * A serializer to serialize Spark rows to ORC structs. + */ +class OrcSerializer(dataSchema: StructType) { + + private val result = createOrcValue(dataSchema).asInstanceOf[OrcStruct] + private val converters = dataSchema.map(_.dataType).map(newConverter(_)).toArray + + def serialize(row: InternalRow): OrcStruct = { + var i = 0 + while (i < converters.length) { + if (row.isNullAt(i)) { + result.setFieldValue(i, null) + } else { + result.setFieldValue(i, converters(i)(row, i)) + } + i += 1 + } + result + } + + private type Converter = (SpecializedGetters, Int) => WritableComparable[_] + + /** + * Creates a converter to convert Catalyst data at the given ordinal to ORC values. 
+ */ + private def newConverter( + dataType: DataType, + reuseObj: Boolean = true): Converter = dataType match { + case NullType => (getter, ordinal) => null + + case BooleanType => + if (reuseObj) { + val result = new BooleanWritable() + (getter, ordinal) => + result.set(getter.getBoolean(ordinal)) + result + } else { + (getter, ordinal) => new BooleanWritable(getter.getBoolean(ordinal)) + } + + case ByteType => + if (reuseObj) { + val result = new ByteWritable() + (getter, ordinal) => + result.set(getter.getByte(ordinal)) + result + } else { + (getter, ordinal) => new ByteWritable(getter.getByte(ordinal)) + } + + case ShortType => + if (reuseObj) { + val result = new ShortWritable() + (getter, ordinal) => + result.set(getter.getShort(ordinal)) + result + } else { + (getter, ordinal) => new ShortWritable(getter.getShort(ordinal)) + } + + case IntegerType => + if (reuseObj) { + val result = new IntWritable() + (getter, ordinal) => + result.set(getter.getInt(ordinal)) + result + } else { + (getter, ordinal) => new IntWritable(getter.getInt(ordinal)) + } + + + case LongType => + if (reuseObj) { + val result = new LongWritable() + (getter, ordinal) => + result.set(getter.getLong(ordinal)) + result + } else { + (getter, ordinal) => new LongWritable(getter.getLong(ordinal)) + } + + case FloatType => + if (reuseObj) { + val result = new FloatWritable() + (getter, ordinal) => + result.set(getter.getFloat(ordinal)) + result + } else { + (getter, ordinal) => new FloatWritable(getter.getFloat(ordinal)) + } + + case DoubleType => + if (reuseObj) { + val result = new DoubleWritable() + (getter, ordinal) => + result.set(getter.getDouble(ordinal)) + result + } else { + (getter, ordinal) => new DoubleWritable(getter.getDouble(ordinal)) + } + + + // Don't reuse the result object for string and binary as it would cause extra data copy. + case StringType => (getter, ordinal) => + new Text(getter.getUTF8String(ordinal).getBytes) + + case BinaryType => (getter, ordinal) => + new BytesWritable(getter.getBinary(ordinal)) + + case DateType => + if (reuseObj) { + val result = new DateWritable() + (getter, ordinal) => + result.set(getter.getInt(ordinal)) + result + } else { + (getter, ordinal) => new DateWritable(getter.getInt(ordinal)) + } + + // The following cases are already expensive, reusing object or not doesn't matter. + + case TimestampType => (getter, ordinal) => + val ts = DateTimeUtils.toJavaTimestamp(getter.getLong(ordinal)) + val result = new OrcTimestamp(ts.getTime) + result.setNanos(ts.getNanos) + result + + case DecimalType.Fixed(precision, scale) => (getter, ordinal) => + val d = getter.getDecimal(ordinal, precision, scale) + new HiveDecimalWritable(HiveDecimal.create(d.toJavaBigDecimal)) + + case st: StructType => (getter, ordinal) => + val result = createOrcValue(st).asInstanceOf[OrcStruct] + val fieldConverters = st.map(_.dataType).map(newConverter(_)) + val numFields = st.length + val struct = getter.getStruct(ordinal, numFields) + var i = 0 + while (i < numFields) { + if (struct.isNullAt(i)) { + result.setFieldValue(i, null) + } else { + result.setFieldValue(i, fieldConverters(i)(struct, i)) + } + i += 1 + } + result + + case ArrayType(elementType, _) => (getter, ordinal) => + val result = createOrcValue(dataType).asInstanceOf[OrcList[WritableComparable[_]]] + // Need to put all converted values to a list, can't reuse object. 
+ val elementConverter = newConverter(elementType, reuseObj = false) + val array = getter.getArray(ordinal) + var i = 0 + while (i < array.numElements()) { + if (array.isNullAt(i)) { + result.add(null) + } else { + result.add(elementConverter(array, i)) + } + i += 1 + } + result + + case MapType(keyType, valueType, _) => (getter, ordinal) => + val result = createOrcValue(dataType) + .asInstanceOf[OrcMap[WritableComparable[_], WritableComparable[_]]] + // Need to put all converted values to a list, can't reuse object. + val keyConverter = newConverter(keyType, reuseObj = false) + val valueConverter = newConverter(valueType, reuseObj = false) + val map = getter.getMap(ordinal) + val keyArray = map.keyArray() + val valueArray = map.valueArray() + var i = 0 + while (i < map.numElements()) { + val key = keyConverter(keyArray, i) + if (valueArray.isNullAt(i)) { + result.put(key, null) + } else { + result.put(key, valueConverter(valueArray, i)) + } + i += 1 + } + result + + case udt: UserDefinedType[_] => newConverter(udt.sqlType) + + case _ => + throw new UnsupportedOperationException(s"$dataType is not supported yet.") + } + + /** + * Return a Orc value object for the given Spark schema. + */ + private def createOrcValue(dataType: DataType) = { + OrcStruct.createValue(TypeDescription.fromString(dataType.catalogString)) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala new file mode 100644 index 0000000000000..b03ee06d04a16 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.orc + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.orc.{OrcFile, TypeDescription} + +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.analysis.{caseInsensitiveResolution, caseSensitiveResolution} +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.types._ + +object OrcUtils extends Logging { + + // The extensions for ORC compression codecs + val extensionsForCompressionCodecNames = Map( + "NONE" -> "", + "SNAPPY" -> ".snappy", + "ZLIB" -> ".zlib", + "LZO" -> ".lzo") + + def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { + val origPath = new Path(pathStr) + val fs = origPath.getFileSystem(conf) + val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) + .filterNot(_.isDirectory) + .map(_.getPath) + .filterNot(_.getName.startsWith("_")) + .filterNot(_.getName.startsWith(".")) + paths + } + + def readSchema(file: Path, conf: Configuration): Option[TypeDescription] = { + val fs = file.getFileSystem(conf) + val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) + val reader = OrcFile.createReader(file, readerOptions) + val schema = reader.getSchema + if (schema.getFieldNames.size == 0) { + None + } else { + Some(schema) + } + } + + def readSchema(sparkSession: SparkSession, files: Seq[FileStatus]) + : Option[StructType] = { + val conf = sparkSession.sessionState.newHadoopConf() + // TODO: We need to support merge schema. Please see SPARK-11412. + files.map(_.getPath).flatMap(readSchema(_, conf)).headOption.map { schema => + logDebug(s"Reading schema from file $files, got Hive schema string: $schema") + CatalystSqlParser.parseDataType(schema.toString).asInstanceOf[StructType] + } + } + + /** + * Returns the requested column ids from the given ORC file. Column id can be -1, which means the + * requested column doesn't exist in the ORC file. Returns None if the given ORC file is empty. + */ + def requestedColumnIds( + isCaseSensitive: Boolean, + dataSchema: StructType, + requiredSchema: StructType, + file: Path, + conf: Configuration): Option[Array[Int]] = { + val fs = file.getFileSystem(conf) + val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) + val reader = OrcFile.createReader(file, readerOptions) + val orcFieldNames = reader.getSchema.getFieldNames.asScala + if (orcFieldNames.isEmpty) { + // SPARK-8501: Some old empty ORC files always have an empty schema stored in their footer. + None + } else { + if (orcFieldNames.forall(_.startsWith("_col"))) { + // This is a ORC file written by Hive, no field names in the physical schema, assume the + // physical schema maps to the data scheme by index. 
+ assert(orcFieldNames.length <= dataSchema.length, "The given data schema " + + s"${dataSchema.simpleString} has less fields than the actual ORC physical schema, " + + "no idea which columns were dropped, fail to read.") + Some(requiredSchema.fieldNames.map { name => + val index = dataSchema.fieldIndex(name) + if (index < orcFieldNames.length) { + index + } else { + -1 + } + }) + } else { + val resolver = if (isCaseSensitive) caseSensitiveResolution else caseInsensitiveResolution + Some(requiredSchema.fieldNames.map { name => orcFieldNames.indexWhere(resolver(_, name)) }) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5d0bba69daca1..31d9b909ad463 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2757,4 +2757,29 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } + + // Only New OrcFileFormat supports this + Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName, + "parquet").foreach { format => + test(s"SPARK-15474 Write and read back non-emtpy schema with empty dataframe - $format") { + withTempPath { file => + val path = file.getCanonicalPath + val emptyDf = Seq((true, 1, "str")).toDF.limit(0) + emptyDf.write.format(format).save(path) + + val df = spark.read.format(format).load(path) + assert(df.schema.sameType(emptyDf.schema)) + checkAnswer(df, emptyDf) + } + } + } + + test("SPARK-21791 ORC should support column names with dot") { + val orc = classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName + withTempDir { dir => + val path = new File(dir, "orc").getCanonicalPath + Seq(Some(1), None).toDF("col.dots").write.format(orc).save(path) + assert(spark.read.format(orc).load(path).collect().length == 2) + } + } } From 2c16267f7ca392d717942a7654e90db60ba60770 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Sun, 3 Dec 2017 22:56:03 +0800 Subject: [PATCH 028/356] [SPARK-22669][SQL] Avoid unnecessary function calls in code generation ## What changes were proposed in this pull request? In many parts of the codebase for code generation, we are splitting the code to avoid exceptions due to the 64KB method size limit. This is generating a lot of methods which are called every time, even though sometime this is not needed. As pointed out here: https://github.com/apache/spark/pull/19752#discussion_r153081547, this is a not negligible overhead which can be avoided. The PR applies the same approach used in #19752 also to the other places where this was feasible. ## How was this patch tested? existing UTs. Author: Marco Gaido Closes #19860 from mgaido91/SPARK-22669. 
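
To make the intent of the codegen change concrete, here is a minimal, self-contained Scala sketch (not Spark code; all names are illustrative) of the short-circuit shape the generated Java now follows: each group of split expressions runs inside a `do { ... } while (false)` block, and a `continue` skips the remaining split functions once a result is known, instead of unconditionally calling every generated method.

```scala
// Hypothetical sketch of the short-circuit pattern; `splits` stands in for the
// generated coalesce_N(InternalRow) helper methods produced by splitExpressions.
object CoalesceShortCircuitSketch {
  // Returns the first defined value, evaluating as few "split functions" as possible.
  def evaluate(splits: Seq[() => Option[Int]]): Option[Int] = {
    var result: Option[Int] = None
    val it = splits.iterator
    // Mirrors the generated do { ... } while (false) loop: once a value is found,
    // the remaining split functions are never invoked.
    while (result.isEmpty && it.hasNext) {
      result = it.next()()
    }
    result
  }

  def main(args: Array[String]): Unit = {
    var calls = 0
    val splits = Seq(
      () => { calls += 1; None },      // first child evaluates to null
      () => { calls += 1; Some(42) },  // second child produces the answer
      () => { calls += 1; Some(7) }    // skipped entirely after this patch
    )
    println(evaluate(splits)) // Some(42)
    println(calls)            // 2 -- the third split function is never called
  }
}
```

This corresponds to the `foldFunctions` bodies in the diff below, which emit a `continue` after each split-method call as soon as the result is determined.
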
--- .../expressions/nullExpressions.scala | 141 ++++++++++++------ .../sql/catalyst/expressions/predicates.scala | 68 ++++++--- 2 files changed, 140 insertions(+), 69 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala index 173e171910b69..3b52a0efd404a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala @@ -75,23 +75,51 @@ case class Coalesce(children: Seq[Expression]) extends Expression { ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.isNull) ctx.addMutableState(ctx.javaType(dataType), ev.value) + // all the evals are meant to be in a do { ... } while (false); loop val evals = children.map { e => val eval = e.genCode(ctx) s""" - if (${ev.isNull}) { - ${eval.code} - if (!${eval.isNull}) { - ${ev.isNull} = false; - ${ev.value} = ${eval.value}; - } - } - """ + |${eval.code} + |if (!${eval.isNull}) { + | ${ev.isNull} = false; + | ${ev.value} = ${eval.value}; + | continue; + |} + """.stripMargin } + val code = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { + evals.mkString("\n") + } else { + ctx.splitExpressions(evals, "coalesce", + ("InternalRow", ctx.INPUT_ROW) :: Nil, + makeSplitFunction = { + func => + s""" + |do { + | $func + |} while (false); + """.stripMargin + }, + foldFunctions = { funcCalls => + funcCalls.map { funcCall => + s""" + |$funcCall; + |if (!${ev.isNull}) { + | continue; + |} + """.stripMargin + }.mkString + }) + } - ev.copy(code = s""" - ${ev.isNull} = true; - ${ev.value} = ${ctx.defaultValue(dataType)}; - ${ctx.splitExpressions(evals)}""") + ev.copy(code = + s""" + |${ev.isNull} = true; + |${ev.value} = ${ctx.defaultValue(dataType)}; + |do { + | $code + |} while (false); + """.stripMargin) } } @@ -358,53 +386,70 @@ case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val nonnull = ctx.freshName("nonnull") + // all evals are meant to be inside a do { ... 
} while (false); loop val evals = children.map { e => val eval = e.genCode(ctx) e.dataType match { case DoubleType | FloatType => s""" - if ($nonnull < $n) { - ${eval.code} - if (!${eval.isNull} && !Double.isNaN(${eval.value})) { - $nonnull += 1; - } - } - """ + |if ($nonnull < $n) { + | ${eval.code} + | if (!${eval.isNull} && !Double.isNaN(${eval.value})) { + | $nonnull += 1; + | } + |} else { + | continue; + |} + """.stripMargin case _ => s""" - if ($nonnull < $n) { - ${eval.code} - if (!${eval.isNull}) { - $nonnull += 1; - } - } - """ + |if ($nonnull < $n) { + | ${eval.code} + | if (!${eval.isNull}) { + | $nonnull += 1; + | } + |} else { + | continue; + |} + """.stripMargin } } val code = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { - evals.mkString("\n") - } else { - ctx.splitExpressions( - expressions = evals, - funcName = "atLeastNNonNulls", - arguments = ("InternalRow", ctx.INPUT_ROW) :: ("int", nonnull) :: Nil, - returnType = "int", - makeSplitFunction = { body => - s""" - $body - return $nonnull; - """ - }, - foldFunctions = { funcCalls => - funcCalls.map(funcCall => s"$nonnull = $funcCall;").mkString("\n") - } - ) - } + evals.mkString("\n") + } else { + ctx.splitExpressions( + expressions = evals, + funcName = "atLeastNNonNulls", + arguments = ("InternalRow", ctx.INPUT_ROW) :: (ctx.JAVA_INT, nonnull) :: Nil, + returnType = ctx.JAVA_INT, + makeSplitFunction = { body => + s""" + |do { + | $body + |} while (false); + |return $nonnull; + """.stripMargin + }, + foldFunctions = { funcCalls => + funcCalls.map(funcCall => + s""" + |$nonnull = $funcCall; + |if ($nonnull >= $n) { + | continue; + |} + """.stripMargin).mkString("\n") + } + ) + } - ev.copy(code = s""" - int $nonnull = 0; - $code - boolean ${ev.value} = $nonnull >= $n;""", isNull = "false") + ev.copy(code = + s""" + |${ctx.JAVA_INT} $nonnull = 0; + |do { + | $code + |} while (false); + |${ctx.JAVA_BOOLEAN} ${ev.value} = $nonnull >= $n; + """.stripMargin, isNull = "false") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 1aaaaf1db48d1..75cc9b3bd8045 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -234,36 +234,62 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val javaDataType = ctx.javaType(value.dataType) val valueGen = value.genCode(ctx) val listGen = list.map(_.genCode(ctx)) ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.value) ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.isNull) val valueArg = ctx.freshName("valueArg") + // All the blocks are meant to be inside a do { ... } while (false); loop. + // The evaluation of variables can be stopped when we find a matching value. 
val listCode = listGen.map(x => s""" - if (!${ev.value}) { - ${x.code} - if (${x.isNull}) { - ${ev.isNull} = true; - } else if (${ctx.genEqual(value.dataType, valueArg, x.value)}) { - ${ev.isNull} = false; - ${ev.value} = true; + |${x.code} + |if (${x.isNull}) { + | ${ev.isNull} = true; + |} else if (${ctx.genEqual(value.dataType, valueArg, x.value)}) { + | ${ev.isNull} = false; + | ${ev.value} = true; + | continue; + |} + """.stripMargin) + val code = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { + listCode.mkString("\n") + } else { + ctx.splitExpressions( + expressions = listCode, + funcName = "valueIn", + arguments = ("InternalRow", ctx.INPUT_ROW) :: (javaDataType, valueArg) :: Nil, + makeSplitFunction = { body => + s""" + |do { + | $body + |} while (false); + """.stripMargin + }, + foldFunctions = { funcCalls => + funcCalls.map(funcCall => + s""" + |$funcCall; + |if (${ev.value}) { + | continue; + |} + """.stripMargin).mkString("\n") } - } - """) - val listCodes = ctx.splitExpressions( - expressions = listCode, - funcName = "valueIn", - extraArguments = (ctx.javaType(value.dataType), valueArg) :: Nil) - ev.copy(code = s""" - ${valueGen.code} - ${ev.value} = false; - ${ev.isNull} = ${valueGen.isNull}; - if (!${ev.isNull}) { - ${ctx.javaType(value.dataType)} $valueArg = ${valueGen.value}; - $listCodes + ) } - """) + ev.copy(code = + s""" + |${valueGen.code} + |${ev.value} = false; + |${ev.isNull} = ${valueGen.isNull}; + |if (!${ev.isNull}) { + | $javaDataType $valueArg = ${valueGen.value}; + | do { + | $code + | } while (false); + |} + """.stripMargin) } override def sql: String = { From dff440f1ecdbce65cad377c44fd2abfd4eff9b44 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sun, 3 Dec 2017 23:05:31 +0800 Subject: [PATCH 029/356] [SPARK-22626][SQL] deals with wrong Hive's statistics (zero rowCount) This pr to ensure that the Hive's statistics `totalSize` (or `rawDataSize`) > 0, `rowCount` also must be > 0. Otherwise may cause OOM when CBO is enabled. unit tests Author: Yuming Wang Closes #19831 from wangyum/SPARK-22626. --- .../sql/hive/client/HiveClientImpl.scala | 6 +++--- .../spark/sql/hive/StatisticsSuite.scala | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 47ce6ba83866c..77e836003b39f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -418,7 +418,7 @@ private[hive] class HiveClientImpl( // Note that this statistics could be overridden by Spark's statistics if that's available. val totalSize = properties.get(StatsSetupConst.TOTAL_SIZE).map(BigInt(_)) val rawDataSize = properties.get(StatsSetupConst.RAW_DATA_SIZE).map(BigInt(_)) - val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_)).filter(_ >= 0) + val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_)) // TODO: check if this estimate is valid for tables after partition pruning. // NOTE: getting `totalSize` directly from params is kind of hacky, but this should be // relatively cheap if parameters for the table are populated into the metastore. @@ -430,9 +430,9 @@ private[hive] class HiveClientImpl( // so when `totalSize` is zero, use `rawDataSize` instead. When `rawDataSize` is also zero, // return None. 
Later, we will use the other ways to estimate the statistics. if (totalSize.isDefined && totalSize.get > 0L) { - Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount = rowCount)) + Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount = rowCount.filter(_ > 0))) } else if (rawDataSize.isDefined && rawDataSize.get > 0) { - Some(CatalogStatistics(sizeInBytes = rawDataSize.get, rowCount = rowCount)) + Some(CatalogStatistics(sizeInBytes = rawDataSize.get, rowCount = rowCount.filter(_ > 0))) } else { // TODO: still fill the rowCount even if sizeInBytes is empty. Might break anything? None diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 0cdd9305c6b6f..ee027e5308265 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -1360,4 +1360,23 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } } + + test("Deals with wrong Hive's statistics (zero rowCount)") { + withTable("maybe_big") { + sql("CREATE TABLE maybe_big (c1 bigint)" + + "TBLPROPERTIES ('numRows'='0', 'rawDataSize'='60000000000', 'totalSize'='8000000000000')") + + val relation = spark.table("maybe_big").queryExecution.analyzed.children.head + .asInstanceOf[HiveTableRelation] + + val properties = relation.tableMeta.ignoredProperties + assert(properties("totalSize").toLong > 0) + assert(properties("rawDataSize").toLong > 0) + assert(properties("numRows").toLong == 0) + + assert(relation.stats.sizeInBytes > 0) + // May be cause OOM if rowCount == 0 when enables CBO, see SPARK-22626 for details. + assert(relation.stats.rowCount.isEmpty) + } + } } From 4131ad03f4d2dfcfb1e166e5dfdf0752479f7340 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sun, 3 Dec 2017 23:52:37 -0800 Subject: [PATCH 030/356] [SPARK-22489][DOC][FOLLOWUP] Update broadcast behavior changes in migration section ## What changes were proposed in this pull request? Update broadcast behavior changes in migration section. ## How was this patch tested? N/A Author: Yuming Wang Closes #19858 from wangyum/SPARK-22489-migration. --- docs/sql-programming-guide.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index a1b9c3bbfd059..b76be9132dd03 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1776,6 +1776,8 @@ options. Note that, for DecimalType(38,0)*, the table above intentionally does not cover all other combinations of scales and precisions because currently we only infer decimal type like `BigInteger`/`BigInt`. For example, 1.1 is inferred as double type. - In PySpark, now we need Pandas 0.19.2 or upper if you want to use Pandas related functionalities, such as `toPandas`, `createDataFrame` from Pandas DataFrame, etc. - In PySpark, the behavior of timestamp values for Pandas related functionalities was changed to respect session timezone. If you want to use the old behavior, you need to set a configuration `spark.sql.execution.pandas.respectSessionTimeZone` to `False`. See [SPARK-22395](https://issues.apache.org/jira/browse/SPARK-22395) for details. + + - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer to broadcasting the table that is explicitly specified in a broadcast hint. 
For details, see the section [Broadcast Hint](#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). ## Upgrading From Spark SQL 2.1 to 2.2 From 3927bb9b460d2d944ecf3c8552d71e8a25d29655 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Mon, 4 Dec 2017 11:07:27 -0600 Subject: [PATCH 031/356] [SPARK-22473][FOLLOWUP][TEST] Remove deprecated Date functions ## What changes were proposed in this pull request? #19696 replaced the deprecated usages for `Date` and `Waiter`, but a few methods were missed. The PR fixes the forgotten deprecated usages. ## How was this patch tested? existing UTs Author: Marco Gaido Closes #19875 from mgaido91/SPARK-22473_FOLLOWUP. --- .../spark/sql/catalyst/expressions/PredicateSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 865092a659f26..0079e4e8d6f74 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -291,26 +291,26 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { private val udt = new ExamplePointUDT private val smallValues = - Seq(1.toByte, 1.toShort, 1, 1L, Decimal(1), Array(1.toByte), new Date(2000, 1, 1), + Seq(1.toByte, 1.toShort, 1, 1L, Decimal(1), Array(1.toByte), Date.valueOf("2000-01-01"), new Timestamp(1), "a", 1f, 1d, 0f, 0d, false, Array(1L, 2L)) .map(Literal(_)) ++ Seq(Literal.create(MyStruct(1L, "b")), Literal.create(MyStruct2(MyStruct(1L, "a"), Array(1, 1))), Literal.create(ArrayData.toArrayData(Array(1.0, 2.0)), udt)) private val largeValues = - Seq(2.toByte, 2.toShort, 2, 2L, Decimal(2), Array(2.toByte), new Date(2000, 1, 2), + Seq(2.toByte, 2.toShort, 2, 2L, Decimal(2), Array(2.toByte), Date.valueOf("2000-01-02"), new Timestamp(2), "b", 2f, 2d, Float.NaN, Double.NaN, true, Array(2L, 1L)) .map(Literal(_)) ++ Seq(Literal.create(MyStruct(2L, "b")), Literal.create(MyStruct2(MyStruct(1L, "a"), Array(1, 2))), Literal.create(ArrayData.toArrayData(Array(1.0, 3.0)), udt)) private val equalValues1 = - Seq(1.toByte, 1.toShort, 1, 1L, Decimal(1), Array(1.toByte), new Date(2000, 1, 1), + Seq(1.toByte, 1.toShort, 1, 1L, Decimal(1), Array(1.toByte), Date.valueOf("2000-01-01"), new Timestamp(1), "a", 1f, 1d, Float.NaN, Double.NaN, true, Array(1L, 2L)) .map(Literal(_)) ++ Seq(Literal.create(MyStruct(1L, "b")), Literal.create(MyStruct2(MyStruct(1L, "a"), Array(1, 1))), Literal.create(ArrayData.toArrayData(Array(1.0, 2.0)), udt)) private val equalValues2 = - Seq(1.toByte, 1.toShort, 1, 1L, Decimal(1), Array(1.toByte), new Date(2000, 1, 1), + Seq(1.toByte, 1.toShort, 1, 1L, Decimal(1), Array(1.toByte), Date.valueOf("2000-01-01"), new Timestamp(1), "a", 1f, 1d, Float.NaN, Double.NaN, true, Array(1L, 2L)) .map(Literal(_)) ++ Seq(Literal.create(MyStruct(1L, "b")), Literal.create(MyStruct2(MyStruct(1L, "a"), Array(1, 1))), From f81401e1cb39f2d6049b79dc8d61305f3371276f Mon Sep 17 00:00:00 2001 From: Reza Safi Date: Mon, 4 Dec 2017 09:23:48 -0800 Subject: [PATCH 032/356] [SPARK-22162] Executors and the driver should use consistent JobIDs in the RDD commit protocol I have modified SparkHadoopWriter so that executors and the driver always use consistent JobIds during the hadoop commit. 
Before SPARK-18191, spark always used the rddId, it just incorrectly named the variable stageId. After SPARK-18191, it used the rddId as the jobId on the driver's side, and the stageId as the jobId on the executors' side. With this change executors and the driver will consistently uses rddId as the jobId. Also with this change, during the hadoop commit protocol spark uses actual stageId to check whether a stage can be committed unlike before that it was using executors' jobId to do this check. In addition to the existing unit tests, a test has been added to check whether executors and the driver are using the same JobId. The test failed before this change and passed after applying this fix. Author: Reza Safi Closes #19848 from rezasafi/stagerddsimple. --- .../spark/internal/io/SparkHadoopWriter.scala | 12 ++--- .../spark/mapred/SparkHadoopMapRedUtil.scala | 5 ++- .../spark/rdd/PairRDDFunctionsSuite.scala | 44 +++++++++++++++++++ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala index 949d8c677998f..abf39213fa0d2 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/SparkHadoopWriter.scala @@ -60,17 +60,17 @@ object SparkHadoopWriter extends Logging { config: HadoopWriteConfigUtil[K, V]): Unit = { // Extract context and configuration from RDD. val sparkContext = rdd.context - val stageId = rdd.id + val commitJobId = rdd.id // Set up a job. val jobTrackerId = createJobTrackerID(new Date()) - val jobContext = config.createJobContext(jobTrackerId, stageId) + val jobContext = config.createJobContext(jobTrackerId, commitJobId) config.initOutputFormat(jobContext) // Assert the output format/key/value class is set in JobConf. config.assertConf(jobContext, rdd.conf) - val committer = config.createCommitter(stageId) + val committer = config.createCommitter(commitJobId) committer.setupJob(jobContext) // Try to write all RDD partitions as a Hadoop OutputFormat. @@ -80,7 +80,7 @@ object SparkHadoopWriter extends Logging { context = context, config = config, jobTrackerId = jobTrackerId, - sparkStageId = context.stageId, + commitJobId = commitJobId, sparkPartitionId = context.partitionId, sparkAttemptNumber = context.attemptNumber, committer = committer, @@ -102,14 +102,14 @@ object SparkHadoopWriter extends Logging { context: TaskContext, config: HadoopWriteConfigUtil[K, V], jobTrackerId: String, - sparkStageId: Int, + commitJobId: Int, sparkPartitionId: Int, sparkAttemptNumber: Int, committer: FileCommitProtocol, iterator: Iterator[(K, V)]): TaskCommitMessage = { // Set up a task. 
val taskContext = config.createTaskAttemptContext( - jobTrackerId, sparkStageId, sparkPartitionId, sparkAttemptNumber) + jobTrackerId, commitJobId, sparkPartitionId, sparkAttemptNumber) committer.setupTask(taskContext) val (outputMetrics, callback) = initHadoopOutputMetrics(context) diff --git a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala index 607283a306b8f..764735dc4eae7 100644 --- a/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala +++ b/core/src/main/scala/org/apache/spark/mapred/SparkHadoopMapRedUtil.scala @@ -70,7 +70,8 @@ object SparkHadoopMapRedUtil extends Logging { if (shouldCoordinateWithDriver) { val outputCommitCoordinator = SparkEnv.get.outputCommitCoordinator val taskAttemptNumber = TaskContext.get().attemptNumber() - val canCommit = outputCommitCoordinator.canCommit(jobId, splitId, taskAttemptNumber) + val stageId = TaskContext.get().stageId() + val canCommit = outputCommitCoordinator.canCommit(stageId, splitId, taskAttemptNumber) if (canCommit) { performCommit() @@ -80,7 +81,7 @@ object SparkHadoopMapRedUtil extends Logging { logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) - throw new CommitDeniedException(message, jobId, splitId, taskAttemptNumber) + throw new CommitDeniedException(message, stageId, splitId, taskAttemptNumber) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 0a248b6064ee8..65d35264dc108 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -30,6 +30,7 @@ import org.apache.hadoop.mapreduce.{Job => NewJob, JobContext => NewJobContext, OutputCommitter => NewOutputCommitter, OutputFormat => NewOutputFormat, RecordWriter => NewRecordWriter, TaskAttemptContext => NewTaskAttempContext} import org.apache.hadoop.util.Progressable +import org.scalatest.Assertions import org.apache.spark._ import org.apache.spark.Partitioner @@ -524,6 +525,15 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { pairs.saveAsNewAPIHadoopFile[ConfigTestFormat]("ignored") } + test("The JobId on the driver and executors should be the same during the commit") { + // Create more than one rdd to mimic stageId not equal to rddId + val pairs = sc.parallelize(Array((1, 2), (2, 3)), 2) + .map { p => (new Integer(p._1 + 1), new Integer(p._2 + 1)) } + .filter { p => p._1 > 0 } + pairs.saveAsNewAPIHadoopFile[YetAnotherFakeFormat]("ignored") + assert(JobID.jobid != -1) + } + test("saveAsHadoopFile should respect configured output committers") { val pairs = sc.parallelize(Array((new Integer(1), new Integer(1)))) val conf = new JobConf() @@ -908,6 +918,40 @@ class NewFakeFormatWithCallback() extends NewFakeFormat { } } +class YetAnotherFakeCommitter extends NewOutputCommitter with Assertions { + def setupJob(j: NewJobContext): Unit = { + JobID.jobid = j.getJobID().getId + } + + def needsTaskCommit(t: NewTaskAttempContext): Boolean = false + + def setupTask(t: NewTaskAttempContext): Unit = { + val jobId = t.getTaskAttemptID().getJobID().getId + assert(jobId === JobID.jobid) + } + + def commitTask(t: NewTaskAttempContext): Unit = {} + + def 
abortTask(t: NewTaskAttempContext): Unit = {} +} + +class YetAnotherFakeFormat() extends NewOutputFormat[Integer, Integer]() { + + def checkOutputSpecs(j: NewJobContext): Unit = {} + + def getRecordWriter(t: NewTaskAttempContext): NewRecordWriter[Integer, Integer] = { + new NewFakeWriter() + } + + def getOutputCommitter(t: NewTaskAttempContext): NewOutputCommitter = { + new YetAnotherFakeCommitter() + } +} + +object JobID { + var jobid = -1 +} + class ConfigTestFormat() extends NewFakeFormat() with Configurable { var setConfCalled = false From e1dd03e42c2131b167b1e80c761291e88bfdf03f Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 4 Dec 2017 11:05:03 -0800 Subject: [PATCH 033/356] [SPARK-22372][CORE, YARN] Make cluster submission use SparkApplication. The main goal of this change is to allow multiple cluster-mode submissions from the same JVM, without having them end up with mixed configuration. That is done by extending the SparkApplication trait, and doing so was reasonably trivial for standalone and mesos modes. For YARN mode, there was a complication. YARN used a "SPARK_YARN_MODE" system property to control behavior indirectly in a whole bunch of places, mainly in the SparkHadoopUtil / YarnSparkHadoopUtil classes. Most of the changes here are removing that. Since we removed support for Hadoop 1.x, some methods that lived in YarnSparkHadoopUtil can now live in SparkHadoopUtil. The remaining methods don't need to be part of the class, and can be called directly from the YarnSparkHadoopUtil object, so now there's a single implementation of SparkHadoopUtil. There were two places in the code that relied on SPARK_YARN_MODE to make decisions about YARN-specific functionality, and now explicitly check the master from the configuration for that instead: * fetching the external shuffle service port, which can come from the YARN configuration. * propagation of the authentication secret using Hadoop credentials. This also was cleaned up a little to not need so many methods in `SparkHadoopUtil`. With those out of the way, actually changing the YARN client to extend SparkApplication was easy. Tested with existing unit tests, and also by running YARN apps with auth and kerberos both on and off in a real cluster. Author: Marcelo Vanzin Closes #19631 from vanzin/SPARK-22372. 
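
As a rough illustration of the entry-point shape this commit moves cluster-mode submission towards: configuration flows in through a per-submission `SparkConf` rather than through JVM-global system properties such as `SPARK_YARN_MODE`. The sketch below is not Spark's actual implementation; `ExampleClusterApp` and the `SparkApplicationLike` trait name are hypothetical, though the `start(args, conf)` signature mirrors the `ClientApp` and `YarnClusterApplication` overrides in the diff that follows.

```scala
// Minimal sketch under the assumptions above. The point is the shape of the API:
// every cluster-mode "application" receives its own SparkConf instead of reading
// JVM-global state, so two submissions from the same JVM cannot leak settings.
import org.apache.spark.SparkConf

// Mirrors the start(args, conf) contract used by ClientApp in this patch.
trait SparkApplicationLike {
  def start(args: Array[String], conf: SparkConf): Unit
}

class ExampleClusterApp extends SparkApplicationLike {
  override def start(args: Array[String], conf: SparkConf): Unit = {
    // Everything submission-specific comes from `conf`; nothing is read via System.getProperty.
    val master = conf.get("spark.master", "local[*]")
    println(s"Submitting [${args.mkString(", ")}] to master $master")
  }
}

object ExampleClusterApp {
  def main(args: Array[String]): Unit = {
    // Each invocation builds a fresh SparkConf, so a second submission in the same JVM
    // cannot observe this one's settings.
    val conf = new SparkConf(loadDefaults = false).set("spark.master", "yarn")
    new ExampleClusterApp().start(args, conf)
  }
}
```
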
--- .../org/apache/spark/SecurityManager.scala | 102 ++++++------- .../scala/org/apache/spark/SparkContext.scala | 3 - .../scala/org/apache/spark/SparkEnv.scala | 4 + .../org/apache/spark/deploy/Client.scala | 8 +- .../apache/spark/deploy/SparkHadoopUtil.scala | 48 +------ .../org/apache/spark/deploy/SparkSubmit.scala | 31 ++-- .../deploy/rest/RestSubmissionClient.scala | 31 ++-- .../CoarseGrainedExecutorBackend.scala | 10 +- .../scala/org/apache/spark/util/Utils.scala | 6 +- .../apache/spark/SecurityManagerSuite.scala | 31 +++- .../spark/deploy/SparkSubmitSuite.scala | 8 +- .../rest/StandaloneRestSubmitSuite.scala | 2 +- project/MimaExcludes.scala | 6 + .../spark/deploy/yarn/ApplicationMaster.scala | 54 +++---- .../org/apache/spark/deploy/yarn/Client.scala | 65 ++++----- .../spark/deploy/yarn/YarnRMClient.scala | 2 +- .../deploy/yarn/YarnSparkHadoopUtil.scala | 135 ++++++------------ .../yarn/security/AMCredentialRenewer.scala | 2 +- .../cluster/YarnClientSchedulerBackend.scala | 4 +- .../cluster/YarnClusterSchedulerBackend.scala | 2 +- .../deploy/yarn/BaseYarnClusterSuite.scala | 5 - .../spark/deploy/yarn/ClientSuite.scala | 31 +--- .../spark/deploy/yarn/YarnClusterSuite.scala | 20 ++- .../yarn/YarnSparkHadoopUtilSuite.scala | 49 +------ ...ARNHadoopDelegationTokenManagerSuite.scala | 11 +- 25 files changed, 274 insertions(+), 396 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SecurityManager.scala b/core/src/main/scala/org/apache/spark/SecurityManager.scala index 2480e56b72ccf..4c1dbe3ffb4ad 100644 --- a/core/src/main/scala/org/apache/spark/SecurityManager.scala +++ b/core/src/main/scala/org/apache/spark/SecurityManager.scala @@ -19,6 +19,7 @@ package org.apache.spark import java.lang.{Byte => JByte} import java.net.{Authenticator, PasswordAuthentication} +import java.nio.charset.StandardCharsets.UTF_8 import java.security.{KeyStore, SecureRandom} import java.security.cert.X509Certificate import javax.net.ssl._ @@ -26,10 +27,11 @@ import javax.net.ssl._ import com.google.common.hash.HashCodes import com.google.common.io.Files import org.apache.hadoop.io.Text +import org.apache.hadoop.security.{Credentials, UserGroupInformation} -import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ +import org.apache.spark.launcher.SparkLauncher import org.apache.spark.network.sasl.SecretKeyHolder import org.apache.spark.util.Utils @@ -225,7 +227,6 @@ private[spark] class SecurityManager( setViewAclsGroups(sparkConf.get("spark.ui.view.acls.groups", "")); setModifyAclsGroups(sparkConf.get("spark.modify.acls.groups", "")); - private val secretKey = generateSecretKey() logInfo("SecurityManager: authentication " + (if (authOn) "enabled" else "disabled") + "; ui acls " + (if (aclsOn) "enabled" else "disabled") + "; users with view permissions: " + viewAcls.toString() + @@ -416,50 +417,6 @@ private[spark] class SecurityManager( def getIOEncryptionKey(): Option[Array[Byte]] = ioEncryptionKey - /** - * Generates or looks up the secret key. - * - * The way the key is stored depends on the Spark deployment mode. Yarn - * uses the Hadoop UGI. - * - * For non-Yarn deployments, If the config variable is not set - * we throw an exception. - */ - private def generateSecretKey(): String = { - if (!isAuthenticationEnabled) { - null - } else if (SparkHadoopUtil.get.isYarnMode) { - // In YARN mode, the secure cookie will be created by the driver and stashed in the - // user's credentials, where executors can get it. 
The check for an array of size 0 - // is because of the test code in YarnSparkHadoopUtilSuite. - val secretKey = SparkHadoopUtil.get.getSecretKeyFromUserCredentials(SECRET_LOOKUP_KEY) - if (secretKey == null || secretKey.length == 0) { - logDebug("generateSecretKey: yarn mode, secret key from credentials is null") - val rnd = new SecureRandom() - val length = sparkConf.getInt("spark.authenticate.secretBitLength", 256) / JByte.SIZE - val secret = new Array[Byte](length) - rnd.nextBytes(secret) - - val cookie = HashCodes.fromBytes(secret).toString() - SparkHadoopUtil.get.addSecretKeyToUserCredentials(SECRET_LOOKUP_KEY, cookie) - cookie - } else { - new Text(secretKey).toString - } - } else { - // user must have set spark.authenticate.secret config - // For Master/Worker, auth secret is in conf; for Executors, it is in env variable - Option(sparkConf.getenv(SecurityManager.ENV_AUTH_SECRET)) - .orElse(sparkConf.getOption(SecurityManager.SPARK_AUTH_SECRET_CONF)) match { - case Some(value) => value - case None => - throw new IllegalArgumentException( - "Error: a secret key must be specified via the " + - SecurityManager.SPARK_AUTH_SECRET_CONF + " config") - } - } - } - /** * Check to see if Acls for the UI are enabled * @return true if UI authentication is enabled, otherwise false @@ -542,7 +499,51 @@ private[spark] class SecurityManager( * Gets the secret key. * @return the secret key as a String if authentication is enabled, otherwise returns null */ - def getSecretKey(): String = secretKey + def getSecretKey(): String = { + if (isAuthenticationEnabled) { + val creds = UserGroupInformation.getCurrentUser().getCredentials() + Option(creds.getSecretKey(SECRET_LOOKUP_KEY)) + .map { bytes => new String(bytes, UTF_8) } + .orElse(Option(sparkConf.getenv(ENV_AUTH_SECRET))) + .orElse(sparkConf.getOption(SPARK_AUTH_SECRET_CONF)) + .getOrElse { + throw new IllegalArgumentException( + s"A secret key must be specified via the $SPARK_AUTH_SECRET_CONF config") + } + } else { + null + } + } + + /** + * Initialize the authentication secret. + * + * If authentication is disabled, do nothing. + * + * In YARN mode, generate a new secret and store it in the current user's credentials. + * + * In other modes, assert that the auth secret is set in the configuration. + */ + def initializeAuth(): Unit = { + if (!sparkConf.get(NETWORK_AUTH_ENABLED)) { + return + } + + if (sparkConf.get(SparkLauncher.SPARK_MASTER, null) != "yarn") { + require(sparkConf.contains(SPARK_AUTH_SECRET_CONF), + s"A secret key must be specified via the $SPARK_AUTH_SECRET_CONF config.") + return + } + + val rnd = new SecureRandom() + val length = sparkConf.getInt("spark.authenticate.secretBitLength", 256) / JByte.SIZE + val secretBytes = new Array[Byte](length) + rnd.nextBytes(secretBytes) + + val creds = new Credentials() + creds.addSecretKey(SECRET_LOOKUP_KEY, secretBytes) + UserGroupInformation.getCurrentUser().addCredentials(creds) + } // Default SecurityManager only has a single secret key, so ignore appId. override def getSaslUser(appId: String): String = getSaslUser() @@ -551,13 +552,12 @@ private[spark] class SecurityManager( private[spark] object SecurityManager { - val SPARK_AUTH_CONF: String = "spark.authenticate" - val SPARK_AUTH_SECRET_CONF: String = "spark.authenticate.secret" + val SPARK_AUTH_CONF = NETWORK_AUTH_ENABLED.key + val SPARK_AUTH_SECRET_CONF = "spark.authenticate.secret" // This is used to set auth secret to an executor's env variable. 
It should have the same // value as SPARK_AUTH_SECRET_CONF set in SparkConf val ENV_AUTH_SECRET = "_SPARK_AUTH_SECRET" // key used to store the spark secret in the Hadoop UGI - val SECRET_LOOKUP_KEY = "sparkCookie" - + val SECRET_LOOKUP_KEY = new Text("sparkCookie") } diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index c174939ca2e54..71f1e7c7321bc 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -413,8 +413,6 @@ class SparkContext(config: SparkConf) extends Logging { } } - if (master == "yarn" && deployMode == "client") System.setProperty("SPARK_YARN_MODE", "true") - _listenerBus = new LiveListenerBus(_conf) // Initialize the app status store and listener before SparkEnv is created so that it gets @@ -1955,7 +1953,6 @@ class SparkContext(config: SparkConf) extends Logging { // `SparkContext` is stopped. localProperties.remove() // Unset YARN mode system env variable, to allow switching between cluster types. - System.clearProperty("SPARK_YARN_MODE") SparkContext.clearActiveContext() logInfo("Successfully stopped SparkContext") } diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index 24928150315e8..72123f2232532 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -234,6 +234,10 @@ object SparkEnv extends Logging { } val securityManager = new SecurityManager(conf, ioEncryptionKey) + if (isDriver) { + securityManager.initializeAuth() + } + ioEncryptionKey.foreach { _ => if (!securityManager.isEncryptionEnabled()) { logWarning("I/O encryption enabled without RPC encryption: keys will be visible on the " + diff --git a/core/src/main/scala/org/apache/spark/deploy/Client.scala b/core/src/main/scala/org/apache/spark/deploy/Client.scala index 7acb5c55bb252..d5145094ec079 100644 --- a/core/src/main/scala/org/apache/spark/deploy/Client.scala +++ b/core/src/main/scala/org/apache/spark/deploy/Client.scala @@ -217,8 +217,13 @@ object Client { println("Use ./bin/spark-submit with \"--master spark://host:port\"") } // scalastyle:on println + new ClientApp().start(args, new SparkConf()) + } +} - val conf = new SparkConf() +private[spark] class ClientApp extends SparkApplication { + + override def start(args: Array[String], conf: SparkConf): Unit = { val driverArgs = new ClientArguments(args) if (!conf.contains("spark.rpc.askTimeout")) { @@ -235,4 +240,5 @@ object Client { rpcEnv.awaitTermination() } + } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 17c7319b40f24..e14f9845e6db6 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -75,9 +75,7 @@ class SparkHadoopUtil extends Logging { } def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) { - for (token <- source.getTokens.asScala) { - dest.addToken(token) - } + dest.addCredentials(source.getCredentials()) } /** @@ -120,16 +118,9 @@ class SparkHadoopUtil extends Logging { * Add any user credentials to the job conf which are necessary for running on a secure Hadoop * cluster. 
*/ - def addCredentials(conf: JobConf) {} - - def isYarnMode(): Boolean = { false } - - def addSecretKeyToUserCredentials(key: String, secret: String) {} - - def getSecretKeyFromUserCredentials(key: String): Array[Byte] = { null } - - def getCurrentUserCredentials(): Credentials = { - UserGroupInformation.getCurrentUser().getCredentials() + def addCredentials(conf: JobConf): Unit = { + val jobCreds = conf.getCredentials() + jobCreds.mergeAll(UserGroupInformation.getCurrentUser().getCredentials()) } def addCurrentUserCredentials(creds: Credentials): Unit = { @@ -328,17 +319,6 @@ class SparkHadoopUtil extends Logging { } } - /** - * Start a thread to periodically update the current user's credentials with new credentials so - * that access to secured service does not fail. - */ - private[spark] def startCredentialUpdater(conf: SparkConf) {} - - /** - * Stop the thread that does the credential updates. - */ - private[spark] def stopCredentialUpdater() {} - /** * Return a fresh Hadoop configuration, bypassing the HDFS cache mechanism. * This is to prevent the DFSClient from using an old cached token to connect to the NameNode. @@ -441,14 +421,7 @@ class SparkHadoopUtil extends Logging { object SparkHadoopUtil { - private lazy val hadoop = new SparkHadoopUtil - private lazy val yarn = try { - Utils.classForName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil") - .newInstance() - .asInstanceOf[SparkHadoopUtil] - } catch { - case e: Exception => throw new SparkException("Unable to load YARN support", e) - } + private lazy val instance = new SparkHadoopUtil val SPARK_YARN_CREDS_TEMP_EXTENSION = ".tmp" @@ -462,16 +435,7 @@ object SparkHadoopUtil { */ private[spark] val UPDATE_INPUT_METRICS_INTERVAL_RECORDS = 1000 - def get: SparkHadoopUtil = { - // Check each time to support changing to/from YARN - val yarnMode = java.lang.Boolean.parseBoolean( - System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE"))) - if (yarnMode) { - yarn - } else { - hadoop - } - } + def get: SparkHadoopUtil = instance /** * Given an expiration date (e.g. for Hadoop Delegation Tokens) return a the date diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 73b956ef3e470..cfcdce648d330 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -92,6 +92,12 @@ object SparkSubmit extends CommandLineUtils with Logging { private val CLASS_NOT_FOUND_EXIT_STATUS = 101 + // Following constants are visible for testing. + private[deploy] val YARN_CLUSTER_SUBMIT_CLASS = + "org.apache.spark.deploy.yarn.YarnClusterApplication" + private[deploy] val REST_CLUSTER_SUBMIT_CLASS = classOf[RestSubmissionClientApp].getName() + private[deploy] val STANDALONE_CLUSTER_SUBMIT_CLASS = classOf[ClientApp].getName() + // scalastyle:off println private[spark] def printVersionAndExit(): Unit = { printStream.println("""Welcome to @@ -281,7 +287,7 @@ object SparkSubmit extends CommandLineUtils with Logging { } // Make sure YARN is included in our build if we're trying to use it - if (!Utils.classIsLoadable("org.apache.spark.deploy.yarn.Client") && !Utils.isTesting) { + if (!Utils.classIsLoadable(YARN_CLUSTER_SUBMIT_CLASS) && !Utils.isTesting) { printErrorAndExit( "Could not load YARN classes. 
" + "This copy of Spark may not have been compiled with YARN support.") @@ -363,10 +369,7 @@ object SparkSubmit extends CommandLineUtils with Logging { args.pyFiles = Option(args.pyFiles).map(resolveGlobPaths(_, hadoopConf)).orNull args.archives = Option(args.archives).map(resolveGlobPaths(_, hadoopConf)).orNull - // This security manager will not need an auth secret, but set a dummy value in case - // spark.authenticate is enabled, otherwise an exception is thrown. - lazy val downloadConf = sparkConf.clone().set(SecurityManager.SPARK_AUTH_SECRET_CONF, "unused") - lazy val secMgr = new SecurityManager(downloadConf) + lazy val secMgr = new SecurityManager(sparkConf) // In client mode, download remote files. var localPrimaryResource: String = null @@ -374,13 +377,13 @@ object SparkSubmit extends CommandLineUtils with Logging { var localPyFiles: String = null if (deployMode == CLIENT) { localPrimaryResource = Option(args.primaryResource).map { - downloadFile(_, targetDir, downloadConf, hadoopConf, secMgr) + downloadFile(_, targetDir, sparkConf, hadoopConf, secMgr) }.orNull localJars = Option(args.jars).map { - downloadFileList(_, targetDir, downloadConf, hadoopConf, secMgr) + downloadFileList(_, targetDir, sparkConf, hadoopConf, secMgr) }.orNull localPyFiles = Option(args.pyFiles).map { - downloadFileList(_, targetDir, downloadConf, hadoopConf, secMgr) + downloadFileList(_, targetDir, sparkConf, hadoopConf, secMgr) }.orNull } @@ -391,8 +394,6 @@ object SparkSubmit extends CommandLineUtils with Logging { // For yarn client mode, since we already download them with above code, so we only need to // figure out the local path and replace the remote one. if (clusterManager == YARN) { - sparkConf.setIfMissing(SecurityManager.SPARK_AUTH_SECRET_CONF, "unused") - val secMgr = new SecurityManager(sparkConf) val forceDownloadSchemes = sparkConf.get(FORCE_DOWNLOAD_SCHEMES) def shouldDownload(scheme: String): Boolean = { @@ -409,7 +410,7 @@ object SparkSubmit extends CommandLineUtils with Logging { if (file.exists()) { file.toURI.toString } else { - downloadFile(resource, targetDir, downloadConf, hadoopConf, secMgr) + downloadFile(resource, targetDir, sparkConf, hadoopConf, secMgr) } case _ => uri.toString } @@ -634,11 +635,11 @@ object SparkSubmit extends CommandLineUtils with Logging { // All Spark parameters are expected to be passed to the client through system properties. 
if (args.isStandaloneCluster) { if (args.useRest) { - childMainClass = "org.apache.spark.deploy.rest.RestSubmissionClient" + childMainClass = REST_CLUSTER_SUBMIT_CLASS childArgs += (args.primaryResource, args.mainClass) } else { // In legacy standalone cluster mode, use Client as a wrapper around the user class - childMainClass = "org.apache.spark.deploy.Client" + childMainClass = STANDALONE_CLUSTER_SUBMIT_CLASS if (args.supervise) { childArgs += "--supervise" } Option(args.driverMemory).foreach { m => childArgs += ("--memory", m) } Option(args.driverCores).foreach { c => childArgs += ("--cores", c) } @@ -663,7 +664,7 @@ object SparkSubmit extends CommandLineUtils with Logging { // In yarn-cluster mode, use yarn.Client as a wrapper around the user class if (isYarnCluster) { - childMainClass = "org.apache.spark.deploy.yarn.Client" + childMainClass = YARN_CLUSTER_SUBMIT_CLASS if (args.isPython) { childArgs += ("--primary-py-file", args.primaryResource) childArgs += ("--class", "org.apache.spark.deploy.PythonRunner") @@ -684,7 +685,7 @@ object SparkSubmit extends CommandLineUtils with Logging { if (isMesosCluster) { assert(args.useRest, "Mesos cluster mode is only supported through the REST submission API") - childMainClass = "org.apache.spark.deploy.rest.RestSubmissionClient" + childMainClass = REST_CLUSTER_SUBMIT_CLASS if (args.isPython) { // Second argument is main class childArgs += (args.primaryResource, "") diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala index 21cb94142b15b..742a95841a138 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/RestSubmissionClient.scala @@ -32,6 +32,7 @@ import scala.util.control.NonFatal import com.fasterxml.jackson.core.JsonProcessingException import org.apache.spark.{SPARK_VERSION => sparkVersion, SparkConf, SparkException} +import org.apache.spark.deploy.SparkApplication import org.apache.spark.internal.Logging import org.apache.spark.util.Utils @@ -398,9 +399,20 @@ private[spark] object RestSubmissionClient { val PROTOCOL_VERSION = "v1" /** - * Submit an application, assuming Spark parameters are specified through the given config. - * This is abstracted to its own method for testing purposes. + * Filter non-spark environment variables from any environment. */ + private[rest] def filterSystemEnvironment(env: Map[String, String]): Map[String, String] = { + env.filterKeys { k => + // SPARK_HOME is filtered out because it is usually wrong on the remote machine (SPARK-12345) + (k.startsWith("SPARK_") && k != "SPARK_ENV_LOADED" && k != "SPARK_HOME") || + k.startsWith("MESOS_") + } + } +} + +private[spark] class RestSubmissionClientApp extends SparkApplication { + + /** Submits a request to run the application and return the response. Visible for testing. 
*/ def run( appResource: String, mainClass: String, @@ -417,7 +429,7 @@ private[spark] object RestSubmissionClient { client.createSubmission(submitRequest) } - def main(args: Array[String]): Unit = { + override def start(args: Array[String], conf: SparkConf): Unit = { if (args.length < 2) { sys.error("Usage: RestSubmissionClient [app resource] [main class] [app args*]") sys.exit(1) @@ -425,19 +437,8 @@ private[spark] object RestSubmissionClient { val appResource = args(0) val mainClass = args(1) val appArgs = args.slice(2, args.length) - val conf = new SparkConf - val env = filterSystemEnvironment(sys.env) + val env = RestSubmissionClient.filterSystemEnvironment(sys.env) run(appResource, mainClass, appArgs, conf, env) } - /** - * Filter non-spark environment variables from any environment. - */ - private[rest] def filterSystemEnvironment(env: Map[String, String]): Map[String, String] = { - env.filterKeys { k => - // SPARK_HOME is filtered out because it is usually wrong on the remote machine (SPARK-12345) - (k.startsWith("SPARK_") && k != "SPARK_ENV_LOADED" && k != "SPARK_HOME") || - k.startsWith("MESOS_") - } - } } diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index acefc9d2436d0..4c1f92a1bcbf2 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -220,7 +220,9 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { if (driverConf.contains("spark.yarn.credentials.file")) { logInfo("Will periodically update credentials from: " + driverConf.get("spark.yarn.credentials.file")) - SparkHadoopUtil.get.startCredentialUpdater(driverConf) + Utils.classForName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil") + .getMethod("startCredentialUpdater", classOf[SparkConf]) + .invoke(null, driverConf) } cfg.hadoopDelegationCreds.foreach { tokens => @@ -236,7 +238,11 @@ private[spark] object CoarseGrainedExecutorBackend extends Logging { env.rpcEnv.setupEndpoint("WorkerWatcher", new WorkerWatcher(env.rpcEnv, url)) } env.rpcEnv.awaitTermination() - SparkHadoopUtil.get.stopCredentialUpdater() + if (driverConf.contains("spark.yarn.credentials.file")) { + Utils.classForName("org.apache.spark.deploy.yarn.YarnSparkHadoopUtil") + .getMethod("stopCredentialUpdater") + .invoke(null) + } } } diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 930e09d90c2f5..51bf91614c866 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -50,6 +50,7 @@ import org.apache.commons.lang3.SystemUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} import org.apache.hadoop.security.UserGroupInformation +import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.log4j.PropertyConfigurator import org.eclipse.jetty.util.MultiException import org.json4s._ @@ -59,6 +60,7 @@ import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ +import org.apache.spark.launcher.SparkLauncher import org.apache.spark.network.util.JavaUtils import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance} @@ -2405,8 +2407,8 @@ 
private[spark] object Utils extends Logging { */ def getSparkOrYarnConfig(conf: SparkConf, key: String, default: String): String = { val sparkValue = conf.get(key, default) - if (SparkHadoopUtil.get.isYarnMode) { - SparkHadoopUtil.get.newConfiguration(conf).get(key, sparkValue) + if (conf.get(SparkLauncher.SPARK_MASTER, null) == "yarn") { + new YarnConfiguration(SparkHadoopUtil.get.newConfiguration(conf)).get(key, sparkValue) } else { sparkValue } diff --git a/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala b/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala index 9801b2638cc15..cf59265dd646d 100644 --- a/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/SecurityManagerSuite.scala @@ -18,7 +18,13 @@ package org.apache.spark import java.io.File +import java.nio.charset.StandardCharsets.UTF_8 +import java.security.PrivilegedExceptionAction +import org.apache.hadoop.security.UserGroupInformation + +import org.apache.spark.internal.config._ +import org.apache.spark.launcher.SparkLauncher import org.apache.spark.security.GroupMappingServiceProvider import org.apache.spark.util.{ResetSystemProperties, SparkConfWithEnv, Utils} @@ -411,8 +417,12 @@ class SecurityManagerSuite extends SparkFunSuite with ResetSystemProperties { test("missing secret authentication key") { val conf = new SparkConf().set("spark.authenticate", "true") + val mgr = new SecurityManager(conf) + intercept[IllegalArgumentException] { + mgr.getSecretKey() + } intercept[IllegalArgumentException] { - new SecurityManager(conf) + mgr.initializeAuth() } } @@ -430,5 +440,24 @@ class SecurityManagerSuite extends SparkFunSuite with ResetSystemProperties { assert(keyFromEnv === new SecurityManager(conf2).getSecretKey()) } + test("secret key generation in yarn mode") { + val conf = new SparkConf() + .set(NETWORK_AUTH_ENABLED, true) + .set(SparkLauncher.SPARK_MASTER, "yarn") + val mgr = new SecurityManager(conf) + + UserGroupInformation.createUserForTesting("authTest", Array()).doAs( + new PrivilegedExceptionAction[Unit]() { + override def run(): Unit = { + mgr.initializeAuth() + val creds = UserGroupInformation.getCurrentUser().getCredentials() + val secret = creds.getSecretKey(SecurityManager.SECRET_LOOKUP_KEY) + assert(secret != null) + assert(new String(secret, UTF_8) === mgr.getSecretKey()) + } + } + ) + } + } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index d0a34c5cdcf57..e200755e639e1 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -235,7 +235,7 @@ class SparkSubmitSuite childArgsStr should include ("--class org.SomeClass") childArgsStr should include ("--arg arg1 --arg arg2") childArgsStr should include regex ("--jar .*thejar.jar") - mainClass should be ("org.apache.spark.deploy.yarn.Client") + mainClass should be (SparkSubmit.YARN_CLUSTER_SUBMIT_CLASS) // In yarn cluster mode, also adding jars to classpath classpath(0) should endWith ("thejar.jar") @@ -323,11 +323,11 @@ class SparkSubmitSuite val childArgsStr = childArgs.mkString(" ") if (useRest) { childArgsStr should endWith ("thejar.jar org.SomeClass arg1 arg2") - mainClass should be ("org.apache.spark.deploy.rest.RestSubmissionClient") + mainClass should be (SparkSubmit.REST_CLUSTER_SUBMIT_CLASS) } else { childArgsStr should startWith ("--supervise --memory 4g --cores 5") 
childArgsStr should include regex "launch spark://h:p .*thejar.jar org.SomeClass arg1 arg2" - mainClass should be ("org.apache.spark.deploy.Client") + mainClass should be (SparkSubmit.STANDALONE_CLUSTER_SUBMIT_CLASS) } classpath should have size 0 sys.props("SPARK_SUBMIT") should be ("true") @@ -402,7 +402,7 @@ class SparkSubmitSuite conf.get("spark.executor.memory") should be ("5g") conf.get("spark.master") should be ("yarn") conf.get("spark.submit.deployMode") should be ("cluster") - mainClass should be ("org.apache.spark.deploy.yarn.Client") + mainClass should be (SparkSubmit.YARN_CLUSTER_SUBMIT_CLASS) } test("SPARK-21568 ConsoleProgressBar should be enabled only in shells") { diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala index 490baf040491f..e505bc018857d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala @@ -92,7 +92,7 @@ class StandaloneRestSubmitSuite extends SparkFunSuite with BeforeAndAfterEach { conf.set("spark.app.name", "dreamer") val appArgs = Array("one", "two", "six") // main method calls this - val response = RestSubmissionClient.run("app-resource", "main-class", appArgs, conf) + val response = new RestSubmissionClientApp().run("app-resource", "main-class", appArgs, conf) val submitResponse = getSubmitResponse(response) assert(submitResponse.action === Utils.getFormattedClassName(submitResponse)) assert(submitResponse.serverSparkVersion === SPARK_VERSION) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 5b8dcd0338cce..9be01f617217b 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -36,6 +36,12 @@ object MimaExcludes { // Exclude rules for 2.3.x lazy val v23excludes = v22excludes ++ Seq( + // SPARK-22372: Make cluster submission use SparkApplication. + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkHadoopUtil.getSecretKeyFromUserCredentials"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkHadoopUtil.isYarnMode"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkHadoopUtil.getCurrentUserCredentials"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkHadoopUtil.addSecretKeyToUserCredentials"), + // SPARK-18085: Better History Server scalability for many / large applications ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.status.api.v1.ExecutorSummary.executorLogs"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.history.HistoryServer.getSparkUI"), diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index ca0aa0ea3bc73..b2576b0d72633 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -56,11 +56,28 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends // TODO: Currently, task to container is computed once (TaskSetManager) - which need not be // optimal as more containers are available. Might need to handle this better. 
- private val sparkConf = new SparkConf() - private val yarnConf: YarnConfiguration = SparkHadoopUtil.get.newConfiguration(sparkConf) - .asInstanceOf[YarnConfiguration] private val isClusterMode = args.userClass != null + private val sparkConf = new SparkConf() + if (args.propertiesFile != null) { + Utils.getPropertiesFromFile(args.propertiesFile).foreach { case (k, v) => + sparkConf.set(k, v) + } + } + + private val securityMgr = new SecurityManager(sparkConf) + + // Set system properties for each config entry. This covers two use cases: + // - The default configuration stored by the SparkHadoopUtil class + // - The user application creating a new SparkConf in cluster mode + // + // Both cases create a new SparkConf object which reads these configs from system properties. + sparkConf.getAll.foreach { case (k, v) => + sys.props(k) = v + } + + private val yarnConf = new YarnConfiguration(SparkHadoopUtil.newConfiguration(sparkConf)) + private val ugi = { val original = UserGroupInformation.getCurrentUser() @@ -311,7 +328,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends val credentialManager = new YARNHadoopDelegationTokenManager( sparkConf, yarnConf, - conf => YarnSparkHadoopUtil.get.hadoopFSsToAccess(sparkConf, conf)) + conf => YarnSparkHadoopUtil.hadoopFSsToAccess(sparkConf, conf)) val credentialRenewer = new AMCredentialRenewer(sparkConf, yarnConf, credentialManager) @@ -323,13 +340,10 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends credentialRenewerThread.join() } - // Call this to force generation of secret so it gets populated into the Hadoop UGI. - val securityMgr = new SecurityManager(sparkConf) - if (isClusterMode) { - runDriver(securityMgr) + runDriver() } else { - runExecutorLauncher(securityMgr) + runExecutorLauncher() } } catch { case e: Exception => @@ -410,8 +424,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends _sparkConf: SparkConf, _rpcEnv: RpcEnv, driverRef: RpcEndpointRef, - uiAddress: Option[String], - securityMgr: SecurityManager) = { + uiAddress: Option[String]) = { val appId = client.getAttemptId().getApplicationId().toString() val attemptId = client.getAttemptId().getAttemptId().toString() val historyAddress = @@ -463,7 +476,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends YarnSchedulerBackend.ENDPOINT_NAME) } - private def runDriver(securityMgr: SecurityManager): Unit = { + private def runDriver(): Unit = { addAmIpFilter(None) userClassThread = startUserApplication() @@ -479,7 +492,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends val driverRef = createSchedulerRef( sc.getConf.get("spark.driver.host"), sc.getConf.get("spark.driver.port")) - registerAM(sc.getConf, rpcEnv, driverRef, sc.ui.map(_.webUrl), securityMgr) + registerAM(sc.getConf, rpcEnv, driverRef, sc.ui.map(_.webUrl)) registered = true } else { // Sanity check; should never happen in normal operation, since sc should only be null @@ -498,15 +511,14 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends } } - private def runExecutorLauncher(securityMgr: SecurityManager): Unit = { + private def runExecutorLauncher(): Unit = { val hostname = Utils.localHostName val amCores = sparkConf.get(AM_CORES) rpcEnv = RpcEnv.create("sparkYarnAM", hostname, hostname, -1, sparkConf, securityMgr, amCores, true) val driverRef = waitForSparkDriver() addAmIpFilter(Some(driverRef)) - registerAM(sparkConf, rpcEnv, 
driverRef, sparkConf.getOption("spark.driver.appUIAddress"), - securityMgr) + registerAM(sparkConf, rpcEnv, driverRef, sparkConf.getOption("spark.driver.appUIAddress")) registered = true // In client mode the actor will stop the reporter thread. @@ -686,6 +698,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends if (args.primaryRFile != null && args.primaryRFile.endsWith(".R")) { // TODO(davies): add R dependencies here } + val mainMethod = userClassLoader.loadClass(args.userClass) .getMethod("main", classOf[Array[String]]) @@ -809,15 +822,6 @@ object ApplicationMaster extends Logging { def main(args: Array[String]): Unit = { SignalUtils.registerLogger(log) val amArgs = new ApplicationMasterArguments(args) - - // Load the properties file with the Spark configuration and set entries as system properties, - // so that user code run inside the AM also has access to them. - // Note: we must do this before SparkHadoopUtil instantiated - if (amArgs.propertiesFile != null) { - Utils.getPropertiesFromFile(amArgs.propertiesFile).foreach { case (k, v) => - sys.props(k) = v - } - } master = new ApplicationMaster(amArgs) System.exit(master.run()) } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 99e7d46ca5c96..3781b261a0381 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -48,7 +48,7 @@ import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException import org.apache.hadoop.yarn.util.Records import org.apache.spark.{SecurityManager, SparkConf, SparkException} -import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.{SparkApplication, SparkHadoopUtil} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.deploy.yarn.security.YARNHadoopDelegationTokenManager import org.apache.spark.internal.Logging @@ -58,18 +58,14 @@ import org.apache.spark.util.{CallerContext, Utils} private[spark] class Client( val args: ClientArguments, - val hadoopConf: Configuration, val sparkConf: SparkConf) extends Logging { import Client._ import YarnSparkHadoopUtil._ - def this(clientArgs: ClientArguments, spConf: SparkConf) = - this(clientArgs, SparkHadoopUtil.get.newConfiguration(spConf), spConf) - private val yarnClient = YarnClient.createYarnClient - private val yarnConf = new YarnConfiguration(hadoopConf) + private val hadoopConf = new YarnConfiguration(SparkHadoopUtil.newConfiguration(sparkConf)) private val isClusterMode = sparkConf.get("spark.submit.deployMode", "client") == "cluster" @@ -125,7 +121,7 @@ private[spark] class Client( private val credentialManager = new YARNHadoopDelegationTokenManager( sparkConf, hadoopConf, - conf => YarnSparkHadoopUtil.get.hadoopFSsToAccess(sparkConf, conf)) + conf => YarnSparkHadoopUtil.hadoopFSsToAccess(sparkConf, conf)) def reportLauncherState(state: SparkAppHandle.State): Unit = { launcherBackend.setState(state) @@ -134,8 +130,6 @@ private[spark] class Client( def stop(): Unit = { launcherBackend.close() yarnClient.stop() - // Unset YARN mode system env variable, to allow switching between cluster types. - System.clearProperty("SPARK_YARN_MODE") } /** @@ -152,7 +146,7 @@ private[spark] class Client( // Setup the credentials before doing anything else, // so we have don't have issues at any point. 
setupCredentials() - yarnClient.init(yarnConf) + yarnClient.init(hadoopConf) yarnClient.start() logInfo("Requesting a new application from cluster with %d NodeManagers" @@ -398,7 +392,7 @@ private[spark] class Client( if (SparkHadoopUtil.get.isProxyUser(currentUser)) { currentUser.addCredentials(credentials) } - logDebug(YarnSparkHadoopUtil.get.dumpTokens(credentials).mkString("\n")) + logDebug(SparkHadoopUtil.get.dumpTokens(credentials).mkString("\n")) } // If we use principal and keytab to login, also credentials can be renewed some time @@ -758,12 +752,14 @@ private[spark] class Client( // Save the YARN configuration into a separate file that will be overlayed on top of the // cluster's Hadoop conf. confStream.putNextEntry(new ZipEntry(SPARK_HADOOP_CONF_FILE)) - yarnConf.writeXml(confStream) + hadoopConf.writeXml(confStream) confStream.closeEntry() - // Save Spark configuration to a file in the archive. + // Save Spark configuration to a file in the archive, but filter out the app's secret. val props = new Properties() - sparkConf.getAll.foreach { case (k, v) => props.setProperty(k, v) } + sparkConf.getAll.foreach { case (k, v) => + props.setProperty(k, v) + } // Override spark.yarn.key to point to the location in distributed cache which will be used // by AM. Option(amKeytabFileName).foreach { k => props.setProperty(KEYTAB.key, k) } @@ -786,8 +782,7 @@ private[spark] class Client( pySparkArchives: Seq[String]): HashMap[String, String] = { logInfo("Setting up the launch environment for our AM container") val env = new HashMap[String, String]() - populateClasspath(args, yarnConf, sparkConf, env, sparkConf.get(DRIVER_CLASS_PATH)) - env("SPARK_YARN_MODE") = "true" + populateClasspath(args, hadoopConf, sparkConf, env, sparkConf.get(DRIVER_CLASS_PATH)) env("SPARK_YARN_STAGING_DIR") = stagingDirPath.toString env("SPARK_USER") = UserGroupInformation.getCurrentUser().getShortUserName() if (loginFromKeytab) { @@ -861,6 +856,7 @@ private[spark] class Client( } else { Nil } + val launchEnv = setupLaunchEnv(appStagingDirPath, pySparkArchives) val localResources = prepareLocalResources(appStagingDirPath, pySparkArchives) @@ -991,7 +987,11 @@ private[spark] class Client( logDebug("YARN AM launch context:") logDebug(s" user class: ${Option(args.userClass).getOrElse("N/A")}") logDebug(" env:") - launchEnv.foreach { case (k, v) => logDebug(s" $k -> $v") } + if (log.isDebugEnabled) { + Utils.redact(sparkConf, launchEnv.toSeq).foreach { case (k, v) => + logDebug(s" $k -> $v") + } + } logDebug(" resources:") localResources.foreach { case (k, v) => logDebug(s" $k -> $v")} logDebug(" command:") @@ -1185,24 +1185,6 @@ private[spark] class Client( private object Client extends Logging { - def main(argStrings: Array[String]) { - if (!sys.props.contains("SPARK_SUBMIT")) { - logWarning("WARNING: This client is deprecated and will be removed in a " + - "future version of Spark. Use ./bin/spark-submit with \"--master yarn\"") - } - - // Set an env variable indicating we are running in YARN mode. - // Note that any env variable with the SPARK_ prefix gets propagated to all (remote) processes - System.setProperty("SPARK_YARN_MODE", "true") - val sparkConf = new SparkConf - // SparkSubmit would use yarn cache to distribute files & jars in yarn mode, - // so remove them from sparkConf here for yarn mode. 
- sparkConf.remove("spark.jars") - sparkConf.remove("spark.files") - val args = new ClientArguments(argStrings) - new Client(args, sparkConf).run() - } - // Alias for the user jar val APP_JAR_NAME: String = "__app__.jar" @@ -1506,3 +1488,16 @@ private object Client extends Logging { } } + +private[spark] class YarnClusterApplication extends SparkApplication { + + override def start(args: Array[String], conf: SparkConf): Unit = { + // SparkSubmit would use yarn cache to distribute files & jars in yarn mode, + // so remove them from sparkConf here for yarn mode. + conf.remove("spark.jars") + conf.remove("spark.files") + + new Client(new ClientArguments(args), conf).run() + } + +} diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala index 72f4d273ab53b..c1ae12aabb8cc 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala @@ -92,7 +92,7 @@ private[spark] class YarnRMClient extends Logging { /** Returns the attempt ID. */ def getAttemptId(): ApplicationAttemptId = { - YarnSparkHadoopUtil.get.getContainerId.getApplicationAttemptId() + YarnSparkHadoopUtil.getContainerId.getApplicationAttemptId() } /** Returns the configuration for the AmIpFilter to add to the Spark UI. */ diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala index 9c1472cb50e3a..f406fabd61860 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtil.scala @@ -17,21 +17,14 @@ package org.apache.spark.deploy.yarn -import java.nio.charset.StandardCharsets.UTF_8 -import java.util.regex.Matcher -import java.util.regex.Pattern +import java.util.regex.{Matcher, Pattern} import scala.collection.mutable.{HashMap, ListBuffer} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.io.Text -import org.apache.hadoop.mapred.{JobConf, Master} -import org.apache.hadoop.security.Credentials -import org.apache.hadoop.security.UserGroupInformation import org.apache.hadoop.yarn.api.ApplicationConstants import org.apache.hadoop.yarn.api.records.{ApplicationAccessType, ContainerId, Priority} -import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.util.ConverterUtils import org.apache.spark.{SecurityManager, SparkConf, SparkException} @@ -43,87 +36,10 @@ import org.apache.spark.internal.config._ import org.apache.spark.launcher.YarnCommandBuilderUtils import org.apache.spark.util.Utils - -/** - * Contains util methods to interact with Hadoop from spark. - */ -class YarnSparkHadoopUtil extends SparkHadoopUtil { +object YarnSparkHadoopUtil { private var credentialUpdater: CredentialUpdater = _ - override def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) { - dest.addCredentials(source.getCredentials()) - } - - // Note that all params which start with SPARK are propagated all the way through, so if in yarn - // mode, this MUST be set to true. - override def isYarnMode(): Boolean = { true } - - // Return an appropriate (subclass) of Configuration. 
Creating a config initializes some Hadoop - // subsystems. Always create a new config, don't reuse yarnConf. - override def newConfiguration(conf: SparkConf): Configuration = { - val hadoopConf = new YarnConfiguration(super.newConfiguration(conf)) - hadoopConf.addResource(Client.SPARK_HADOOP_CONF_FILE) - hadoopConf - } - - // Add any user credentials to the job conf which are necessary for running on a secure Hadoop - // cluster - override def addCredentials(conf: JobConf) { - val jobCreds = conf.getCredentials() - jobCreds.mergeAll(UserGroupInformation.getCurrentUser().getCredentials()) - } - - override def addSecretKeyToUserCredentials(key: String, secret: String) { - val creds = new Credentials() - creds.addSecretKey(new Text(key), secret.getBytes(UTF_8)) - addCurrentUserCredentials(creds) - } - - override def getSecretKeyFromUserCredentials(key: String): Array[Byte] = { - val credentials = getCurrentUserCredentials() - if (credentials != null) credentials.getSecretKey(new Text(key)) else null - } - - private[spark] override def startCredentialUpdater(sparkConf: SparkConf): Unit = { - val hadoopConf = newConfiguration(sparkConf) - val credentialManager = new YARNHadoopDelegationTokenManager( - sparkConf, - hadoopConf, - conf => YarnSparkHadoopUtil.get.hadoopFSsToAccess(sparkConf, conf)) - credentialUpdater = new CredentialUpdater(sparkConf, hadoopConf, credentialManager) - credentialUpdater.start() - } - - private[spark] override def stopCredentialUpdater(): Unit = { - if (credentialUpdater != null) { - credentialUpdater.stop() - credentialUpdater = null - } - } - - private[spark] def getContainerId: ContainerId = { - val containerIdString = System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name()) - ConverterUtils.toContainerId(containerIdString) - } - - /** The filesystems for which YARN should fetch delegation tokens. */ - private[spark] def hadoopFSsToAccess( - sparkConf: SparkConf, - hadoopConf: Configuration): Set[FileSystem] = { - val filesystemsToAccess = sparkConf.get(FILESYSTEMS_TO_ACCESS) - .map(new Path(_).getFileSystem(hadoopConf)) - .toSet - - val stagingFS = sparkConf.get(STAGING_DIR) - .map(new Path(_).getFileSystem(hadoopConf)) - .getOrElse(FileSystem.get(hadoopConf)) - - filesystemsToAccess + stagingFS - } -} - -object YarnSparkHadoopUtil { // Additional memory overhead // 10% was arrived at experimentally. In the interest of minimizing memory waste while covering // the common cases. Memory overhead tends to grow with container size. @@ -137,14 +53,6 @@ object YarnSparkHadoopUtil { // request types (like map/reduce in hadoop for example) val RM_REQUEST_PRIORITY = Priority.newInstance(1) - def get: YarnSparkHadoopUtil = { - val yarnMode = java.lang.Boolean.parseBoolean( - System.getProperty("SPARK_YARN_MODE", System.getenv("SPARK_YARN_MODE"))) - if (!yarnMode) { - throw new SparkException("YarnSparkHadoopUtil is not available in non-YARN mode!") - } - SparkHadoopUtil.get.asInstanceOf[YarnSparkHadoopUtil] - } /** * Add a path variable to the given environment map. * If the map already contains this key, append the value to the existing value instead. @@ -277,5 +185,42 @@ object YarnSparkHadoopUtil { securityMgr.getModifyAclsGroups) ) } -} + def getContainerId: ContainerId = { + val containerIdString = System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name()) + ConverterUtils.toContainerId(containerIdString) + } + + /** The filesystems for which YARN should fetch delegation tokens. 
*/ + def hadoopFSsToAccess( + sparkConf: SparkConf, + hadoopConf: Configuration): Set[FileSystem] = { + val filesystemsToAccess = sparkConf.get(FILESYSTEMS_TO_ACCESS) + .map(new Path(_).getFileSystem(hadoopConf)) + .toSet + + val stagingFS = sparkConf.get(STAGING_DIR) + .map(new Path(_).getFileSystem(hadoopConf)) + .getOrElse(FileSystem.get(hadoopConf)) + + filesystemsToAccess + stagingFS + } + + def startCredentialUpdater(sparkConf: SparkConf): Unit = { + val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) + val credentialManager = new YARNHadoopDelegationTokenManager( + sparkConf, + hadoopConf, + conf => YarnSparkHadoopUtil.hadoopFSsToAccess(sparkConf, conf)) + credentialUpdater = new CredentialUpdater(sparkConf, hadoopConf, credentialManager) + credentialUpdater.start() + } + + def stopCredentialUpdater(): Unit = { + if (credentialUpdater != null) { + credentialUpdater.stop() + credentialUpdater = null + } + } + +} diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala index 6134757a82fdc..eaf2cff111a49 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/security/AMCredentialRenewer.scala @@ -62,7 +62,7 @@ private[yarn] class AMCredentialRenewer( private val credentialRenewerThread: ScheduledExecutorService = ThreadUtils.newDaemonSingleThreadScheduledExecutor("Credential Refresh Thread") - private val hadoopUtil = YarnSparkHadoopUtil.get + private val hadoopUtil = SparkHadoopUtil.get private val credentialsFile = sparkConf.get(CREDENTIALS_FILE_PATH) private val daysToKeepFiles = sparkConf.get(CREDENTIALS_FILE_MAX_RETENTION) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index b722cc401bb73..0c6206eebe41d 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -66,7 +66,7 @@ private[spark] class YarnClientSchedulerBackend( // reads the credentials from HDFS, just like the executors and updates its own credentials // cache. 
if (conf.contains("spark.yarn.credentials.file")) { - YarnSparkHadoopUtil.get.startCredentialUpdater(conf) + YarnSparkHadoopUtil.startCredentialUpdater(conf) } monitorThread = asyncMonitorApplication() monitorThread.start() @@ -153,7 +153,7 @@ private[spark] class YarnClientSchedulerBackend( client.reportLauncherState(SparkAppHandle.State.FINISHED) super.stop() - YarnSparkHadoopUtil.get.stopCredentialUpdater() + YarnSparkHadoopUtil.stopCredentialUpdater() client.stop() logInfo("Stopped") } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala index e2d477be329c3..62bf9818ee248 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClusterSchedulerBackend.scala @@ -41,7 +41,7 @@ private[spark] class YarnClusterSchedulerBackend( var driverLogs: Option[Map[String, String]] = None try { val yarnConf = new YarnConfiguration(sc.hadoopConfiguration) - val containerId = YarnSparkHadoopUtil.get.getContainerId + val containerId = YarnSparkHadoopUtil.getContainerId val httpAddress = System.getenv(Environment.NM_HOST.name()) + ":" + System.getenv(Environment.NM_HTTP_PORT.name()) diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala index 9c3b18e4ec5f3..ac67f2196e0a0 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala @@ -62,18 +62,14 @@ abstract class BaseYarnClusterSuite protected var hadoopConfDir: File = _ private var logConfDir: File = _ - var oldSystemProperties: Properties = null - def newYarnConfig(): YarnConfiguration override def beforeAll() { super.beforeAll() - oldSystemProperties = SerializationUtils.clone(System.getProperties) tempDir = Utils.createTempDir() logConfDir = new File(tempDir, "log4j") logConfDir.mkdir() - System.setProperty("SPARK_YARN_MODE", "true") val logConfFile = new File(logConfDir, "log4j.properties") Files.write(LOG4J_CONF, logConfFile, StandardCharsets.UTF_8) @@ -124,7 +120,6 @@ abstract class BaseYarnClusterSuite try { yarnCluster.stop() } finally { - System.setProperties(oldSystemProperties) super.afterAll() } } diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index 6cf68427921fd..9d5f5eb621118 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -24,7 +24,6 @@ import java.util.Properties import scala.collection.JavaConverters._ import scala.collection.mutable.{HashMap => MutableHashMap} -import org.apache.commons.lang3.SerializationUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.MRJobConfig @@ -36,34 +35,18 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.util.Records import org.mockito.Matchers.{eq => meq, _} import org.mockito.Mockito._ -import 
org.scalatest.{BeforeAndAfterAll, Matchers} +import org.scalatest.Matchers import org.apache.spark.{SparkConf, SparkFunSuite, TestUtils} import org.apache.spark.deploy.yarn.config._ -import org.apache.spark.util.{ResetSystemProperties, SparkConfWithEnv, Utils} +import org.apache.spark.util.{SparkConfWithEnv, Utils} -class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll - with ResetSystemProperties { +class ClientSuite extends SparkFunSuite with Matchers { import Client._ var oldSystemProperties: Properties = null - override def beforeAll(): Unit = { - super.beforeAll() - oldSystemProperties = SerializationUtils.clone(System.getProperties) - System.setProperty("SPARK_YARN_MODE", "true") - } - - override def afterAll(): Unit = { - try { - System.setProperties(oldSystemProperties) - oldSystemProperties = null - } finally { - super.afterAll() - } - } - test("default Yarn application classpath") { getDefaultYarnApplicationClasspath should be(Fixtures.knownDefYarnAppCP) } @@ -185,7 +168,6 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll } test("configuration and args propagate through createApplicationSubmissionContext") { - val conf = new Configuration() // When parsing tags, duplicates and leading/trailing whitespace should be removed. // Spaces between non-comma strings should be preserved as single tags. Empty strings may or // may not be removed depending on the version of Hadoop being used. @@ -200,7 +182,7 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll val getNewApplicationResponse = Records.newRecord(classOf[GetNewApplicationResponse]) val containerLaunchContext = Records.newRecord(classOf[ContainerLaunchContext]) - val client = new Client(args, conf, sparkConf) + val client = new Client(args, sparkConf) client.createApplicationSubmissionContext( new YarnClientApplication(getNewApplicationResponse, appContext), containerLaunchContext) @@ -407,15 +389,14 @@ class ClientSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll private def createClient( sparkConf: SparkConf, - conf: Configuration = new Configuration(), args: Array[String] = Array()): Client = { val clientArgs = new ClientArguments(args) - spy(new Client(clientArgs, conf, sparkConf)) + spy(new Client(clientArgs, sparkConf)) } private def classpath(client: Client): Array[String] = { val env = new MutableHashMap[String, String]() - populateClasspath(null, client.hadoopConf, client.sparkConf, env) + populateClasspath(null, new Configuration(), client.sparkConf, env) classpath(env) } diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index d5de19072ce29..ab0005d7b53a8 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -115,8 +115,13 @@ class YarnClusterSuite extends BaseYarnClusterSuite { )) } - test("run Spark in yarn-cluster mode with using SparkHadoopUtil.conf") { - testYarnAppUseSparkHadoopUtilConf() + test("yarn-cluster should respect conf overrides in SparkHadoopUtil (SPARK-16414)") { + val result = File.createTempFile("result", null, tempDir) + val finalState = runSpark(false, + mainClassName(YarnClusterDriverUseSparkHadoopUtilConf.getClass), + appArgs = Seq("key=value", result.getAbsolutePath()), + extraConf = Map("spark.hadoop.key" -> 
"value")) + checkResult(finalState, result) } test("run Spark in yarn-client mode with additional jar") { @@ -216,15 +221,6 @@ class YarnClusterSuite extends BaseYarnClusterSuite { checkResult(finalState, result) } - private def testYarnAppUseSparkHadoopUtilConf(): Unit = { - val result = File.createTempFile("result", null, tempDir) - val finalState = runSpark(false, - mainClassName(YarnClusterDriverUseSparkHadoopUtilConf.getClass), - appArgs = Seq("key=value", result.getAbsolutePath()), - extraConf = Map("spark.hadoop.key" -> "value")) - checkResult(finalState, result) - } - private def testWithAddJar(clientMode: Boolean): Unit = { val originalJar = TestUtils.createJarWithFiles(Map("test.resource" -> "ORIGINAL"), tempDir) val driverResult = File.createTempFile("driver", null, tempDir) @@ -424,7 +420,7 @@ private object YarnClusterDriver extends Logging with Matchers { s"Driver logs contain sensitive info (${SECRET_PASSWORD}): \n${log} " ) } - val containerId = YarnSparkHadoopUtil.get.getContainerId + val containerId = YarnSparkHadoopUtil.getContainerId val user = Utils.getCurrentUserName() assert(urlStr.endsWith(s"/node/containerlogs/$containerId/$user/stderr?start=-4096")) } diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala index a057618b39950..f21353aa007c8 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnSparkHadoopUtilSuite.scala @@ -71,14 +71,10 @@ class YarnSparkHadoopUtilSuite extends SparkFunSuite with Matchers with Logging test("Yarn configuration override") { val key = "yarn.nodemanager.hostname" - val default = new YarnConfiguration() - val sparkConf = new SparkConf() .set("spark.hadoop." + key, "someHostName") - val yarnConf = new YarnSparkHadoopUtil().newConfiguration(sparkConf) - - yarnConf.getClass() should be (classOf[YarnConfiguration]) - yarnConf.get(key) should not be default.get(key) + val yarnConf = new YarnConfiguration(SparkHadoopUtil.get.newConfiguration(sparkConf)) + yarnConf.get(key) should be ("someHostName") } @@ -145,45 +141,4 @@ class YarnSparkHadoopUtilSuite extends SparkFunSuite with Matchers with Logging } - test("check different hadoop utils based on env variable") { - try { - System.setProperty("SPARK_YARN_MODE", "true") - assert(SparkHadoopUtil.get.getClass === classOf[YarnSparkHadoopUtil]) - System.setProperty("SPARK_YARN_MODE", "false") - assert(SparkHadoopUtil.get.getClass === classOf[SparkHadoopUtil]) - } finally { - System.clearProperty("SPARK_YARN_MODE") - } - } - - - - // This test needs to live here because it depends on isYarnMode returning true, which can only - // happen in the YARN module. 
- test("security manager token generation") { - try { - System.setProperty("SPARK_YARN_MODE", "true") - val initial = SparkHadoopUtil.get - .getSecretKeyFromUserCredentials(SecurityManager.SECRET_LOOKUP_KEY) - assert(initial === null || initial.length === 0) - - val conf = new SparkConf() - .set(SecurityManager.SPARK_AUTH_CONF, "true") - .set(SecurityManager.SPARK_AUTH_SECRET_CONF, "unused") - val sm = new SecurityManager(conf) - - val generated = SparkHadoopUtil.get - .getSecretKeyFromUserCredentials(SecurityManager.SECRET_LOOKUP_KEY) - assert(generated != null) - val genString = new Text(generated).toString() - assert(genString != "unused") - assert(sm.getSecretKey() === genString) - } finally { - // removeSecretKey() was only added in Hadoop 2.6, so instead we just set the secret - // to an empty string. - SparkHadoopUtil.get.addSecretKeyToUserCredentials(SecurityManager.SECRET_LOOKUP_KEY, "") - System.clearProperty("SPARK_YARN_MODE") - } - } - } diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/YARNHadoopDelegationTokenManagerSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/YARNHadoopDelegationTokenManagerSuite.scala index c918998bde07c..3c7cdc0f1dab8 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/YARNHadoopDelegationTokenManagerSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/security/YARNHadoopDelegationTokenManagerSuite.scala @@ -31,24 +31,15 @@ class YARNHadoopDelegationTokenManagerSuite extends SparkFunSuite with Matchers override def beforeAll(): Unit = { super.beforeAll() - - System.setProperty("SPARK_YARN_MODE", "true") - sparkConf = new SparkConf() hadoopConf = new Configuration() } - override def afterAll(): Unit = { - super.afterAll() - - System.clearProperty("SPARK_YARN_MODE") - } - test("Correctly loads credential providers") { credentialManager = new YARNHadoopDelegationTokenManager( sparkConf, hadoopConf, - conf => YarnSparkHadoopUtil.get.hadoopFSsToAccess(sparkConf, conf)) + conf => YarnSparkHadoopUtil.hadoopFSsToAccess(sparkConf, conf)) credentialManager.credentialProviders.get("yarn-test") should not be (None) } From 1d5597b408485e41812f3645a670864ad88570a0 Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Mon, 4 Dec 2017 15:08:07 -0800 Subject: [PATCH 034/356] [SPARK-22626][SQL][FOLLOWUP] improve documentation and simplify test case ## What changes were proposed in this pull request? This PR improves documentation for not using zero `numRows` statistics and simplifies the test case. The reason why some Hive tables have zero `numRows` is that, in Hive, when stats gathering is disabled, `numRows` is always zero after INSERT command: ``` hive> create table src (key int, value string) stored as orc; hive> desc formatted src; Table Parameters: COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"} numFiles 0 numRows 0 rawDataSize 0 totalSize 0 transient_lastDdlTime 1512399590 hive> set hive.stats.autogather=false; hive> insert into src select 1, 'a'; hive> desc formatted src; Table Parameters: numFiles 1 numRows 0 rawDataSize 0 totalSize 275 transient_lastDdlTime 1512399647 hive> insert into src select 1, 'b'; hive> desc formatted src; Table Parameters: numFiles 2 numRows 0 rawDataSize 0 totalSize 550 transient_lastDdlTime 1512399687 ``` ## How was this patch tested? Modified existing test. Author: Zhenhua Wang Closes #19880 from wzhfy/doc_zero_rowCount. 
--- .../apache/spark/sql/hive/client/HiveClientImpl.scala | 8 +++++--- .../org/apache/spark/sql/hive/StatisticsSuite.scala | 11 +++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 77e836003b39f..08eb5c74f06d7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -426,9 +426,11 @@ private[hive] class HiveClientImpl( // TODO: stats should include all the other two fields (`numFiles` and `numPartitions`). // (see StatsSetupConst in Hive) val stats = - // When table is external, `totalSize` is always zero, which will influence join strategy - // so when `totalSize` is zero, use `rawDataSize` instead. When `rawDataSize` is also zero, - // return None. Later, we will use the other ways to estimate the statistics. + // When table is external, `totalSize` is always zero, which will influence join strategy. + // So when `totalSize` is zero, use `rawDataSize` instead. When `rawDataSize` is also zero, + // return None. + // In Hive, when statistics gathering is disabled, `rawDataSize` and `numRows` is always + // zero after INSERT command. So they are used here only if they are larger than zero. if (totalSize.isDefined && totalSize.get > 0L) { Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount = rowCount.filter(_ > 0))) } else if (rawDataSize.isDefined && rawDataSize.get > 0) { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index ee027e5308265..13f06a2503656 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -1366,17 +1366,16 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto sql("CREATE TABLE maybe_big (c1 bigint)" + "TBLPROPERTIES ('numRows'='0', 'rawDataSize'='60000000000', 'totalSize'='8000000000000')") - val relation = spark.table("maybe_big").queryExecution.analyzed.children.head - .asInstanceOf[HiveTableRelation] + val catalogTable = getCatalogTable("maybe_big") - val properties = relation.tableMeta.ignoredProperties + val properties = catalogTable.ignoredProperties assert(properties("totalSize").toLong > 0) assert(properties("rawDataSize").toLong > 0) assert(properties("numRows").toLong == 0) - assert(relation.stats.sizeInBytes > 0) - // May be cause OOM if rowCount == 0 when enables CBO, see SPARK-22626 for details. - assert(relation.stats.rowCount.isEmpty) + val catalogStats = catalogTable.stats.get + assert(catalogStats.sizeInBytes > 0) + assert(catalogStats.rowCount.isEmpty) } } } From 3887b7eef7b89d3aeecadebc0fdafa47586a232b Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Mon, 4 Dec 2017 17:08:56 -0800 Subject: [PATCH 035/356] [SPARK-22665][SQL] Avoid repartitioning with empty list of expressions ## What changes were proposed in this pull request? Repartitioning by an empty set of expressions is currently possible, even though it is a case that is not handled properly. Indeed, `HashExpression` contains a check that prevents it from running on an empty set of expressions, but this check is not performed while repartitioning. Thus, the PR adds a check to avoid this invalid situation. ## How was this patch tested?
added UT Author: Marco Gaido Closes #19870 from mgaido91/SPARK-22665. --- .../plans/logical/basicLogicalOperators.scala | 12 +++++++----- .../spark/sql/catalyst/analysis/AnalysisSuite.scala | 5 ++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 93de7c1daf5c2..ba5f97d608feb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -22,8 +22,8 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.statsEstimation._ -import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning} +import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, + RangePartitioning, RoundRobinPartitioning} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils import org.apache.spark.util.random.RandomSampler @@ -847,14 +847,16 @@ case class RepartitionByExpression( "`SortOrder`, which means `RangePartitioning`, or none of them are `SortOrder`, which " + "means `HashPartitioning`. In this case we have:" + s""" - |SortOrder: ${sortOrder} - |NonSortOrder: ${nonSortOrder} + |SortOrder: $sortOrder + |NonSortOrder: $nonSortOrder """.stripMargin) if (sortOrder.nonEmpty) { RangePartitioning(sortOrder.map(_.asInstanceOf[SortOrder]), numPartitions) - } else { + } else if (nonSortOrder.nonEmpty) { HashPartitioning(nonSortOrder, numPartitions) + } else { + RoundRobinPartitioning(numPartitions) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 0e2e706a31a05..109fb32aa4a12 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -27,7 +27,8 @@ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.{Cross, Inner} import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, RangePartitioning} +import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, + RangePartitioning, RoundRobinPartitioning} import org.apache.spark.sql.types._ @@ -530,6 +531,8 @@ class AnalysisSuite extends AnalysisTest with Matchers { checkPartitioning[RangePartitioning](numPartitions = 10, exprs = SortOrder('a.attr, Ascending), SortOrder('b.attr, Descending)) + checkPartitioning[RoundRobinPartitioning](numPartitions = 10, exprs = Seq.empty: _*) + intercept[IllegalArgumentException] { checkPartitioning(numPartitions = 0, exprs = Literal(20)) } From 295df746ecb1def5530a044d6670b28821da89f0 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 5 Dec 2017 12:38:26 +0800 Subject: [PATCH 036/356] [SPARK-22677][SQL] cleanup whole stage codegen for hash aggregate ## What changes 
were proposed in this pull request? The `HashAggregateExec` whole stage codegen path is a little messy and hard to understand, this code cleans it up a little bit, especially for the fast hash map part. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #19869 from cloud-fan/hash-agg. --- .../aggregate/HashAggregateExec.scala | 402 +++++++++--------- 1 file changed, 195 insertions(+), 207 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 913978892cd8c..26d8cd7278353 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.execution.vectorized.MutableColumnarRow +import org.apache.spark.sql.execution.vectorized.{ColumnarRow, MutableColumnarRow} import org.apache.spark.sql.types.{DecimalType, StringType, StructType} import org.apache.spark.unsafe.KVIterator import org.apache.spark.util.Utils @@ -444,6 +444,7 @@ case class HashAggregateExec( val funcName = ctx.freshName("doAggregateWithKeysOutput") val keyTerm = ctx.freshName("keyTerm") val bufferTerm = ctx.freshName("bufferTerm") + val numOutput = metricTerm(ctx, "numOutputRows") val body = if (modes.contains(Final) || modes.contains(Complete)) { @@ -520,6 +521,7 @@ case class HashAggregateExec( s""" private void $funcName(UnsafeRow $keyTerm, UnsafeRow $bufferTerm) throws java.io.IOException { + $numOutput.add(1); $body } """) @@ -549,7 +551,7 @@ case class HashAggregateExec( isSupported && isNotByteArrayDecimalType } - private def enableTwoLevelHashMap(ctx: CodegenContext) = { + private def enableTwoLevelHashMap(ctx: CodegenContext): Unit = { if (!checkIfFastHashMapSupported(ctx)) { if (modes.forall(mode => mode == Partial || mode == PartialMerge) && !Utils.isTesting) { logInfo("spark.sql.codegen.aggregate.map.twolevel.enabled is set to true, but" @@ -560,9 +562,8 @@ case class HashAggregateExec( // This is for testing/benchmarking only. // We enforce to first level to be a vectorized hashmap, instead of the default row-based one. 
- sqlContext.getConf("spark.sql.codegen.aggregate.map.vectorized.enable", null) match { - case "true" => isVectorizedHashMapEnabled = true - case null | "" | "false" => None } + isVectorizedHashMapEnabled = sqlContext.getConf( + "spark.sql.codegen.aggregate.map.vectorized.enable", "false") == "true" } } @@ -573,94 +574,84 @@ case class HashAggregateExec( enableTwoLevelHashMap(ctx) } else { sqlContext.getConf("spark.sql.codegen.aggregate.map.vectorized.enable", null) match { - case "true" => logWarning("Two level hashmap is disabled but vectorized hashmap is " + - "enabled.") - case null | "" | "false" => None + case "true" => + logWarning("Two level hashmap is disabled but vectorized hashmap is enabled.") + case _ => } } - fastHashMapTerm = ctx.freshName("fastHashMap") - val fastHashMapClassName = ctx.freshName("FastHashMap") - val fastHashMapGenerator = - if (isVectorizedHashMapEnabled) { - new VectorizedHashMapGenerator(ctx, aggregateExpressions, - fastHashMapClassName, groupingKeySchema, bufferSchema) - } else { - new RowBasedHashMapGenerator(ctx, aggregateExpressions, - fastHashMapClassName, groupingKeySchema, bufferSchema) - } val thisPlan = ctx.addReferenceObj("plan", this) - // Create a name for iterator from vectorized HashMap + // Create a name for the iterator from the fast hash map. val iterTermForFastHashMap = ctx.freshName("fastHashMapIter") if (isFastHashMapEnabled) { + // Generates the fast hash map class and creates the fash hash map term. + fastHashMapTerm = ctx.freshName("fastHashMap") + val fastHashMapClassName = ctx.freshName("FastHashMap") if (isVectorizedHashMapEnabled) { + val generatedMap = new VectorizedHashMapGenerator(ctx, aggregateExpressions, + fastHashMapClassName, groupingKeySchema, bufferSchema).generate() + ctx.addInnerClass(generatedMap) + ctx.addMutableState(fastHashMapClassName, fastHashMapTerm, s"$fastHashMapTerm = new $fastHashMapClassName();") ctx.addMutableState( - "java.util.Iterator", + s"java.util.Iterator<${classOf[ColumnarRow].getName}>", iterTermForFastHashMap) } else { + val generatedMap = new RowBasedHashMapGenerator(ctx, aggregateExpressions, + fastHashMapClassName, groupingKeySchema, bufferSchema).generate() + ctx.addInnerClass(generatedMap) + ctx.addMutableState(fastHashMapClassName, fastHashMapTerm, s"$fastHashMapTerm = new $fastHashMapClassName(" + s"$thisPlan.getTaskMemoryManager(), $thisPlan.getEmptyAggregationBuffer());") ctx.addMutableState( - "org.apache.spark.unsafe.KVIterator", + "org.apache.spark.unsafe.KVIterator", iterTermForFastHashMap) } } + // Create a name for the iterator from the regular hash map. 
+ val iterTerm = ctx.freshName("mapIter") + ctx.addMutableState(classOf[KVIterator[UnsafeRow, UnsafeRow]].getName, iterTerm) // create hashMap hashMapTerm = ctx.freshName("hashMap") val hashMapClassName = classOf[UnsafeFixedWidthAggregationMap].getName - ctx.addMutableState(hashMapClassName, hashMapTerm) + ctx.addMutableState(hashMapClassName, hashMapTerm, s"$hashMapTerm = $thisPlan.createHashMap();") sorterTerm = ctx.freshName("sorter") ctx.addMutableState(classOf[UnsafeKVExternalSorter].getName, sorterTerm) - // Create a name for iterator from HashMap - val iterTerm = ctx.freshName("mapIter") - ctx.addMutableState(classOf[KVIterator[UnsafeRow, UnsafeRow]].getName, iterTerm) - - def generateGenerateCode(): String = { - if (isFastHashMapEnabled) { - if (isVectorizedHashMapEnabled) { - s""" - | ${fastHashMapGenerator.asInstanceOf[VectorizedHashMapGenerator].generate()} - """.stripMargin - } else { - s""" - | ${fastHashMapGenerator.asInstanceOf[RowBasedHashMapGenerator].generate()} - """.stripMargin - } - } else "" - } - ctx.addInnerClass(generateGenerateCode()) - val doAgg = ctx.freshName("doAggregateWithKeys") val peakMemory = metricTerm(ctx, "peakMemory") val spillSize = metricTerm(ctx, "spillSize") val avgHashProbe = metricTerm(ctx, "avgHashProbe") - val doAggFuncName = ctx.addNewFunction(doAgg, - s""" - private void $doAgg() throws java.io.IOException { - $hashMapTerm = $thisPlan.createHashMap(); - ${child.asInstanceOf[CodegenSupport].produce(ctx, this)} - ${if (isFastHashMapEnabled) { - s"$iterTermForFastHashMap = $fastHashMapTerm.rowIterator();"} else ""} + val finishRegularHashMap = s"$iterTerm = $thisPlan.finishAggregate(" + + s"$hashMapTerm, $sorterTerm, $peakMemory, $spillSize, $avgHashProbe);" + val finishHashMap = if (isFastHashMapEnabled) { + s""" + |$iterTermForFastHashMap = $fastHashMapTerm.rowIterator(); + |$finishRegularHashMap + """.stripMargin + } else { + finishRegularHashMap + } - $iterTerm = $thisPlan.finishAggregate($hashMapTerm, $sorterTerm, $peakMemory, $spillSize, - $avgHashProbe); - } - """) + val doAggFuncName = ctx.addNewFunction(doAgg, + s""" + |private void $doAgg() throws java.io.IOException { + | ${child.asInstanceOf[CodegenSupport].produce(ctx, this)} + | $finishHashMap + |} + """.stripMargin) // generate code for output val keyTerm = ctx.freshName("aggKey") val bufferTerm = ctx.freshName("aggBuffer") val outputFunc = generateResultFunction(ctx) - val numOutput = metricTerm(ctx, "numOutputRows") - def outputFromGeneratedMap: String = { + def outputFromFastHashMap: String = { if (isFastHashMapEnabled) { if (isVectorizedHashMapEnabled) { outputFromVectorizedMap @@ -672,48 +663,56 @@ case class HashAggregateExec( def outputFromRowBasedMap: String = { s""" - while ($iterTermForFastHashMap.next()) { - $numOutput.add(1); - UnsafeRow $keyTerm = (UnsafeRow) $iterTermForFastHashMap.getKey(); - UnsafeRow $bufferTerm = (UnsafeRow) $iterTermForFastHashMap.getValue(); - $outputFunc($keyTerm, $bufferTerm); - - if (shouldStop()) return; - } - $fastHashMapTerm.close(); - """ + |while ($iterTermForFastHashMap.next()) { + | UnsafeRow $keyTerm = (UnsafeRow) $iterTermForFastHashMap.getKey(); + | UnsafeRow $bufferTerm = (UnsafeRow) $iterTermForFastHashMap.getValue(); + | $outputFunc($keyTerm, $bufferTerm); + | + | if (shouldStop()) return; + |} + |$fastHashMapTerm.close(); + """.stripMargin } // Iterate over the aggregate rows and convert them from ColumnarRow to UnsafeRow def outputFromVectorizedMap: String = { - val row = ctx.freshName("fastHashMapRow") - ctx.currentVars = 
null - ctx.INPUT_ROW = row - val generateKeyRow = GenerateUnsafeProjection.createCode(ctx, - groupingKeySchema.toAttributes.zipWithIndex + val row = ctx.freshName("fastHashMapRow") + ctx.currentVars = null + ctx.INPUT_ROW = row + val generateKeyRow = GenerateUnsafeProjection.createCode(ctx, + groupingKeySchema.toAttributes.zipWithIndex .map { case (attr, i) => BoundReference(i, attr.dataType, attr.nullable) } - ) - val generateBufferRow = GenerateUnsafeProjection.createCode(ctx, - bufferSchema.toAttributes.zipWithIndex - .map { case (attr, i) => - BoundReference(groupingKeySchema.length + i, attr.dataType, attr.nullable) }) - s""" - | while ($iterTermForFastHashMap.hasNext()) { - | $numOutput.add(1); - | org.apache.spark.sql.execution.vectorized.ColumnarRow $row = - | (org.apache.spark.sql.execution.vectorized.ColumnarRow) - | $iterTermForFastHashMap.next(); - | ${generateKeyRow.code} - | ${generateBufferRow.code} - | $outputFunc(${generateKeyRow.value}, ${generateBufferRow.value}); - | - | if (shouldStop()) return; - | } - | - | $fastHashMapTerm.close(); - """.stripMargin + ) + val generateBufferRow = GenerateUnsafeProjection.createCode(ctx, + bufferSchema.toAttributes.zipWithIndex.map { case (attr, i) => + BoundReference(groupingKeySchema.length + i, attr.dataType, attr.nullable) + }) + val columnarRowCls = classOf[ColumnarRow].getName + s""" + |while ($iterTermForFastHashMap.hasNext()) { + | $columnarRowCls $row = ($columnarRowCls) $iterTermForFastHashMap.next(); + | ${generateKeyRow.code} + | ${generateBufferRow.code} + | $outputFunc(${generateKeyRow.value}, ${generateBufferRow.value}); + | + | if (shouldStop()) return; + |} + | + |$fastHashMapTerm.close(); + """.stripMargin } + def outputFromRegularHashMap: String = { + s""" + |while ($iterTerm.next()) { + | UnsafeRow $keyTerm = (UnsafeRow) $iterTerm.getKey(); + | UnsafeRow $bufferTerm = (UnsafeRow) $iterTerm.getValue(); + | $outputFunc($keyTerm, $bufferTerm); + | + | if (shouldStop()) return; + |} + """.stripMargin + } val aggTime = metricTerm(ctx, "aggTime") val beforeAgg = ctx.freshName("beforeAgg") @@ -726,16 +725,8 @@ case class HashAggregateExec( } // output the result - ${outputFromGeneratedMap} - - while ($iterTerm.next()) { - $numOutput.add(1); - UnsafeRow $keyTerm = (UnsafeRow) $iterTerm.getKey(); - UnsafeRow $bufferTerm = (UnsafeRow) $iterTerm.getValue(); - $outputFunc($keyTerm, $bufferTerm); - - if (shouldStop()) return; - } + $outputFromFastHashMap + $outputFromRegularHashMap $iterTerm.close(); if ($sorterTerm == null) { @@ -745,13 +736,11 @@ case class HashAggregateExec( } private def doConsumeWithKeys(ctx: CodegenContext, input: Seq[ExprCode]): String = { - // create grouping key - ctx.currentVars = input val unsafeRowKeyCode = GenerateUnsafeProjection.createCode( ctx, groupingExpressions.map(e => BindReferences.bindReference[Expression](e, child.output))) val fastRowKeys = ctx.generateExpressions( - groupingExpressions.map(e => BindReferences.bindReference[Expression](e, child.output))) + groupingExpressions.map(e => BindReferences.bindReference[Expression](e, child.output))) val unsafeRowKeys = unsafeRowKeyCode.value val unsafeRowBuffer = ctx.freshName("unsafeRowAggBuffer") val fastRowBuffer = ctx.freshName("fastAggBuffer") @@ -768,12 +757,8 @@ case class HashAggregateExec( // generate hash code for key val hashExpr = Murmur3Hash(groupingExpressions, 42) - ctx.currentVars = input val hashEval = BindReferences.bindReference(hashExpr, child.output).genCode(ctx) - val inputAttr = aggregateBufferAttributes ++ child.output 
- ctx.currentVars = new Array[ExprCode](aggregateBufferAttributes.length) ++ input - val (checkFallbackForGeneratedHashMap, checkFallbackForBytesToBytesMap, resetCounter, incCounter) = if (testFallbackStartsAt.isDefined) { val countTerm = ctx.freshName("fallbackCounter") @@ -784,86 +769,65 @@ case class HashAggregateExec( ("true", "true", "", "") } - // We first generate code to probe and update the fast hash map. If the probe is - // successful the corresponding fast row buffer will hold the mutable row - val findOrInsertFastHashMap: Option[String] = { + val findOrInsertRegularHashMap: String = + s""" + |// generate grouping key + |${unsafeRowKeyCode.code.trim} + |${hashEval.code.trim} + |if ($checkFallbackForBytesToBytesMap) { + | // try to get the buffer from hash map + | $unsafeRowBuffer = + | $hashMapTerm.getAggregationBufferFromUnsafeRow($unsafeRowKeys, ${hashEval.value}); + |} + |// Can't allocate buffer from the hash map. Spill the map and fallback to sort-based + |// aggregation after processing all input rows. + |if ($unsafeRowBuffer == null) { + | if ($sorterTerm == null) { + | $sorterTerm = $hashMapTerm.destructAndCreateExternalSorter(); + | } else { + | $sorterTerm.merge($hashMapTerm.destructAndCreateExternalSorter()); + | } + | $resetCounter + | // the hash map had be spilled, it should have enough memory now, + | // try to allocate buffer again. + | $unsafeRowBuffer = $hashMapTerm.getAggregationBufferFromUnsafeRow( + | $unsafeRowKeys, ${hashEval.value}); + | if ($unsafeRowBuffer == null) { + | // failed to allocate the first page + | throw new OutOfMemoryError("No enough memory for aggregation"); + | } + |} + """.stripMargin + + val findOrInsertHashMap: String = { if (isFastHashMapEnabled) { - Option( - s""" - | - |if ($checkFallbackForGeneratedHashMap) { - | ${fastRowKeys.map(_.code).mkString("\n")} - | if (${fastRowKeys.map("!" + _.isNull).mkString(" && ")}) { - | $fastRowBuffer = $fastHashMapTerm.findOrInsert( - | ${fastRowKeys.map(_.value).mkString(", ")}); - | } - |} - """.stripMargin) + // If fast hash map is on, we first generate code to probe and update the fast hash map. + // If the probe is successful the corresponding fast row buffer will hold the mutable row. + s""" + |if ($checkFallbackForGeneratedHashMap) { + | ${fastRowKeys.map(_.code).mkString("\n")} + | if (${fastRowKeys.map("!" + _.isNull).mkString(" && ")}) { + | $fastRowBuffer = $fastHashMapTerm.findOrInsert( + | ${fastRowKeys.map(_.value).mkString(", ")}); + | } + |} + |// Cannot find the key in fast hash map, try regular hash map. + |if ($fastRowBuffer == null) { + | $findOrInsertRegularHashMap + |} + """.stripMargin } else { - None + findOrInsertRegularHashMap } } + val inputAttr = aggregateBufferAttributes ++ child.output + // Here we set `currentVars(0)` to `currentVars(numBufferSlots)` to null, so that when + // generating code for buffer columns, we use `INPUT_ROW`(will be the buffer row), while + // generating input columns, we use `currentVars`. 
+ ctx.currentVars = new Array[ExprCode](aggregateBufferAttributes.length) ++ input - def updateRowInFastHashMap(isVectorized: Boolean): Option[String] = { - ctx.INPUT_ROW = fastRowBuffer - val boundUpdateExpr = updateExpr.map(BindReferences.bindReference(_, inputAttr)) - val subExprs = ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExpr) - val effectiveCodes = subExprs.codes.mkString("\n") - val fastRowEvals = ctx.withSubExprEliminationExprs(subExprs.states) { - boundUpdateExpr.map(_.genCode(ctx)) - } - val updateFastRow = fastRowEvals.zipWithIndex.map { case (ev, i) => - val dt = updateExpr(i).dataType - ctx.updateColumn(fastRowBuffer, dt, i, ev, updateExpr(i).nullable, isVectorized) - } - Option( - s""" - |// common sub-expressions - |$effectiveCodes - |// evaluate aggregate function - |${evaluateVariables(fastRowEvals)} - |// update fast row - |${updateFastRow.mkString("\n").trim} - | - """.stripMargin) - } - - // Next, we generate code to probe and update the unsafe row hash map. - val findOrInsertInUnsafeRowMap: String = { - s""" - | if ($fastRowBuffer == null) { - | // generate grouping key - | ${unsafeRowKeyCode.code.trim} - | ${hashEval.code.trim} - | if ($checkFallbackForBytesToBytesMap) { - | // try to get the buffer from hash map - | $unsafeRowBuffer = - | $hashMapTerm.getAggregationBufferFromUnsafeRow($unsafeRowKeys, ${hashEval.value}); - | } - | // Can't allocate buffer from the hash map. Spill the map and fallback to sort-based - | // aggregation after processing all input rows. - | if ($unsafeRowBuffer == null) { - | if ($sorterTerm == null) { - | $sorterTerm = $hashMapTerm.destructAndCreateExternalSorter(); - | } else { - | $sorterTerm.merge($hashMapTerm.destructAndCreateExternalSorter()); - | } - | $resetCounter - | // the hash map had be spilled, it should have enough memory now, - | // try to allocate buffer again. - | $unsafeRowBuffer = - | $hashMapTerm.getAggregationBufferFromUnsafeRow($unsafeRowKeys, ${hashEval.value}); - | if ($unsafeRowBuffer == null) { - | // failed to allocate the first page - | throw new OutOfMemoryError("No enough memory for aggregation"); - | } - | } - | } - """.stripMargin - } - - val updateRowInUnsafeRowMap: String = { + val updateRowInRegularHashMap: String = { ctx.INPUT_ROW = unsafeRowBuffer val boundUpdateExpr = updateExpr.map(BindReferences.bindReference(_, inputAttr)) val subExprs = ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExpr) @@ -882,45 +846,69 @@ case class HashAggregateExec( |${evaluateVariables(unsafeRowBufferEvals)} |// update unsafe row buffer |${updateUnsafeRowBuffer.mkString("\n").trim} - """.stripMargin + """.stripMargin + } + + val updateRowInHashMap: String = { + if (isFastHashMapEnabled) { + ctx.INPUT_ROW = fastRowBuffer + val boundUpdateExpr = updateExpr.map(BindReferences.bindReference(_, inputAttr)) + val subExprs = ctx.subexpressionEliminationForWholeStageCodegen(boundUpdateExpr) + val effectiveCodes = subExprs.codes.mkString("\n") + val fastRowEvals = ctx.withSubExprEliminationExprs(subExprs.states) { + boundUpdateExpr.map(_.genCode(ctx)) + } + val updateFastRow = fastRowEvals.zipWithIndex.map { case (ev, i) => + val dt = updateExpr(i).dataType + ctx.updateColumn( + fastRowBuffer, dt, i, ev, updateExpr(i).nullable, isVectorizedHashMapEnabled) + } + + // If fast hash map is on, we first generate code to update row in fast hash map, if the + // previous loop up hit fast hash map. Otherwise, update row in regular hash map. 
+ s""" + |if ($fastRowBuffer != null) { + | // common sub-expressions + | $effectiveCodes + | // evaluate aggregate function + | ${evaluateVariables(fastRowEvals)} + | // update fast row + | ${updateFastRow.mkString("\n").trim} + |} else { + | $updateRowInRegularHashMap + |} + """.stripMargin + } else { + updateRowInRegularHashMap + } } + val declareRowBuffer: String = if (isFastHashMapEnabled) { + val fastRowType = if (isVectorizedHashMapEnabled) { + classOf[MutableColumnarRow].getName + } else { + "UnsafeRow" + } + s""" + |UnsafeRow $unsafeRowBuffer = null; + |$fastRowType $fastRowBuffer = null; + """.stripMargin + } else { + s"UnsafeRow $unsafeRowBuffer = null;" + } // We try to do hash map based in-memory aggregation first. If there is not enough memory (the // hash map will return null for new key), we spill the hash map to disk to free memory, then // continue to do in-memory aggregation and spilling until all the rows had been processed. // Finally, sort the spilled aggregate buffers by key, and merge them together for same key. s""" - UnsafeRow $unsafeRowBuffer = null; - ${ - if (isVectorizedHashMapEnabled) { - s""" - | ${classOf[MutableColumnarRow].getName} $fastRowBuffer = null; - """.stripMargin - } else { - s""" - | UnsafeRow $fastRowBuffer = null; - """.stripMargin - } - } + $declareRowBuffer - ${findOrInsertFastHashMap.getOrElse("")} - - $findOrInsertInUnsafeRowMap + $findOrInsertHashMap $incCounter - if ($fastRowBuffer != null) { - // update fast row - ${ - if (isFastHashMapEnabled) { - updateRowInFastHashMap(isVectorizedHashMapEnabled).getOrElse("") - } else "" - } - } else { - // update unsafe row - $updateRowInUnsafeRowMap - } + $updateRowInHashMap """ } From a8af4da12ce43cd5528a53b5f7f454e9dbe71d6e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 5 Dec 2017 12:43:05 +0800 Subject: [PATCH 037/356] [SPARK-22682][SQL] HashExpression does not need to create global variables ## What changes were proposed in this pull request? It turns out that `HashExpression` can pass around some values via parameter when splitting codes into methods, to save some global variable slots. This can also prevent a weird case that global variable appears in parameter list, which is discovered by https://github.com/apache/spark/pull/19865 ## How was this patch tested? existing tests Author: Wenchen Fan Closes #19878 from cloud-fan/minor. 
--- .../spark/sql/catalyst/expressions/hash.scala | 118 +++++++++++++----- .../expressions/HashExpressionsSuite.scala | 34 +++-- 2 files changed, 106 insertions(+), 46 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index c3289b8299933..d0ed2ab8f3f0e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -270,17 +270,36 @@ abstract class HashExpression[E] extends Expression { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { ev.isNull = "false" - val childrenHash = ctx.splitExpressions(children.map { child => + + val childrenHash = children.map { child => val childGen = child.genCode(ctx) childGen.code + ctx.nullSafeExec(child.nullable, childGen.isNull) { computeHash(childGen.value, child.dataType, ev.value, ctx) } - }) + } + + val hashResultType = ctx.javaType(dataType) + val codes = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { + childrenHash.mkString("\n") + } else { + ctx.splitExpressions( + expressions = childrenHash, + funcName = "computeHash", + arguments = Seq("InternalRow" -> ctx.INPUT_ROW, hashResultType -> ev.value), + returnType = hashResultType, + makeSplitFunction = body => + s""" + |$body + |return ${ev.value}; + """.stripMargin, + foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) + } - ctx.addMutableState(ctx.javaType(dataType), ev.value) - ev.copy(code = s""" - ${ev.value} = $seed; - $childrenHash""") + ev.copy(code = + s""" + |$hashResultType ${ev.value} = $seed; + |$codes + """.stripMargin) } protected def nullSafeElementHash( @@ -389,13 +408,21 @@ abstract class HashExpression[E] extends Expression { input: String, result: String, fields: Array[StructField]): String = { - val hashes = fields.zipWithIndex.map { case (field, index) => + val fieldsHash = fields.zipWithIndex.map { case (field, index) => nullSafeElementHash(input, index.toString, field.nullable, field.dataType, result, ctx) } + val hashResultType = ctx.javaType(dataType) ctx.splitExpressions( - expressions = hashes, - funcName = "getHash", - arguments = Seq("InternalRow" -> input)) + expressions = fieldsHash, + funcName = "computeHashForStruct", + arguments = Seq("InternalRow" -> input, hashResultType -> result), + returnType = hashResultType, + makeSplitFunction = body => + s""" + |$body + |return $result; + """.stripMargin, + foldFunctions = _.map(funcCall => s"$result = $funcCall;").mkString("\n")) } @tailrec @@ -610,25 +637,44 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { ev.isNull = "false" + val childHash = ctx.freshName("childHash") - val childrenHash = ctx.splitExpressions(children.map { child => + val childrenHash = children.map { child => val childGen = child.genCode(ctx) val codeToComputeHash = ctx.nullSafeExec(child.nullable, childGen.isNull) { computeHash(childGen.value, child.dataType, childHash, ctx) } s""" |${childGen.code} + |$childHash = 0; |$codeToComputeHash |${ev.value} = (31 * ${ev.value}) + $childHash; - |$childHash = 0; """.stripMargin - }) + } - ctx.addMutableState(ctx.javaType(dataType), ev.value) - ctx.addMutableState(ctx.JAVA_INT, childHash, s"$childHash = 0;") - ev.copy(code = s""" - ${ev.value} = $seed; - $childrenHash""") + val codes = if 
(ctx.INPUT_ROW == null || ctx.currentVars != null) { + childrenHash.mkString("\n") + } else { + ctx.splitExpressions( + expressions = childrenHash, + funcName = "computeHash", + arguments = Seq("InternalRow" -> ctx.INPUT_ROW, ctx.JAVA_INT -> ev.value), + returnType = ctx.JAVA_INT, + makeSplitFunction = body => + s""" + |${ctx.JAVA_INT} $childHash = 0; + |$body + |return ${ev.value}; + """.stripMargin, + foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) + } + + ev.copy(code = + s""" + |${ctx.JAVA_INT} ${ev.value} = $seed; + |${ctx.JAVA_INT} $childHash = 0; + |$codes + """.stripMargin) } override def eval(input: InternalRow = null): Int = { @@ -730,23 +776,29 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] { input: String, result: String, fields: Array[StructField]): String = { - val localResult = ctx.freshName("localResult") val childResult = ctx.freshName("childResult") - fields.zipWithIndex.map { case (field, index) => + val fieldsHash = fields.zipWithIndex.map { case (field, index) => + val computeFieldHash = nullSafeElementHash( + input, index.toString, field.nullable, field.dataType, childResult, ctx) s""" - $childResult = 0; - ${nullSafeElementHash(input, index.toString, field.nullable, field.dataType, - childResult, ctx)} - $localResult = (31 * $localResult) + $childResult; - """ - }.mkString( - s""" - int $localResult = 0; - int $childResult = 0; - """, - "", - s"$result = (31 * $result) + $localResult;" - ) + |$childResult = 0; + |$computeFieldHash + |$result = (31 * $result) + $childResult; + """.stripMargin + } + + s"${ctx.JAVA_INT} $childResult = 0;\n" + ctx.splitExpressions( + expressions = fieldsHash, + funcName = "computeHashForStruct", + arguments = Seq("InternalRow" -> input, ctx.JAVA_INT -> result), + returnType = ctx.JAVA_INT, + makeSplitFunction = body => + s""" + |${ctx.JAVA_INT} $childResult = 0; + |$body + |return $result; + """.stripMargin, + foldFunctions = _.map(funcCall => s"$result = $funcCall;").mkString("\n")) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala index 112a4a09728ae..4281c89ac475d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala @@ -27,6 +27,7 @@ import org.scalatest.exceptions.TestFailedException import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{RandomDataGenerator, Row} +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.{ExamplePointUDT, RowEncoder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData} @@ -620,23 +621,30 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("SPARK-18207: Compute hash for a lot of expressions") { + def checkResult(schema: StructType, input: InternalRow): Unit = { + val exprs = schema.fields.zipWithIndex.map { case (f, i) => + BoundReference(i, f.dataType, true) + } + val murmur3HashExpr = Murmur3Hash(exprs, 42) + val murmur3HashPlan = GenerateMutableProjection.generate(Seq(murmur3HashExpr)) + val murmursHashEval = Murmur3Hash(exprs, 42).eval(input) + assert(murmur3HashPlan(input).getInt(0) == murmursHashEval) + + val 
hiveHashExpr = HiveHash(exprs) + val hiveHashPlan = GenerateMutableProjection.generate(Seq(hiveHashExpr)) + val hiveHashEval = HiveHash(exprs).eval(input) + assert(hiveHashPlan(input).getInt(0) == hiveHashEval) + } + val N = 1000 val wideRow = new GenericInternalRow( Seq.tabulate(N)(i => UTF8String.fromString(i.toString)).toArray[Any]) - val schema = StructType((1 to N).map(i => StructField("", StringType))) - - val exprs = schema.fields.zipWithIndex.map { case (f, i) => - BoundReference(i, f.dataType, true) - } - val murmur3HashExpr = Murmur3Hash(exprs, 42) - val murmur3HashPlan = GenerateMutableProjection.generate(Seq(murmur3HashExpr)) - val murmursHashEval = Murmur3Hash(exprs, 42).eval(wideRow) - assert(murmur3HashPlan(wideRow).getInt(0) == murmursHashEval) + val schema = StructType((1 to N).map(i => StructField(i.toString, StringType))) + checkResult(schema, wideRow) - val hiveHashExpr = HiveHash(exprs) - val hiveHashPlan = GenerateMutableProjection.generate(Seq(hiveHashExpr)) - val hiveHashEval = HiveHash(exprs).eval(wideRow) - assert(hiveHashPlan(wideRow).getInt(0) == hiveHashEval) + val nestedRow = InternalRow(wideRow) + val nestedSchema = new StructType().add("nested", schema) + checkResult(nestedSchema, nestedRow) } test("SPARK-22284: Compute hash for nested structs") { From 53e5251bb36cfa36ffa9058887a9944f87f26879 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 5 Dec 2017 20:43:02 +0800 Subject: [PATCH 038/356] [SPARK-22675][SQL] Refactoring PropagateTypes in TypeCoercion ## What changes were proposed in this pull request? PropagateTypes are called twice in TypeCoercion. We do not need to call it twice. Instead, we should call it after each change on the types. ## How was this patch tested? The existing tests Author: gatorsmile Closes #19874 from gatorsmile/deduplicatePropagateTypes. 
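As a rough, self-contained illustration (toy plan and rule types, not Spark's `LogicalPlan` or `Rule` classes), the sketch below shows the template-method shape that the new `TypeCoercionRule` trait in the diff introduces: each rule implements only `coerceTypes`, and the shared `apply` runs the type-propagation step once, and only when the rule actually changed the plan.

```scala
// Toy sketch of the TypeCoercionRule pattern; types and propagation logic are
// deliberately simplified and are not Spark's.
object TypeCoercionRuleSketch {
  final case class Plan(exprType: String, children: Seq[Plan] = Nil)

  trait CoercionRule {
    protected def coerceTypes(plan: Plan): Plan

    // Shared post-step, analogous to the old PropagateTypes rule: push updated
    // child types back up to the parents.
    private def propagateTypes(plan: Plan): Plan =
      if (plan.children.isEmpty) plan
      else {
        val newChildren = plan.children.map(propagateTypes)
        plan.copy(exprType = newChildren.head.exprType, children = newChildren)
      }

    final def apply(plan: Plan): Plan = {
      val newPlan = coerceTypes(plan)
      if (newPlan == plan) plan else propagateTypes(newPlan) // skip the extra walk when nothing changed
    }
  }

  // Example rule: if any sibling is a double, widen int siblings to double.
  object WidenToDouble extends CoercionRule {
    protected def coerceTypes(plan: Plan): Plan = {
      val children = plan.children.map(coerceTypes)
      val widened =
        if (children.exists(_.exprType == "double")) {
          children.map(c => if (c.exprType == "int") c.copy(exprType = "double") else c)
        } else children
      plan.copy(children = widened)
    }
  }

  def main(args: Array[String]): Unit = {
    val plan = Plan("int", Seq(Plan("int"), Plan("double")))
    // The parent's type is refreshed by the propagation step after coercion.
    println(WidenToDouble(plan))
  }
}
```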
--- .../catalyst/analysis/DecimalPrecision.scala | 4 +- .../sql/catalyst/analysis/TypeCoercion.scala | 116 ++++++++++-------- .../catalyst/expressions/Canonicalize.scala | 2 +- .../apache/spark/sql/SQLQueryTestSuite.scala | 2 +- .../apache/spark/sql/TPCDSQuerySuite.scala | 3 + .../execution/HiveCompatibilitySuite.scala | 2 +- 6 files changed, 71 insertions(+), 58 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala index fd2ac78b25dbd..070bc542e4852 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala @@ -58,7 +58,7 @@ import org.apache.spark.sql.types._ * - FLOAT and DOUBLE cause fixed-length decimals to turn into DOUBLE */ // scalastyle:on -object DecimalPrecision extends Rule[LogicalPlan] { +object DecimalPrecision extends TypeCoercionRule { import scala.math.{max, min} private def isFloat(t: DataType): Boolean = t == FloatType || t == DoubleType @@ -78,7 +78,7 @@ object DecimalPrecision extends Rule[LogicalPlan] { PromotePrecision(Cast(e, dataType)) } - def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveOperators { // fix decimal precision for expressions case q => q.transformExpressionsUp( decimalAndDecimal.orElse(integralAndDecimalLiteral).orElse(nondecimalAndDecimal)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 28be955e08a0d..1ee2f6e941045 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -22,6 +22,7 @@ import javax.annotation.Nullable import scala.annotation.tailrec import scala.collection.mutable +import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical._ @@ -45,8 +46,7 @@ import org.apache.spark.sql.types._ object TypeCoercion { val typeCoercionRules = - PropagateTypes :: - InConversion :: + InConversion :: WidenSetOperationTypes :: PromoteStrings :: DecimalPrecision :: @@ -56,7 +56,6 @@ object TypeCoercion { IfCoercion :: StackCoercion :: Division :: - PropagateTypes :: ImplicitTypeCasts :: DateTimeOperations :: WindowFrameCoercion :: @@ -220,38 +219,6 @@ object TypeCoercion { private def haveSameType(exprs: Seq[Expression]): Boolean = exprs.map(_.dataType).distinct.length == 1 - /** - * Applies any changes to [[AttributeReference]] data types that are made by other rules to - * instances higher in the query tree. - */ - object PropagateTypes extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - - // No propagation required for leaf nodes. - case q: LogicalPlan if q.children.isEmpty => q - - // Don't propagate types from unresolved children. 
- case q: LogicalPlan if !q.childrenResolved => q - - case q: LogicalPlan => - val inputMap = q.inputSet.toSeq.map(a => (a.exprId, a)).toMap - q transformExpressions { - case a: AttributeReference => - inputMap.get(a.exprId) match { - // This can happen when an Attribute reference is born in a non-leaf node, for - // example due to a call to an external script like in the Transform operator. - // TODO: Perhaps those should actually be aliases? - case None => a - // Leave the same if the dataTypes match. - case Some(newType) if a.dataType == newType.dataType => a - case Some(newType) => - logDebug(s"Promoting $a to $newType in ${q.simpleString}") - newType - } - } - } - } - /** * Widens numeric types and converts strings to numbers when appropriate. * @@ -345,7 +312,7 @@ object TypeCoercion { /** * Promotes strings that appear in arithmetic expressions. */ - object PromoteStrings extends Rule[LogicalPlan] { + object PromoteStrings extends TypeCoercionRule { private def castExpr(expr: Expression, targetType: DataType): Expression = { (expr.dataType, targetType) match { case (NullType, dt) => Literal.create(null, targetType) @@ -354,7 +321,7 @@ object TypeCoercion { } } - def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -403,7 +370,7 @@ object TypeCoercion { * operator type is found the original expression will be returned and an * Analysis Exception will be raised at the type checking phase. */ - object InConversion extends Rule[LogicalPlan] { + object InConversion extends TypeCoercionRule { private def flattenExpr(expr: Expression): Seq[Expression] = { expr match { // Multi columns in IN clause is represented as a CreateNamedStruct. @@ -413,7 +380,7 @@ object TypeCoercion { } } - def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -512,8 +479,8 @@ object TypeCoercion { /** * This ensure that the types for various functions are as expected. */ - object FunctionArgumentConversion extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + object FunctionArgumentConversion extends TypeCoercionRule { + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -602,8 +569,8 @@ object TypeCoercion { * Hive only performs integral division with the DIV operator. The arguments to / are always * converted to fractional types. */ - object Division extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + object Division extends TypeCoercionRule { + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who has not been resolved yet, // as this is an extra rule which should be applied at last. case e if !e.childrenResolved => e @@ -624,8 +591,8 @@ object TypeCoercion { /** * Coerces the type of different branches of a CASE WHEN statement to a common type. 
*/ - object CaseWhenCoercion extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + object CaseWhenCoercion extends TypeCoercionRule { + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { case c: CaseWhen if c.childrenResolved && !c.valueTypesEqual => val maybeCommonType = findWiderCommonType(c.valueTypes) maybeCommonType.map { commonType => @@ -654,8 +621,8 @@ object TypeCoercion { /** * Coerces the type of different branches of If statement to a common type. */ - object IfCoercion extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + object IfCoercion extends TypeCoercionRule { + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { case e if !e.childrenResolved => e // Find tightest common type for If, if the true value and false value have different types. case i @ If(pred, left, right) if left.dataType != right.dataType => @@ -674,8 +641,8 @@ object TypeCoercion { /** * Coerces NullTypes in the Stack expression to the column types of the corresponding positions. */ - object StackCoercion extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { + object StackCoercion extends TypeCoercionRule { + override def coerceTypes(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { case s @ Stack(children) if s.childrenResolved && s.hasFoldableNumRows => Stack(children.zipWithIndex.map { // The first child is the number of rows for stack. @@ -711,8 +678,8 @@ object TypeCoercion { /** * Casts types according to the expected input types for [[Expression]]s. */ - object ImplicitTypeCasts extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + object ImplicitTypeCasts extends TypeCoercionRule { + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -828,8 +795,8 @@ object TypeCoercion { /** * Cast WindowFrame boundaries to the type they operate upon. */ - object WindowFrameCoercion extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + object WindowFrameCoercion extends TypeCoercionRule { + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { case s @ WindowSpecDefinition(_, Seq(order), SpecifiedWindowFrame(RangeFrame, lower, upper)) if order.resolved => s.copy(frameSpecification = SpecifiedWindowFrame( @@ -850,3 +817,46 @@ object TypeCoercion { } } } + +trait TypeCoercionRule extends Rule[LogicalPlan] with Logging { + /** + * Applies any changes to [[AttributeReference]] data types that are made by the transform method + * to instances higher in the query tree. + */ + def apply(plan: LogicalPlan): LogicalPlan = { + val newPlan = coerceTypes(plan) + if (plan.fastEquals(newPlan)) { + plan + } else { + propagateTypes(newPlan) + } + } + + protected def coerceTypes(plan: LogicalPlan): LogicalPlan + + private def propagateTypes(plan: LogicalPlan): LogicalPlan = plan transformUp { + // No propagation required for leaf nodes. + case q: LogicalPlan if q.children.isEmpty => q + + // Don't propagate types from unresolved children. 
+ case q: LogicalPlan if !q.childrenResolved => q + + case q: LogicalPlan => + val inputMap = q.inputSet.toSeq.map(a => (a.exprId, a)).toMap + q transformExpressions { + case a: AttributeReference => + inputMap.get(a.exprId) match { + // This can happen when an Attribute reference is born in a non-leaf node, for + // example due to a call to an external script like in the Transform operator. + // TODO: Perhaps those should actually be aliases? + case None => a + // Leave the same if the dataTypes match. + case Some(newType) if a.dataType == newType.dataType => a + case Some(newType) => + logDebug( + s"Promoting $a from ${a.dataType} to ${newType.dataType} in ${q.simpleString}") + newType + } + } + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala index 65e497afc12cd..d848ba18356d3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala @@ -31,7 +31,7 @@ package org.apache.spark.sql.catalyst.expressions * - [[EqualTo]] and [[EqualNullSafe]] are reordered by `hashCode`. * - Other comparisons ([[GreaterThan]], [[LessThan]]) are reversed by `hashCode`. */ -object Canonicalize extends { +object Canonicalize { def execute(e: Expression): Expression = { expressionReorder(ignoreNamesTypes(e)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index e3901af4b9988..cac3e12dde3e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -300,7 +300,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { Locale.setDefault(originalLocale) // For debugging dump some statistics about how much time was spent in various optimizer rules - logWarning(RuleExecutor.dumpTimeSpent()) + logInfo(RuleExecutor.dumpTimeSpent()) } finally { super.afterAll() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala index e47d4b0ee25d4..dbea03689785c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import org.scalatest.BeforeAndAfterAll +import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.util.resourceToString import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext @@ -39,6 +40,8 @@ class TPCDSQuerySuite extends QueryTest with SharedSQLContext with BeforeAndAfte */ protected override def afterAll(): Unit = { try { + // For debugging dump some statistics about how much time was spent in various optimizer rules + logInfo(RuleExecutor.dumpTimeSpent()) spark.sessionState.catalog.reset() } finally { super.afterAll() diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index 45791c69b4cb7..def70a516a96b 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ 
b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -76,7 +76,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, originalSessionLocalTimeZone) // For debugging dump some statistics about how much time was spent in various optimizer rules - logWarning(RuleExecutor.dumpTimeSpent()) + logInfo(RuleExecutor.dumpTimeSpent()) } finally { super.afterAll() } From 326f1d6728a7734c228d8bfaa69442a1c7b92e9b Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 5 Dec 2017 20:46:35 +0800 Subject: [PATCH 039/356] [SPARK-20728][SQL] Make OrcFileFormat configurable between sql/hive and sql/core ## What changes were proposed in this pull request? This PR aims to provide a configuration to choose the default `OrcFileFormat` from legacy `sql/hive` module or new `sql/core` module. For example, this configuration will affects the following operations. ```scala spark.read.orc(...) ``` ```sql CREATE TABLE t USING ORC ... ``` ## How was this patch tested? Pass the Jenkins with new test suites. Author: Dongjoon Hyun Closes #19871 from dongjoon-hyun/spark-sql-orc-enabled. --- .../apache/spark/sql/internal/SQLConf.scala | 8 ++ ...pache.spark.sql.sources.DataSourceRegister | 1 + .../apache/spark/sql/DataFrameReader.scala | 2 +- .../apache/spark/sql/DataFrameWriter.scala | 2 +- .../spark/sql/execution/command/tables.scala | 6 +- .../execution/datasources/DataSource.scala | 23 +++++- .../sql/execution/datasources/rules.scala | 5 +- .../org/apache/spark/sql/SQLQuerySuite.scala | 23 +++++- .../sql/sources/DDLSourceLoadSuite.scala | 7 -- .../sql/test/DataFrameReaderWriterSuite.scala | 79 +++++++++++-------- .../spark/sql/hive/HiveStrategies.scala | 16 +++- .../spark/sql/hive/orc/OrcQuerySuite.scala | 19 ++++- 12 files changed, 136 insertions(+), 55 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 8abb4262d7352..ce9cc562b220f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -363,6 +363,14 @@ object SQLConf { .checkValues(Set("none", "uncompressed", "snappy", "zlib", "lzo")) .createWithDefault("snappy") + val ORC_IMPLEMENTATION = buildConf("spark.sql.orc.impl") + .doc("When native, use the native version of ORC support instead of the ORC library in Hive " + + "1.2.1. 
It is 'hive' by default prior to Spark 2.3.") + .internal() + .stringConf + .checkValues(Set("hive", "native")) + .createWithDefault("native") + val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown") .doc("When true, enable filter pushdown for ORC files.") .booleanConf diff --git a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index 0c5f3f22e31e8..6cdfe2fae5642 100644 --- a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -1,6 +1,7 @@ org.apache.spark.sql.execution.datasources.csv.CSVFileFormat org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider org.apache.spark.sql.execution.datasources.json.JsonFileFormat +org.apache.spark.sql.execution.datasources.orc.OrcFileFormat org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat org.apache.spark.sql.execution.datasources.text.TextFileFormat org.apache.spark.sql.execution.streaming.ConsoleSinkProvider diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 17966eecfc051..ea1cf66775235 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -182,7 +182,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { "read files of Hive data source directly.") } - val cls = DataSource.lookupDataSource(source) + val cls = DataSource.lookupDataSource(source, sparkSession.sessionState.conf) if (classOf[DataSourceV2].isAssignableFrom(cls)) { val options = new DataSourceV2Options(extraOptions.asJava) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 35abeccfd514a..59a01e61124f7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -234,7 +234,7 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { assertNotBucketed("save") - val cls = DataSource.lookupDataSource(source) + val cls = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) if (classOf[DataSourceV2].isAssignableFrom(cls)) { cls.newInstance() match { case ds: WriteSupport => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index c42e6c3257fad..e400975f19708 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -40,6 +40,7 @@ import org.apache.spark.sql.execution.datasources.{DataSource, PartitioningUtils import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat import org.apache.spark.sql.execution.datasources.json.JsonFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.util.SchemaUtils import org.apache.spark.util.Utils @@ -190,7 +191,7 @@ case class AlterTableAddColumnsCommand( colsToAdd: Seq[StructField]) extends 
RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { val catalog = sparkSession.sessionState.catalog - val catalogTable = verifyAlterTableAddColumn(catalog, table) + val catalogTable = verifyAlterTableAddColumn(sparkSession.sessionState.conf, catalog, table) try { sparkSession.catalog.uncacheTable(table.quotedString) @@ -216,6 +217,7 @@ case class AlterTableAddColumnsCommand( * For datasource table, it currently only supports parquet, json, csv. */ private def verifyAlterTableAddColumn( + conf: SQLConf, catalog: SessionCatalog, table: TableIdentifier): CatalogTable = { val catalogTable = catalog.getTempViewOrPermanentTableMetadata(table) @@ -229,7 +231,7 @@ case class AlterTableAddColumnsCommand( } if (DDLUtils.isDatasourceTable(catalogTable)) { - DataSource.lookupDataSource(catalogTable.provider.get).newInstance() match { + DataSource.lookupDataSource(catalogTable.provider.get, conf).newInstance() match { // For datasource table, this command can only support the following File format. // TextFileFormat only default to one column "value" // Hive type is already considered as hive serde table, so the logic will not diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index b43d282bd434c..5f12d5f93a35c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -36,8 +36,10 @@ import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.execution.datasources.csv.CSVFileFormat import org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider import org.apache.spark.sql.execution.datasources.json.JsonFileFormat +import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{CalendarIntervalType, StructType} @@ -85,7 +87,8 @@ case class DataSource( case class SourceInfo(name: String, schema: StructType, partitionColumns: Seq[String]) - lazy val providingClass: Class[_] = DataSource.lookupDataSource(className) + lazy val providingClass: Class[_] = + DataSource.lookupDataSource(className, sparkSession.sessionState.conf) lazy val sourceInfo: SourceInfo = sourceSchema() private val caseInsensitiveOptions = CaseInsensitiveMap(options) private val equality = sparkSession.sessionState.conf.resolver @@ -537,6 +540,7 @@ object DataSource extends Logging { val csv = classOf[CSVFileFormat].getCanonicalName val libsvm = "org.apache.spark.ml.source.libsvm.LibSVMFileFormat" val orc = "org.apache.spark.sql.hive.orc.OrcFileFormat" + val nativeOrc = classOf[OrcFileFormat].getCanonicalName Map( "org.apache.spark.sql.jdbc" -> jdbc, @@ -553,6 +557,8 @@ object DataSource extends Logging { "org.apache.spark.sql.execution.datasources.parquet.DefaultSource" -> parquet, "org.apache.spark.sql.hive.orc.DefaultSource" -> orc, "org.apache.spark.sql.hive.orc" -> orc, + "org.apache.spark.sql.execution.datasources.orc.DefaultSource" -> nativeOrc, + "org.apache.spark.sql.execution.datasources.orc" -> nativeOrc, "org.apache.spark.ml.source.libsvm.DefaultSource" -> libsvm, "org.apache.spark.ml.source.libsvm" -> libsvm, 
"com.databricks.spark.csv" -> csv @@ -568,8 +574,16 @@ object DataSource extends Logging { "org.apache.spark.Logging") /** Given a provider name, look up the data source class definition. */ - def lookupDataSource(provider: String): Class[_] = { - val provider1 = backwardCompatibilityMap.getOrElse(provider, provider) + def lookupDataSource(provider: String, conf: SQLConf): Class[_] = { + val provider1 = backwardCompatibilityMap.getOrElse(provider, provider) match { + case name if name.equalsIgnoreCase("orc") && + conf.getConf(SQLConf.ORC_IMPLEMENTATION) == "native" => + classOf[OrcFileFormat].getCanonicalName + case name if name.equalsIgnoreCase("orc") && + conf.getConf(SQLConf.ORC_IMPLEMENTATION) == "hive" => + "org.apache.spark.sql.hive.orc.OrcFileFormat" + case name => name + } val provider2 = s"$provider1.DefaultSource" val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[DataSourceRegister], loader) @@ -587,7 +601,8 @@ object DataSource extends Logging { if (provider1.toLowerCase(Locale.ROOT) == "orc" || provider1.startsWith("org.apache.spark.sql.hive.orc")) { throw new AnalysisException( - "The ORC data source must be used with Hive support enabled") + "Hive-based ORC data source must be used with Hive support enabled. " + + "Please use native ORC data source instead") } else if (provider1.toLowerCase(Locale.ROOT) == "avro" || provider1 == "com.databricks.spark.avro") { throw new AnalysisException( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 60c430bcfece2..6e08df75b8a4c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -108,8 +108,9 @@ case class PreprocessTableCreation(sparkSession: SparkSession) extends Rule[Logi } // Check if the specified data source match the data source of the existing table. - val existingProvider = DataSource.lookupDataSource(existingTable.provider.get) - val specifiedProvider = DataSource.lookupDataSource(tableDesc.provider.get) + val conf = sparkSession.sessionState.conf + val existingProvider = DataSource.lookupDataSource(existingTable.provider.get, conf) + val specifiedProvider = DataSource.lookupDataSource(tableDesc.provider.get, conf) // TODO: Check that options from the resolved relation match the relation that we are // inserting into (i.e. using the same compression). 
if (existingProvider != specifiedProvider) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 31d9b909ad463..86bd9b95bca6a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -28,6 +28,8 @@ import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, CartesianProductExec, SortMergeJoinExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -1664,7 +1666,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { e = intercept[AnalysisException] { sql(s"select id from `org.apache.spark.sql.hive.orc`.`file_path`") } - assert(e.message.contains("The ORC data source must be used with Hive support enabled")) + assert(e.message.contains("Hive-based ORC data source must be used with Hive support")) e = intercept[AnalysisException] { sql(s"select id from `com.databricks.spark.avro`.`file_path`") @@ -2782,4 +2784,23 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { assert(spark.read.format(orc).load(path).collect().length == 2) } } + + test("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") { + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "hive") { + val e = intercept[AnalysisException] { + sql("CREATE TABLE spark_20728(a INT) USING ORC") + } + assert(e.message.contains("Hive-based ORC data source must be used with Hive support")) + } + + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "native") { + withTable("spark_20728") { + sql("CREATE TABLE spark_20728(a INT) USING ORC") + val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst { + case l: LogicalRelation => l.relation.asInstanceOf[HadoopFsRelation].fileFormat.getClass + } + assert(fileFormat == Some(classOf[OrcFileFormat])) + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLSourceLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLSourceLoadSuite.scala index 3ce6ae3c52927..f22d843bfabde 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLSourceLoadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/DDLSourceLoadSuite.scala @@ -53,13 +53,6 @@ class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext { assert(spark.read.format("org.apache.spark.sql.sources.FakeSourceOne") .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))) } - - test("should fail to load ORC without Hive Support") { - val e = intercept[AnalysisException] { - spark.read.format("orc").load() - } - assert(e.message.contains("The ORC data source must be used with Hive support enabled")) - } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index a5d7e6257a6df..8c9bb7d56a35f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -155,7 +155,6 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be } } - test("resolve default source") { spark.read .format("org.apache.spark.sql.test") @@ -478,42 +477,56 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be spark.read.schema(userSchema).parquet(Seq(dir, dir): _*), expData ++ expData, userSchema) } - /** - * This only tests whether API compiles, but does not run it as orc() - * cannot be run without Hive classes. - */ - ignore("orc - API") { - // Reader, with user specified schema - // Refer to csv-specific test suites for behavior without user specified schema - spark.read.schema(userSchema).orc() - spark.read.schema(userSchema).orc(dir) - spark.read.schema(userSchema).orc(dir, dir, dir) - spark.read.schema(userSchema).orc(Seq(dir, dir): _*) - Option(dir).map(spark.read.schema(userSchema).orc) + test("orc - API and behavior regarding schema") { + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "native") { + // Writer + spark.createDataset(data).toDF("str").write.mode(SaveMode.Overwrite).orc(dir) + val df = spark.read.orc(dir) + checkAnswer(df, spark.createDataset(data).toDF()) + val schema = df.schema - // Writer - spark.range(10).write.orc(dir) + // Reader, without user specified schema + intercept[AnalysisException] { + testRead(spark.read.orc(), Seq.empty, schema) + } + testRead(spark.read.orc(dir), data, schema) + testRead(spark.read.orc(dir, dir), data ++ data, schema) + testRead(spark.read.orc(Seq(dir, dir): _*), data ++ data, schema) + // Test explicit calls to single arg method - SPARK-16009 + testRead(Option(dir).map(spark.read.orc).get, data, schema) + + // Reader, with user specified schema, data should be nulls as schema in file different + // from user schema + val expData = Seq[String](null, null, null) + testRead(spark.read.schema(userSchema).orc(), Seq.empty, userSchema) + testRead(spark.read.schema(userSchema).orc(dir), expData, userSchema) + testRead(spark.read.schema(userSchema).orc(dir, dir), expData ++ expData, userSchema) + testRead( + spark.read.schema(userSchema).orc(Seq(dir, dir): _*), expData ++ expData, userSchema) + } } test("column nullability and comment - write and then read") { - Seq("json", "parquet", "csv").foreach { format => - val schema = StructType( - StructField("cl1", IntegerType, nullable = false).withComment("test") :: - StructField("cl2", IntegerType, nullable = true) :: - StructField("cl3", IntegerType, nullable = true) :: Nil) - val row = Row(3, null, 4) - val df = spark.createDataFrame(sparkContext.parallelize(row :: Nil), schema) - - val tableName = "tab" - withTable(tableName) { - df.write.format(format).mode("overwrite").saveAsTable(tableName) - // Verify the DDL command result: DESCRIBE TABLE - checkAnswer( - sql(s"desc $tableName").select("col_name", "comment").where($"comment" === "test"), - Row("cl1", "test") :: Nil) - // Verify the schema - val expectedFields = schema.fields.map(f => f.copy(nullable = true)) - assert(spark.table(tableName).schema == schema.copy(fields = expectedFields)) + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "native") { + Seq("json", "orc", "parquet", "csv").foreach { format => + val schema = StructType( + StructField("cl1", IntegerType, nullable = false).withComment("test") :: + StructField("cl2", IntegerType, nullable = true) :: + StructField("cl3", IntegerType, nullable = true) :: Nil) + val row = Row(3, null, 4) + val df = 
spark.createDataFrame(sparkContext.parallelize(row :: Nil), schema) + + val tableName = "tab" + withTable(tableName) { + df.write.format(format).mode("overwrite").saveAsTable(tableName) + // Verify the DDL command result: DESCRIBE TABLE + checkAnswer( + sql(s"desc $tableName").select("col_name", "comment").where($"comment" === "test"), + Row("cl1", "test") :: Nil) + // Verify the schema + val expectedFields = schema.fields.map(f => f.copy(nullable = true)) + assert(spark.table(tableName).schema == schema.copy(fields = expectedFields)) + } } } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index ee1f6ee173063..3018c0642f062 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -34,7 +34,6 @@ import org.apache.spark.sql.execution.command.{CreateTableCommand, DDLUtils} import org.apache.spark.sql.execution.datasources.{CreateTable, LogicalRelation} import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetOptions} import org.apache.spark.sql.hive.execution._ -import org.apache.spark.sql.hive.orc.OrcFileFormat import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} @@ -195,8 +194,19 @@ case class RelationConversions( .convertToLogicalRelation(relation, options, classOf[ParquetFileFormat], "parquet") } else { val options = relation.tableMeta.storage.properties - sessionCatalog.metastoreCatalog - .convertToLogicalRelation(relation, options, classOf[OrcFileFormat], "orc") + if (conf.getConf(SQLConf.ORC_IMPLEMENTATION) == "native") { + sessionCatalog.metastoreCatalog.convertToLogicalRelation( + relation, + options, + classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat], + "orc") + } else { + sessionCatalog.metastoreCatalog.convertToLogicalRelation( + relation, + options, + classOf[org.apache.spark.sql.hive.orc.OrcFileFormat], + "orc") + } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index 1fa9091f967a3..1ffaf30311037 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -28,7 +28,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.HiveTableRelation -import org.apache.spark.sql.execution.datasources.{LogicalRelation, RecordReaderIterator} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, RecordReaderIterator} import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ @@ -621,4 +621,21 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { makeOrcFile((1 to 10).map(Tuple1.apply), path2) assertResult(20)(read.orc(path1.getCanonicalPath, path2.getCanonicalPath).count()) } + + test("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") { + Seq( + ("native", classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat]), + ("hive", classOf[org.apache.spark.sql.hive.orc.OrcFileFormat])).foreach { case (i, format) => + + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> i) { + withTable("spark_20728") { + 
sql("CREATE TABLE spark_20728(a INT) USING ORC") + val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst { + case l: LogicalRelation => l.relation.asInstanceOf[HadoopFsRelation].fileFormat.getClass + } + assert(fileFormat == Some(format)) + } + } + } + } } From 03fdc92e42d260a2b7c0090115f162ba5c091aae Mon Sep 17 00:00:00 2001 From: Carson Wang Date: Tue, 5 Dec 2017 09:15:22 -0800 Subject: [PATCH 040/356] [SPARK-22681] Accumulator should only be updated once for each task in result stage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? As the doc says "For accumulator updates performed inside actions only, Spark guarantees that each task’s update to the accumulator will only be applied once, i.e. restarted tasks will not update the value." But currently the code doesn't guarantee this. ## How was this patch tested? New added tests. Author: Carson Wang Closes #19877 from carsonwang/fixAccum. --- .../apache/spark/scheduler/DAGScheduler.scala | 14 ++++++++++--- .../spark/scheduler/DAGSchedulerSuite.scala | 20 +++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 9153751d03c1b..c2498d4808e91 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1187,9 +1187,17 @@ class DAGScheduler( // only updated in certain cases. event.reason match { case Success => - stage match { - case rs: ResultStage if rs.activeJob.isEmpty => - // Ignore update if task's job has finished. + task match { + case rt: ResultTask[_, _] => + val resultStage = stage.asInstanceOf[ResultStage] + resultStage.activeJob match { + case Some(job) => + // Only update the accumulator once for each result task. + if (!job.finished(rt.outputId)) { + updateAccumulators(event) + } + case None => // Ignore update if task's job has finished. + } case _ => updateAccumulators(event) } diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index feefb6a4d73f0..d812b5bd92c1b 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -1832,6 +1832,26 @@ class DAGSchedulerSuite extends SparkFunSuite with LocalSparkContext with TimeLi assertDataStructuresEmpty() } + test("accumulator not calculated for resubmitted task in result stage") { + val accum = AccumulatorSuite.createLongAccum("a") + val finalRdd = new MyRDD(sc, 2, Nil) + submit(finalRdd, Array(0, 1)) + // finish the first task + completeWithAccumulator(accum.id, taskSets(0), Seq((Success, 42))) + // verify stage exists + assert(scheduler.stageIdToStage.contains(0)) + + // finish the first task again (simulate a speculative task or a resubmitted task) + completeWithAccumulator(accum.id, taskSets(0), Seq((Success, 42))) + assert(results === Map(0 -> 42)) + + // The accumulator should only be updated once. 
+ assert(accum.value === 1) + + runEvent(makeCompletionEvent(taskSets(0).tasks(1), Success, 42)) + assertDataStructuresEmpty() + } + test("accumulators are updated on exception failures") { val acc1 = AccumulatorSuite.createLongAccum("ingenieur") val acc2 = AccumulatorSuite.createLongAccum("boulanger") From ced6ccf0d6f362e299f270ed2a474f2e14f845da Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 5 Dec 2017 10:15:15 -0800 Subject: [PATCH 041/356] [SPARK-22701][SQL] add ctx.splitExpressionsWithCurrentInputs ## What changes were proposed in this pull request? This pattern appears many times in the codebase: ``` if (ctx.INPUT_ROW == null || ctx.currentVars != null) { exprs.mkString("\n") } else { ctx.splitExpressions(...) } ``` This PR adds a `ctx.splitExpressionsWithCurrentInputs` for this pattern ## How was this patch tested? existing tests Author: Wenchen Fan Closes #19895 from cloud-fan/splitExpression. --- .../sql/catalyst/expressions/arithmetic.scala | 4 +- .../expressions/codegen/CodeGenerator.scala | 44 ++++----- .../codegen/GenerateMutableProjection.scala | 4 +- .../codegen/GenerateSafeProjection.scala | 2 +- .../expressions/complexTypeCreator.scala | 6 +- .../expressions/conditionalExpressions.scala | 84 ++++++++--------- .../sql/catalyst/expressions/generators.scala | 2 +- .../spark/sql/catalyst/expressions/hash.scala | 55 +++++------ .../expressions/nullExpressions.scala | 94 ++++++++----------- .../expressions/objects/objects.scala | 6 +- .../sql/catalyst/expressions/predicates.scala | 47 +++++----- .../expressions/stringExpressions.scala | 37 ++++---- 12 files changed, 179 insertions(+), 206 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index d98f7b3d8efe6..739bd13c5078d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -614,7 +614,7 @@ case class Least(children: Seq[Expression]) extends Expression { } """ } - val codes = ctx.splitExpressions(evalChildren.map(updateEval)) + val codes = ctx.splitExpressionsWithCurrentInputs(evalChildren.map(updateEval)) ev.copy(code = s""" ${ev.isNull} = true; ${ev.value} = ${ctx.defaultValue(dataType)}; @@ -680,7 +680,7 @@ case class Greatest(children: Seq[Expression]) extends Expression { } """ } - val codes = ctx.splitExpressions(evalChildren.map(updateEval)) + val codes = ctx.splitExpressionsWithCurrentInputs(evalChildren.map(updateEval)) ev.copy(code = s""" ${ev.isNull} = true; ${ev.value} = ${ctx.defaultValue(dataType)}; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 1645db12c53f0..670c82eff9286 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -781,29 +781,26 @@ class CodegenContext { * beyond 1000kb, we declare a private, inner sub-class, and the function is inlined to it * instead, because classes have a constant pool limit of 65,536 named values. * - * Note that we will extract the current inputs of this context and pass them to the generated - * functions. 
The input is `INPUT_ROW` for normal codegen path, and `currentVars` for whole - * stage codegen path. Whole stage codegen path is not supported yet. - * - * @param expressions the codes to evaluate expressions. - */ - def splitExpressions(expressions: Seq[String]): String = { - splitExpressions(expressions, funcName = "apply", extraArguments = Nil) - } - - /** - * Similar to [[splitExpressions(expressions: Seq[String])]], but has customized function name - * and extra arguments. + * Note that different from `splitExpressions`, we will extract the current inputs of this + * context and pass them to the generated functions. The input is `INPUT_ROW` for normal codegen + * path, and `currentVars` for whole stage codegen path. Whole stage codegen path is not + * supported yet. * * @param expressions the codes to evaluate expressions. * @param funcName the split function name base. - * @param extraArguments the list of (type, name) of the arguments of the split function - * except for ctx.INPUT_ROW - */ - def splitExpressions( + * @param extraArguments the list of (type, name) of the arguments of the split function, + * except for the current inputs like `ctx.INPUT_ROW`. + * @param returnType the return type of the split function. + * @param makeSplitFunction makes split function body, e.g. add preparation or cleanup. + * @param foldFunctions folds the split function calls. + */ + def splitExpressionsWithCurrentInputs( expressions: Seq[String], - funcName: String, - extraArguments: Seq[(String, String)]): String = { + funcName: String = "apply", + extraArguments: Seq[(String, String)] = Nil, + returnType: String = "void", + makeSplitFunction: String => String = identity, + foldFunctions: Seq[String] => String = _.mkString("", ";\n", ";")): String = { // TODO: support whole stage codegen if (INPUT_ROW == null || currentVars != null) { expressions.mkString("\n") @@ -811,13 +808,18 @@ class CodegenContext { splitExpressions( expressions, funcName, - arguments = ("InternalRow", INPUT_ROW) +: extraArguments) + ("InternalRow", INPUT_ROW) +: extraArguments, + returnType, + makeSplitFunction, + foldFunctions) } } /** * Splits the generated code of expressions into multiple functions, because function has - * 64kb code size limit in JVM + * 64kb code size limit in JVM. If the class to which the function would be inlined would grow + * beyond 1000kb, we declare a private, inner sub-class, and the function is inlined to it + * instead, because classes have a constant pool limit of 65,536 named values. * * @param expressions the codes to evaluate expressions. * @param funcName the split function name base. 
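As a reading aid for the refactoring above, here is a minimal standalone sketch of the split-and-fold idea behind `splitExpressionsWithCurrentInputs`: generated code snippets are grouped into helper functions so that no single method body grows past the JVM's 64KB limit, `makeSplitFunction` shapes each helper body, and `foldFunctions` stitches the helper calls back together. Everything in the sketch (`SplitSketch`, the fixed `chunkSize`, the `println`-based "registration") is invented for illustration only; the real code sizes chunks by generated-code length and registers the helpers on the `CodegenContext`.

```scala
object SplitSketch {
  // Mirrors the shape of splitExpressionsWithCurrentInputs, but as plain Scala with
  // no dependency on Spark's CodegenContext.
  def splitExpressions(
      expressions: Seq[String],
      funcName: String,
      arguments: Seq[(String, String)],
      returnType: String = "void",
      makeSplitFunction: String => String = identity,
      foldFunctions: Seq[String] => String = _.mkString("", ";\n", ";")): String = {
    val chunkSize = 3  // stand-in; the real implementation splits by code length
    val argDecls = arguments.map { case (t, n) => s"$t $n" }.mkString(", ")
    val argCalls = arguments.map(_._2).mkString(", ")
    val calls = expressions.grouped(chunkSize).zipWithIndex.map { case (chunk, i) =>
      val name = s"${funcName}_$i"
      // In Spark this helper would be added to the generated class via the codegen
      // context; here we just print its source.
      println(s"private $returnType $name($argDecls) {\n${makeSplitFunction(chunk.mkString("\n"))}\n}")
      s"$name($argCalls)"
    }.toList
    foldFunctions(calls)
  }

  def main(args: Array[String]): Unit = {
    val snippets = (1 to 7).map(i => s"value = compute_$i(i);")
    // Seven snippets fold into three helper calls: apply_0(i); apply_1(i); apply_2(i);
    println(splitExpressions(snippets, "apply", Seq("InternalRow" -> "i")))
  }
}
```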
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala index 5fdbda51b4ad1..bd8312eb8b7fe 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala @@ -91,8 +91,8 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], MutableP ctx.updateColumn("mutableRow", e.dataType, i, ev, e.nullable) } - val allProjections = ctx.splitExpressions(projectionCodes) - val allUpdates = ctx.splitExpressions(updates) + val allProjections = ctx.splitExpressionsWithCurrentInputs(projectionCodes) + val allUpdates = ctx.splitExpressionsWithCurrentInputs(updates) val codeBody = s""" public java.lang.Object generate(Object[] references) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index 5d35cce1a91cb..44e7148e5d98f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -159,7 +159,7 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] } """ } - val allExpressions = ctx.splitExpressions(expressionCodes) + val allExpressions = ctx.splitExpressionsWithCurrentInputs(expressionCodes) val codeBody = s""" public java.lang.Object generate(Object[] references) { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index fc68bf478e1c8..087b21043b309 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -108,7 +108,7 @@ private [sql] object GenArrayData { } """ } - val assignmentString = ctx.splitExpressions( + val assignmentString = ctx.splitExpressionsWithCurrentInputs( expressions = assignments, funcName = "apply", extraArguments = ("Object[]", arrayDataName) :: Nil) @@ -139,7 +139,7 @@ private [sql] object GenArrayData { } """ } - val assignmentString = ctx.splitExpressions( + val assignmentString = ctx.splitExpressionsWithCurrentInputs( expressions = assignments, funcName = "apply", extraArguments = ("UnsafeArrayData", arrayDataName) :: Nil) @@ -357,7 +357,7 @@ case class CreateNamedStruct(children: Seq[Expression]) extends CreateNamedStruc val rowClass = classOf[GenericInternalRow].getName val values = ctx.freshName("values") ctx.addMutableState("Object[]", values, s"$values = null;") - val valuesCode = ctx.splitExpressions( + val valuesCode = ctx.splitExpressionsWithCurrentInputs( valExprs.zipWithIndex.map { case (e, i) => val eval = e.genCode(ctx) s""" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala index 43e643178c899..ae5f7140847db 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala @@ -219,57 +219,51 @@ case class CaseWhen( val allConditions = cases ++ elseCode - val code = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { - allConditions.mkString("\n") - } else { - // This generates code like: - // conditionMet = caseWhen_1(i); - // if(conditionMet) { - // continue; - // } - // conditionMet = caseWhen_2(i); - // if(conditionMet) { - // continue; - // } - // ... - // and the declared methods are: - // private boolean caseWhen_1234() { - // boolean conditionMet = false; - // do { - // // here the evaluation of the conditions - // } while (false); - // return conditionMet; - // } - ctx.splitExpressions(allConditions, "caseWhen", - ("InternalRow", ctx.INPUT_ROW) :: Nil, - returnType = ctx.JAVA_BOOLEAN, - makeSplitFunction = { - func => - s""" - ${ctx.JAVA_BOOLEAN} $conditionMet = false; - do { - $func - } while (false); - return $conditionMet; - """ - }, - foldFunctions = { funcCalls => - funcCalls.map { funcCall => - s""" - $conditionMet = $funcCall; - if ($conditionMet) { - continue; - }""" - }.mkString - }) - } + // This generates code like: + // conditionMet = caseWhen_1(i); + // if(conditionMet) { + // continue; + // } + // conditionMet = caseWhen_2(i); + // if(conditionMet) { + // continue; + // } + // ... + // and the declared methods are: + // private boolean caseWhen_1234() { + // boolean conditionMet = false; + // do { + // // here the evaluation of the conditions + // } while (false); + // return conditionMet; + // } + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = allConditions, + funcName = "caseWhen", + returnType = ctx.JAVA_BOOLEAN, + makeSplitFunction = func => + s""" + |${ctx.JAVA_BOOLEAN} $conditionMet = false; + |do { + | $func + |} while (false); + |return $conditionMet; + """.stripMargin, + foldFunctions = _.map { funcCall => + s""" + |$conditionMet = $funcCall; + |if ($conditionMet) { + | continue; + |} + """.stripMargin + }.mkString) ev.copy(code = s""" ${ev.isNull} = true; ${ev.value} = ${ctx.defaultValue(dataType)}; ${ctx.JAVA_BOOLEAN} $conditionMet = false; do { - $code + $codes } while (false);""") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala index f1aa130669266..cd38783a731ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala @@ -203,7 +203,7 @@ case class Stack(children: Seq[Expression]) extends Generator { ctx.addMutableState("InternalRow[]", rowData, s"$rowData = new InternalRow[$numRows];") val values = children.tail val dataTypes = values.take(numFields).map(_.dataType) - val code = ctx.splitExpressions(Seq.tabulate(numRows) { row => + val code = ctx.splitExpressionsWithCurrentInputs(Seq.tabulate(numRows) { row => val fields = Seq.tabulate(numFields) { col => val index = row * numFields + col if (index < values.length) values(index) else Literal(null, dataTypes(col)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index d0ed2ab8f3f0e..055ebf6c0da54 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -279,21 +279,17 @@ abstract class HashExpression[E] extends Expression { } val hashResultType = ctx.javaType(dataType) - val codes = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { - childrenHash.mkString("\n") - } else { - ctx.splitExpressions( - expressions = childrenHash, - funcName = "computeHash", - arguments = Seq("InternalRow" -> ctx.INPUT_ROW, hashResultType -> ev.value), - returnType = hashResultType, - makeSplitFunction = body => - s""" - |$body - |return ${ev.value}; - """.stripMargin, - foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) - } + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = childrenHash, + funcName = "computeHash", + extraArguments = Seq(hashResultType -> ev.value), + returnType = hashResultType, + makeSplitFunction = body => + s""" + |$body + |return ${ev.value}; + """.stripMargin, + foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) ev.copy(code = s""" @@ -652,22 +648,19 @@ case class HiveHash(children: Seq[Expression]) extends HashExpression[Int] { """.stripMargin } - val codes = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { - childrenHash.mkString("\n") - } else { - ctx.splitExpressions( - expressions = childrenHash, - funcName = "computeHash", - arguments = Seq("InternalRow" -> ctx.INPUT_ROW, ctx.JAVA_INT -> ev.value), - returnType = ctx.JAVA_INT, - makeSplitFunction = body => - s""" - |${ctx.JAVA_INT} $childHash = 0; - |$body - |return ${ev.value}; - """.stripMargin, - foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) - } + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = childrenHash, + funcName = "computeHash", + extraArguments = Seq(ctx.JAVA_INT -> ev.value), + returnType = ctx.JAVA_INT, + makeSplitFunction = body => + s""" + |${ctx.JAVA_INT} $childHash = 0; + |$body + |return ${ev.value}; + """.stripMargin, + foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) + ev.copy(code = s""" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala index 3b52a0efd404a..26c9a41efc9f9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala @@ -87,37 +87,32 @@ case class Coalesce(children: Seq[Expression]) extends Expression { |} """.stripMargin } - val code = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { - evals.mkString("\n") - } else { - ctx.splitExpressions(evals, "coalesce", - ("InternalRow", ctx.INPUT_ROW) :: Nil, - makeSplitFunction = { - func => - s""" - |do { - | $func - |} while (false); - """.stripMargin - }, - foldFunctions = { funcCalls => - funcCalls.map { funcCall => - s""" - |$funcCall; - |if (!${ev.isNull}) { - | continue; - |} - """.stripMargin - }.mkString - }) - } + + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = evals, + funcName = "coalesce", + makeSplitFunction = func => + s""" + |do { + | $func + |} while (false); + """.stripMargin, + foldFunctions = _.map { funcCall => + s""" + |$funcCall; + |if (!${ev.isNull}) { + | continue; + |} + """.stripMargin + }.mkString) + ev.copy(code = s""" 
|${ev.isNull} = true; |${ev.value} = ${ctx.defaultValue(dataType)}; |do { - | $code + | $codes |} while (false); """.stripMargin) } @@ -415,39 +410,32 @@ case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate } } - val code = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { - evals.mkString("\n") - } else { - ctx.splitExpressions( - expressions = evals, - funcName = "atLeastNNonNulls", - arguments = ("InternalRow", ctx.INPUT_ROW) :: (ctx.JAVA_INT, nonnull) :: Nil, - returnType = ctx.JAVA_INT, - makeSplitFunction = { body => - s""" - |do { - | $body - |} while (false); - |return $nonnull; - """.stripMargin - }, - foldFunctions = { funcCalls => - funcCalls.map(funcCall => - s""" - |$nonnull = $funcCall; - |if ($nonnull >= $n) { - | continue; - |} - """.stripMargin).mkString("\n") - } - ) - } + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = evals, + funcName = "atLeastNNonNulls", + extraArguments = (ctx.JAVA_INT, nonnull) :: Nil, + returnType = ctx.JAVA_INT, + makeSplitFunction = body => + s""" + |do { + | $body + |} while (false); + |return $nonnull; + """.stripMargin, + foldFunctions = _.map { funcCall => + s""" + |$nonnull = $funcCall; + |if ($nonnull >= $n) { + | continue; + |} + """.stripMargin + }.mkString) ev.copy(code = s""" |${ctx.JAVA_INT} $nonnull = 0; |do { - | $code + | $codes |} while (false); |${ctx.JAVA_BOOLEAN} ${ev.value} = $nonnull >= $n; """.stripMargin, isNull = "false") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index e2bc79d98b33d..730b2ff96da6c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -101,7 +101,7 @@ trait InvokeLike extends Expression with NonSQLExpression { """ } } - val argCode = ctx.splitExpressions(argCodes) + val argCode = ctx.splitExpressionsWithCurrentInputs(argCodes) (argCode, argValues.mkString(", "), resultIsNull) } @@ -1119,7 +1119,7 @@ case class CreateExternalRow(children: Seq[Expression], schema: StructType) """ } - val childrenCode = ctx.splitExpressions(childrenCodes) + val childrenCode = ctx.splitExpressionsWithCurrentInputs(childrenCodes) val schemaField = ctx.addReferenceObj("schema", schema) val code = s""" @@ -1254,7 +1254,7 @@ case class InitializeJavaBean(beanInstance: Expression, setters: Map[String, Exp ${javaBeanInstance}.$setterMethod(${fieldGen.value}); """ } - val initializeCode = ctx.splitExpressions(initialize.toSeq) + val initializeCode = ctx.splitExpressionsWithCurrentInputs(initialize.toSeq) val code = s""" ${instanceGen.code} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 75cc9b3bd8045..04e669492ec6d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -253,31 +253,26 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { | continue; |} """.stripMargin) - val code = if (ctx.INPUT_ROW == null || ctx.currentVars != null) { - listCode.mkString("\n") - } else { - ctx.splitExpressions( - expressions = listCode, - funcName = "valueIn", - arguments = 
("InternalRow", ctx.INPUT_ROW) :: (javaDataType, valueArg) :: Nil, - makeSplitFunction = { body => - s""" - |do { - | $body - |} while (false); - """.stripMargin - }, - foldFunctions = { funcCalls => - funcCalls.map(funcCall => - s""" - |$funcCall; - |if (${ev.value}) { - | continue; - |} - """.stripMargin).mkString("\n") - } - ) - } + + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = listCode, + funcName = "valueIn", + extraArguments = (javaDataType, valueArg) :: Nil, + makeSplitFunction = body => + s""" + |do { + | $body + |} while (false); + """.stripMargin, + foldFunctions = _.map { funcCall => + s""" + |$funcCall; + |if (${ev.value}) { + | continue; + |} + """.stripMargin + }.mkString("\n")) + ev.copy(code = s""" |${valueGen.code} @@ -286,7 +281,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { |if (!${ev.isNull}) { | $javaDataType $valueArg = ${valueGen.value}; | do { - | $code + | $codes | } while (false); |} """.stripMargin) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 34917ace001fa..47f0b5741f67f 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -73,7 +73,7 @@ case class Concat(children: Seq[Expression]) extends Expression with ImplicitCas } """ } - val codes = ctx.splitExpressions( + val codes = ctx.splitExpressionsWithCurrentInputs( expressions = inputs, funcName = "valueConcat", extraArguments = ("UTF8String[]", args) :: Nil) @@ -152,7 +152,7 @@ case class ConcatWs(children: Seq[Expression]) "" } } - val codes = ctx.splitExpressions( + val codes = ctx.splitExpressionsWithCurrentInputs( expressions = inputs, funcName = "valueConcatWs", extraArguments = ("UTF8String[]", args) :: Nil) @@ -200,31 +200,32 @@ case class ConcatWs(children: Seq[Expression]) } }.unzip - val codes = ctx.splitExpressions(evals.map(_.code)) - val varargCounts = ctx.splitExpressions( + val codes = ctx.splitExpressionsWithCurrentInputs(evals.map(_.code)) + + val varargCounts = ctx.splitExpressionsWithCurrentInputs( expressions = varargCount, funcName = "varargCountsConcatWs", - arguments = ("InternalRow", ctx.INPUT_ROW) :: Nil, returnType = "int", makeSplitFunction = body => s""" - int $varargNum = 0; - $body - return $varargNum; - """, - foldFunctions = _.mkString(s"$varargNum += ", s";\n$varargNum += ", ";")) - val varargBuilds = ctx.splitExpressions( + |int $varargNum = 0; + |$body + |return $varargNum; + """.stripMargin, + foldFunctions = _.map(funcCall => s"$varargNum += $funcCall;").mkString("\n")) + + val varargBuilds = ctx.splitExpressionsWithCurrentInputs( expressions = varargBuild, funcName = "varargBuildsConcatWs", - arguments = - ("InternalRow", ctx.INPUT_ROW) :: ("UTF8String []", array) :: ("int", idxInVararg) :: Nil, + extraArguments = ("UTF8String []", array) :: ("int", idxInVararg) :: Nil, returnType = "int", makeSplitFunction = body => s""" - $body - return $idxInVararg; - """, - foldFunctions = _.mkString(s"$idxInVararg = ", s";\n$idxInVararg = ", ";")) + |$body + |return $idxInVararg; + """.stripMargin, + foldFunctions = _.map(funcCall => s"$idxInVararg = $funcCall;").mkString("\n")) + ev.copy( s""" $codes @@ -1380,7 +1381,7 @@ case class FormatString(children: Expression*) extends Expression with ImplicitC 
$argList[$index] = $value; """ } - val argListCodes = ctx.splitExpressions( + val argListCodes = ctx.splitExpressionsWithCurrentInputs( expressions = argListCode, funcName = "valueFormatString", extraArguments = ("Object[]", argList) :: Nil) From 132a3f470811bb98f265d0c9ad2c161698e0237b Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 5 Dec 2017 11:40:13 -0800 Subject: [PATCH 042/356] [SPARK-22500][SQL][FOLLOWUP] cast for struct can split code even with whole stage codegen ## What changes were proposed in this pull request? A followup of https://github.com/apache/spark/pull/19730, we can split the code for casting struct even with whole stage codegen. This PR also has some renaming to make the code easier to read. ## How was this patch tested? existing test Author: Wenchen Fan Closes #19891 from cloud-fan/cast. --- .../spark/sql/catalyst/expressions/Cast.scala | 52 +++++++++---------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index f4ecbdb8393ad..b8d3661a00abc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -548,8 +548,8 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String castCode(ctx, eval.value, eval.isNull, ev.value, ev.isNull, dataType, nullSafeCast)) } - // three function arguments are: child.primitive, result.primitive and result.isNull - // it returns the code snippets to be put in null safe evaluation region + // The function arguments are: `input`, `result` and `resultIsNull`. We don't need `inputIsNull` + // in parameter list, because the returned code will be put in null safe evaluation region. private[this] type CastFunction = (String, String, String) => String private[this] def nullSafeCastFunction( @@ -584,15 +584,15 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String throw new SparkException(s"Cannot cast $from to $to.") } - // Since we need to cast child expressions recursively inside ComplexTypes, such as Map's + // Since we need to cast input expressions recursively inside ComplexTypes, such as Map's // Key and Value, Struct's field, we need to name out all the variable names involved in a cast. 
- private[this] def castCode(ctx: CodegenContext, childPrim: String, childNull: String, - resultPrim: String, resultNull: String, resultType: DataType, cast: CastFunction): String = { + private[this] def castCode(ctx: CodegenContext, input: String, inputIsNull: String, + result: String, resultIsNull: String, resultType: DataType, cast: CastFunction): String = { s""" - boolean $resultNull = $childNull; - ${ctx.javaType(resultType)} $resultPrim = ${ctx.defaultValue(resultType)}; - if (!$childNull) { - ${cast(childPrim, resultPrim, resultNull)} + boolean $resultIsNull = $inputIsNull; + ${ctx.javaType(resultType)} $result = ${ctx.defaultValue(resultType)}; + if (!$inputIsNull) { + ${cast(input, result, resultIsNull)} } """ } @@ -1014,8 +1014,8 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String case (fromField, toField) => nullSafeCastFunction(fromField.dataType, toField.dataType, ctx) } val rowClass = classOf[GenericInternalRow].getName - val result = ctx.freshName("result") - val tmpRow = ctx.freshName("tmpRow") + val tmpResult = ctx.freshName("tmpResult") + val tmpInput = ctx.freshName("tmpInput") val fieldsEvalCode = fieldsCasts.zipWithIndex.map { case (cast, i) => val fromFieldPrim = ctx.freshName("ffp") @@ -1024,37 +1024,33 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String val toFieldNull = ctx.freshName("tfn") val fromType = ctx.javaType(from.fields(i).dataType) s""" - boolean $fromFieldNull = $tmpRow.isNullAt($i); + boolean $fromFieldNull = $tmpInput.isNullAt($i); if ($fromFieldNull) { - $result.setNullAt($i); + $tmpResult.setNullAt($i); } else { $fromType $fromFieldPrim = - ${ctx.getValue(tmpRow, from.fields(i).dataType, i.toString)}; + ${ctx.getValue(tmpInput, from.fields(i).dataType, i.toString)}; ${castCode(ctx, fromFieldPrim, fromFieldNull, toFieldPrim, toFieldNull, to.fields(i).dataType, cast)} if ($toFieldNull) { - $result.setNullAt($i); + $tmpResult.setNullAt($i); } else { - ${ctx.setColumn(result, to.fields(i).dataType, i, toFieldPrim)}; + ${ctx.setColumn(tmpResult, to.fields(i).dataType, i, toFieldPrim)}; } } """ } - val fieldsEvalCodes = if (ctx.currentVars == null) { - ctx.splitExpressions( - expressions = fieldsEvalCode, - funcName = "castStruct", - arguments = ("InternalRow", tmpRow) :: (rowClass, result) :: Nil) - } else { - fieldsEvalCode.mkString("\n") - } + val fieldsEvalCodes = ctx.splitExpressions( + expressions = fieldsEvalCode, + funcName = "castStruct", + arguments = ("InternalRow", tmpInput) :: (rowClass, tmpResult) :: Nil) - (c, evPrim, evNull) => + (input, result, resultIsNull) => s""" - final $rowClass $result = new $rowClass(${fieldsCasts.length}); - final InternalRow $tmpRow = $c; + final $rowClass $tmpResult = new $rowClass(${fieldsCasts.length}); + final InternalRow $tmpInput = $input; $fieldsEvalCodes - $evPrim = $result; + $result = $tmpResult; """ } From 1e17ab83de29bca1823a537d7c57ffc4de8a26ee Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Tue, 5 Dec 2017 15:15:32 -0800 Subject: [PATCH 043/356] [SPARK-22662][SQL] Failed to prune columns after rewriting predicate subquery ## What changes were proposed in this pull request? 
As a simple example: ``` spark-sql> create table base (a int, b int) using parquet; Time taken: 0.066 seconds spark-sql> create table relInSubq ( x int, y int, z int) using parquet; Time taken: 0.042 seconds spark-sql> explain select a from base where a in (select x from relInSubq); == Physical Plan == *Project [a#83] +- *BroadcastHashJoin [a#83], [x#85], LeftSemi, BuildRight :- *FileScan parquet default.base[a#83,b#84] Batched: true, Format: Parquet, Location: InMemoryFileIndex[hdfs://100.0.0.4:9000/wzh/base], PartitionFilters: [], PushedFilters: [], ReadSchema: struct +- BroadcastExchange HashedRelationBroadcastMode(List(cast(input[0, int, true] as bigint))) +- *Project [x#85] +- *FileScan parquet default.relinsubq[x#85] Batched: true, Format: Parquet, Location: InMemoryFileIndex[hdfs://100.0.0.4:9000/wzh/relinsubq], PartitionFilters: [], PushedFilters: [], ReadSchema: struct ``` We only need column `a` in table `base`, but all columns (`a`, `b`) are fetched. The reason is that, in "Operator Optimizations" batch, `ColumnPruning` first produces a `Project` on table `base`, but then it's removed by `removeProjectBeforeFilter`. Because at that time, the predicate subquery is in filter form. Then, in "Rewrite Subquery" batch, `RewritePredicateSubquery` converts the subquery into a LeftSemi join, but this batch doesn't have the `ColumnPruning` rule. This results in reading all columns for the `base` table. ## How was this patch tested? Added a new test case. Author: Zhenhua Wang Closes #19855 from wzhfy/column_pruning_subquery. --- .../sql/catalyst/optimizer/Optimizer.scala | 4 +- .../optimizer/RewriteSubquerySuite.scala | 55 +++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 8a5c486912abf..484cd8c2475f5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -141,7 +141,9 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) CheckCartesianProducts) :: Batch("RewriteSubquery", Once, RewritePredicateSubquery, - CollapseProject) :: Nil + ColumnPruning, + CollapseProject, + RemoveRedundantProject) :: Nil } /** diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala new file mode 100644 index 0000000000000..6b3739c372c3a --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteSubquerySuite.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.optimizer + +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ +import org.apache.spark.sql.catalyst.expressions.ListQuery +import org.apache.spark.sql.catalyst.plans.{LeftSemi, PlanTest} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.catalyst.rules.RuleExecutor + + +class RewriteSubquerySuite extends PlanTest { + + object Optimize extends RuleExecutor[LogicalPlan] { + val batches = + Batch("Column Pruning", FixedPoint(100), ColumnPruning) :: + Batch("Rewrite Subquery", FixedPoint(1), + RewritePredicateSubquery, + ColumnPruning, + CollapseProject, + RemoveRedundantProject) :: Nil + } + + test("Column pruning after rewriting predicate subquery") { + val relation = LocalRelation('a.int, 'b.int) + val relInSubquery = LocalRelation('x.int, 'y.int, 'z.int) + + val query = relation.where('a.in(ListQuery(relInSubquery.select('x)))).select('a) + + val optimized = Optimize.execute(query.analyze) + val correctAnswer = relation + .select('a) + .join(relInSubquery.select('x), LeftSemi, Some('a === 'x)) + .analyze + + comparePlans(optimized, correctAnswer) + } + +} From 59aa3d56af956267261b135936651bbbd41b1bfc Mon Sep 17 00:00:00 2001 From: Mark Petruska Date: Tue, 5 Dec 2017 18:08:36 -0600 Subject: [PATCH 044/356] [SPARK-20706][SPARK-SHELL] Spark-shell not overriding method/variable definition ## What changes were proposed in this pull request? [SPARK-20706](https://issues.apache.org/jira/browse/SPARK-20706): Spark-shell not overriding method/variable definition This is a Scala repl bug ( [SI-9740](https://github.com/scala/bug/issues/9740) ), was fixed in version 2.11.9 ( [see the original PR](https://github.com/scala/scala/pull/5090) ) ## How was this patch tested? Added a new test case in `ReplSuite`. Author: Mark Petruska Closes #19879 from mpetruska/SPARK-20706. 
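As a rough, user-level illustration of the symptom being fixed (a hypothetical spark-shell session mirroring the new `ReplSuite` case below; the expected result is inferred from that test, not captured from a real run):

```
scala> def myMethod() = "first definition"
scala> val tmp = myMethod(); val out = tmp
scala> def myMethod() = "second definition"
scala> val tmp = myMethod(); val out = s"$tmp aabbcc"
// expected after the fix: out = "second definition aabbcc"
// before the fix, the stale first definition could still be picked up
```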
--- .../spark/repl/SparkILoopInterpreter.scala | 138 +++++++++++++++++- .../org/apache/spark/repl/ReplSuite.scala | 39 +++-- 2 files changed, 167 insertions(+), 10 deletions(-) diff --git a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala index 0803426403af5..e736607a9a6b9 100644 --- a/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala +++ b/repl/scala-2.11/src/main/scala/org/apache/spark/repl/SparkILoopInterpreter.scala @@ -17,6 +17,7 @@ package org.apache.spark.repl +import scala.collection.mutable import scala.tools.nsc.Settings import scala.tools.nsc.interpreter._ @@ -30,7 +31,7 @@ class SparkILoopInterpreter(settings: Settings, out: JPrintWriter) extends IMain override def chooseHandler(member: intp.global.Tree): MemberHandler = member match { case member: Import => new SparkImportHandler(member) - case _ => super.chooseHandler (member) + case _ => super.chooseHandler(member) } class SparkImportHandler(imp: Import) extends ImportHandler(imp: Import) { @@ -100,4 +101,139 @@ class SparkILoopInterpreter(settings: Settings, out: JPrintWriter) extends IMain override def typeOfExpression(expr: String, silent: Boolean): global.Type = expressionTyper.typeOfExpression(expr, silent) + + import global.Name + override def importsCode(wanted: Set[Name], wrapper: Request#Wrapper, + definesClass: Boolean, generousImports: Boolean): ComputedImports = { + + import global._ + import definitions.{ ObjectClass, ScalaPackage, JavaLangPackage, PredefModule } + import memberHandlers._ + + val header, code, trailingBraces, accessPath = new StringBuilder + val currentImps = mutable.HashSet[Name]() + // only emit predef import header if name not resolved in history, loosely + var predefEscapes = false + + /** + * Narrow down the list of requests from which imports + * should be taken. Removes requests which cannot contribute + * useful imports for the specified set of wanted names. + */ + case class ReqAndHandler(req: Request, handler: MemberHandler) + + def reqsToUse: List[ReqAndHandler] = { + /** + * Loop through a list of MemberHandlers and select which ones to keep. + * 'wanted' is the set of names that need to be imported. + */ + def select(reqs: List[ReqAndHandler], wanted: Set[Name]): List[ReqAndHandler] = { + // Single symbol imports might be implicits! See bug #1752. Rather than + // try to finesse this, we will mimic all imports for now. + def keepHandler(handler: MemberHandler) = handler match { + // While defining classes in class based mode - implicits are not needed. 
+ case h: ImportHandler if isClassBased && definesClass => + h.importedNames.exists(x => wanted.contains(x)) + case _: ImportHandler => true + case x if generousImports => x.definesImplicit || + (x.definedNames exists (d => wanted.exists(w => d.startsWith(w)))) + case x => x.definesImplicit || + (x.definedNames exists wanted) + } + + reqs match { + case Nil => + predefEscapes = wanted contains PredefModule.name ; Nil + case rh :: rest if !keepHandler(rh.handler) => select(rest, wanted) + case rh :: rest => + import rh.handler._ + val augment = rh match { + case ReqAndHandler(_, _: ImportHandler) => referencedNames + case _ => Nil + } + val newWanted = wanted ++ augment -- definedNames -- importedNames + rh :: select(rest, newWanted) + } + } + + /** Flatten the handlers out and pair each with the original request */ + select(allReqAndHandlers reverseMap { case (r, h) => ReqAndHandler(r, h) }, wanted).reverse + } + + // add code for a new object to hold some imports + def addWrapper() { + import nme.{ INTERPRETER_IMPORT_WRAPPER => iw } + code append (wrapper.prewrap format iw) + trailingBraces append wrapper.postwrap + accessPath append s".$iw" + currentImps.clear() + } + + def maybeWrap(names: Name*) = if (names exists currentImps) addWrapper() + + def wrapBeforeAndAfter[T](op: => T): T = { + addWrapper() + try op finally addWrapper() + } + + // imports from Predef are relocated to the template header to allow hiding. + def checkHeader(h: ImportHandler) = h.referencedNames contains PredefModule.name + + // loop through previous requests, adding imports for each one + wrapBeforeAndAfter { + // Reusing a single temporary value when import from a line with multiple definitions. + val tempValLines = mutable.Set[Int]() + for (ReqAndHandler(req, handler) <- reqsToUse) { + val objName = req.lineRep.readPathInstance + handler match { + case h: ImportHandler if checkHeader(h) => + header.clear() + header append f"${h.member}%n" + // If the user entered an import, then just use it; add an import wrapping + // level if the import might conflict with some other import + case x: ImportHandler if x.importsWildcard => + wrapBeforeAndAfter(code append (x.member + "\n")) + case x: ImportHandler => + maybeWrap(x.importedNames: _*) + code append (x.member + "\n") + currentImps ++= x.importedNames + + case x if isClassBased => + for (sym <- x.definedSymbols) { + maybeWrap(sym.name) + x match { + case _: ClassHandler => + code.append(s"import ${objName}${req.accessPath}.`${sym.name}`\n") + case _ => + val valName = s"${req.lineRep.packageName}${req.lineRep.readName}" + if (!tempValLines.contains(req.lineRep.lineId)) { + code.append(s"val $valName: ${objName}.type = $objName\n") + tempValLines += req.lineRep.lineId + } + code.append(s"import ${valName}${req.accessPath}.`${sym.name}`\n") + } + currentImps += sym.name + } + // For other requests, import each defined name. + // import them explicitly instead of with _, so that + // ambiguity errors will not be generated. Also, quote + // the name of the variable, so that we don't need to + // handle quoting keywords separately. 
+ case x => + for (sym <- x.definedSymbols) { + maybeWrap(sym.name) + code append s"import ${x.path}\n" + currentImps += sym.name + } + } + } + } + + val computedHeader = if (predefEscapes) header.toString else "" + ComputedImports(computedHeader, code.toString, trailingBraces.toString, accessPath.toString) + } + + private def allReqAndHandlers = + prevRequestList flatMap (req => req.handlers map (req -> _)) + } diff --git a/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala index a5053521f8e31..cdd5cdd841740 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -227,14 +227,35 @@ class ReplSuite extends SparkFunSuite { assertDoesNotContain("error: not found: value sc", output) } - test("spark-shell should find imported types in class constructors and extends clause") { - val output = runInterpreter("local", - """ - |import org.apache.spark.Partition - |class P(p: Partition) - |class P(val index: Int) extends Partition - """.stripMargin) - assertDoesNotContain("error: not found: type Partition", output) - } + test("spark-shell should find imported types in class constructors and extends clause") { + val output = runInterpreter("local", + """ + |import org.apache.spark.Partition + |class P(p: Partition) + |class P(val index: Int) extends Partition + """.stripMargin) + assertDoesNotContain("error: not found: type Partition", output) + } + + test("spark-shell should shadow val/def definitions correctly") { + val output1 = runInterpreter("local", + """ + |def myMethod() = "first definition" + |val tmp = myMethod(); val out = tmp + |def myMethod() = "second definition" + |val tmp = myMethod(); val out = s"$tmp aabbcc" + """.stripMargin) + assertContains("second definition aabbcc", output1) + + val output2 = runInterpreter("local", + """ + |val a = 1 + |val b = a; val c = b; + |val a = 2 + |val b = a; val c = b; + |s"!!$b!!" + """.stripMargin) + assertContains("!!2!!", output2) + } } From 82183f7b57f2a93e646c56a9e37fac64b348ff0b Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 6 Dec 2017 10:52:29 +0800 Subject: [PATCH 045/356] [SPARK-22686][SQL] DROP TABLE IF EXISTS should not show AnalysisException ## What changes were proposed in this pull request? During [SPARK-22488](https://github.com/apache/spark/pull/19713) to fix view resolution issue, there occurs a regression at `2.2.1` and `master` branch like the following. This PR fixes that. ```scala scala> spark.version res2: String = 2.2.1 scala> sql("DROP TABLE IF EXISTS t").show 17/12/04 21:01:06 WARN DropTableCommand: org.apache.spark.sql.AnalysisException: Table or view not found: t; org.apache.spark.sql.AnalysisException: Table or view not found: t; ``` ## How was this patch tested? Manual. Author: Dongjoon Hyun Closes #19888 from dongjoon-hyun/SPARK-22686. 
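A hedged sketch of the intended behavior after this change (illustrative session; the table name `t` is arbitrary and the exception message is taken from the new code in the diff below):

```
scala> spark.sql("DROP TABLE IF EXISTS t")   // table is missing: silent no-op, no WARN and no AnalysisException
scala> spark.sql("DROP TABLE t")             // still fails fast:
// org.apache.spark.sql.AnalysisException: Table or view not found: t
```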
--- .../spark/sql/execution/command/ddl.scala | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 568567aa8ea88..0142f17ce62e2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -203,14 +203,20 @@ case class DropTableCommand( case _ => } } - try { - sparkSession.sharedState.cacheManager.uncacheQuery(sparkSession.table(tableName)) - } catch { - case _: NoSuchTableException if ifExists => - case NonFatal(e) => log.warn(e.toString, e) + + if (catalog.isTemporaryTable(tableName) || catalog.tableExists(tableName)) { + try { + sparkSession.sharedState.cacheManager.uncacheQuery(sparkSession.table(tableName)) + } catch { + case NonFatal(e) => log.warn(e.toString, e) + } + catalog.refreshTable(tableName) + catalog.dropTable(tableName, ifExists, purge) + } else if (ifExists) { + // no-op + } else { + throw new AnalysisException(s"Table or view not found: ${tableName.identifier}") } - catalog.refreshTable(tableName) - catalog.dropTable(tableName, ifExists, purge) Seq.empty[Row] } } From 00d176d2fe7bbdf55cb3146a9cb04ca99b1858b7 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 5 Dec 2017 21:43:41 -0800 Subject: [PATCH 046/356] [SPARK-20392][SQL] Set barrier to prevent re-entering a tree ## What changes were proposed in this pull request? The SQL `Analyzer` goes through a whole query plan even most part of it is analyzed. This increases the time spent on query analysis for long pipelines in ML, especially. This patch adds a logical node called `AnalysisBarrier` that wraps an analyzed logical plan to prevent it from analysis again. The barrier is applied to the analyzed logical plan in `Dataset`. It won't change the output of wrapped logical plan and just acts as a wrapper to hide it from analyzer. New operations on the dataset will be put on the barrier, so only the new nodes created will be analyzed. This analysis barrier will be removed at the end of analysis stage. ## How was this patch tested? Added tests. Author: Liang-Chi Hsieh Closes #19873 from viirya/SPARK-20392-reopen. 
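Before the diff itself, a minimal standalone model of the barrier idea may be easier to digest (the `BarrierSketch`, `Node`, `Op`, and `Barrier` names below are invented for illustration and are not Spark classes): a barrier is a wrapper node that a top-down rewrite refuses to descend into, so an already-analyzed subtree is skipped and only the operators newly placed on top of it are visited. In the real patch the wrapper is `AnalysisBarrier`, and it is stripped again at the end of analysis by the `EliminateBarriers` rule added below.

```scala
object BarrierSketch {
  sealed trait Node { def children: Seq[Node] }
  case class Op(name: String, children: Seq[Node] = Nil) extends Node
  case class Barrier(child: Node) extends Node { def children: Seq[Node] = Seq(child) }

  // Apply `rule` top-down, but stop at barriers instead of re-entering analyzed subtrees.
  def analyze(node: Node, rule: Op => Op): Node = node match {
    case b: Barrier => b
    case op: Op =>
      val rewritten = rule(op)
      rewritten.copy(children = rewritten.children.map(analyze(_, rule)))
  }

  def main(args: Array[String]): Unit = {
    // A new Project placed on top of an already-analyzed (barrier-wrapped) Aggregate.
    val plan = Op("Project", Seq(Barrier(Op("Aggregate", Seq(Op("Relation"))))))
    // Mark every visited operator with "*": only the new Project is rewritten,
    // the subtree behind the barrier is left untouched.
    println(analyze(plan, op => op.copy(name = op.name + "*")))
    // Prints: Op(Project*,List(Barrier(Op(Aggregate,List(Op(Relation,List()))))))
  }
}
```

The trade-off, visible in the `Analyzer.scala` hunks that follow, is that analyzer rules can no longer rely on `resolveOperators` skipping resolved plans: they switch to `transformUp` and, where needed, match `AnalysisBarrier(child)` explicitly to look through the wrapper.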
--- .../sql/catalyst/analysis/Analyzer.scala | 77 ++++++++++------ .../sql/catalyst/analysis/CheckAnalysis.scala | 4 - .../catalyst/analysis/DecimalPrecision.scala | 2 +- .../ResolveTableValuedFunctions.scala | 2 +- .../SubstituteUnresolvedOrdinals.scala | 2 +- .../sql/catalyst/analysis/TypeCoercion.scala | 32 ++++--- .../catalyst/analysis/timeZoneAnalysis.scala | 2 +- .../spark/sql/catalyst/analysis/view.scala | 2 +- .../sql/catalyst/optimizer/subquery.scala | 2 +- .../catalyst/plans/logical/LogicalPlan.scala | 49 ---------- .../plans/logical/basicLogicalOperators.scala | 19 ++++ .../sql/catalyst/analysis/AnalysisSuite.scala | 14 +++ .../sql/catalyst/plans/LogicalPlanSuite.scala | 42 ++++++--- .../scala/org/apache/spark/sql/Dataset.scala | 91 ++++++++++--------- .../sql/execution/datasources/rules.scala | 2 +- .../spark/sql/execution/PlannerSuite.scala | 2 +- .../spark/sql/hive/HiveStrategies.scala | 6 +- 17 files changed, 185 insertions(+), 165 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index e5c93b5f0e059..0d5e866c0683e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -165,14 +165,15 @@ class Analyzer( Batch("Subquery", Once, UpdateOuterReferences), Batch("Cleanup", fixedPoint, - CleanupAliases) + CleanupAliases, + EliminateBarriers) ) /** * Analyze cte definitions and substitute child plan with analyzed cte definitions. */ object CTESubstitution extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case With(child, relations) => substituteCTE(child, relations.foldLeft(Seq.empty[(String, LogicalPlan)]) { case (resolved, (name, relation)) => @@ -200,7 +201,7 @@ class Analyzer( * Substitute child plan with WindowSpecDefinitions. */ object WindowsSubstitution extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { // Lookup WindowSpecDefinitions. This rule works with unresolved children. case WithWindowDefinition(windowDefinitions, child) => child.transform { @@ -242,7 +243,7 @@ class Analyzer( private def hasUnresolvedAlias(exprs: Seq[NamedExpression]) = exprs.exists(_.find(_.isInstanceOf[UnresolvedAlias]).isDefined) - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case Aggregate(groups, aggs, child) if child.resolved && hasUnresolvedAlias(aggs) => Aggregate(groups, assignAliases(aggs), child) @@ -611,7 +612,7 @@ class Analyzer( case _ => plan } - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case i @ InsertIntoTable(u: UnresolvedRelation, parts, child, _, _) if child.resolved => EliminateSubqueryAliases(lookupTableFromCatalog(u)) match { case v: View => @@ -666,7 +667,9 @@ class Analyzer( * Generate a new logical plan for the right child with different expression IDs * for all conflicting attributes. */ - private def dedupRight (left: LogicalPlan, right: LogicalPlan): LogicalPlan = { + private def dedupRight (left: LogicalPlan, originalRight: LogicalPlan): LogicalPlan = { + // Remove analysis barrier if any. 
+ val right = EliminateBarriers(originalRight) val conflictingAttributes = left.outputSet.intersect(right.outputSet) logDebug(s"Conflicting attributes ${conflictingAttributes.mkString(",")} " + s"between $left and $right") @@ -709,7 +712,7 @@ class Analyzer( * that this rule cannot handle. When that is the case, there must be another rule * that resolves these conflicts. Otherwise, the analysis will fail. */ - right + originalRight case Some((oldRelation, newRelation)) => val attributeRewrites = AttributeMap(oldRelation.output.zip(newRelation.output)) val newRight = right transformUp { @@ -722,7 +725,7 @@ class Analyzer( s.withNewPlan(dedupOuterReferencesInSubquery(s.plan, attributeRewrites)) } } - newRight + AnalysisBarrier(newRight) } } @@ -799,7 +802,7 @@ class Analyzer( case _ => e.mapChildren(resolve(_, q)) } - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case p: LogicalPlan if !p.childrenResolved => p // If the projection list contains Stars, expand it. @@ -993,7 +996,7 @@ class Analyzer( * have no effect on the results. */ object ResolveOrdinalInOrderByAndGroupBy extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case p if !p.childrenResolved => p // Replace the index with the related attribute for ORDER BY, // which is a 1-base position of the projection list. @@ -1049,7 +1052,7 @@ class Analyzer( }} } - override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + override def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case agg @ Aggregate(groups, aggs, child) if conf.groupByAliases && child.resolved && aggs.forall(_.resolved) && groups.exists(!_.resolved) => @@ -1073,11 +1076,13 @@ class Analyzer( * The HAVING clause could also used a grouping columns that is not presented in the SELECT. */ object ResolveMissingReferences extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { // Skip sort with aggregate. This will be handled in ResolveAggregateFunctions + case sa @ Sort(_, _, AnalysisBarrier(child: Aggregate)) => sa case sa @ Sort(_, _, child: Aggregate) => sa - case s @ Sort(order, _, child) if !s.resolved && child.resolved => + case s @ Sort(order, _, originalChild) if !s.resolved && originalChild.resolved => + val child = EliminateBarriers(originalChild) try { val newOrder = order.map(resolveExpressionRecursively(_, child).asInstanceOf[SortOrder]) val requiredAttrs = AttributeSet(newOrder).filter(_.resolved) @@ -1098,7 +1103,8 @@ class Analyzer( case ae: AnalysisException => s } - case f @ Filter(cond, child) if !f.resolved && child.resolved => + case f @ Filter(cond, originalChild) if !f.resolved && originalChild.resolved => + val child = EliminateBarriers(originalChild) try { val newCond = resolveExpressionRecursively(cond, child) val requiredAttrs = newCond.references.filter(_.resolved) @@ -1125,7 +1131,7 @@ class Analyzer( */ private def addMissingAttr(plan: LogicalPlan, missingAttrs: AttributeSet): LogicalPlan = { if (missingAttrs.isEmpty) { - return plan + return AnalysisBarrier(plan) } plan match { case p: Project => @@ -1197,7 +1203,7 @@ class Analyzer( * Replaces [[UnresolvedFunction]]s with concrete [[Expression]]s. 
*/ object ResolveFunctions extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case q: LogicalPlan => q transformExpressions { case u if !u.childrenResolved => u // Skip until children are resolved. @@ -1334,7 +1340,7 @@ class Analyzer( /** * Resolve and rewrite all subqueries in an operator tree.. */ - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { // In case of HAVING (a filter after an aggregate) we use both the aggregate and // its child for resolution. case f @ Filter(_, a: Aggregate) if f.childrenResolved => @@ -1350,7 +1356,7 @@ class Analyzer( */ object ResolveSubqueryColumnAliases extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case u @ UnresolvedSubqueryColumnAliases(columnNames, child) if child.resolved => // Resolves output attributes if a query has alias names in its subquery: // e.g., SELECT * FROM (SELECT 1 AS a, 1 AS b) t(col1, col2) @@ -1373,7 +1379,7 @@ class Analyzer( * Turns projections that contain aggregate expressions into aggregations. */ object GlobalAggregates extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case Project(projectList, child) if containsAggregates(projectList) => Aggregate(Nil, projectList, child) } @@ -1399,7 +1405,9 @@ class Analyzer( * underlying aggregate operator and then projected away after the original operator. */ object ResolveAggregateFunctions extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { + case filter @ Filter(havingCondition, AnalysisBarrier(aggregate: Aggregate)) => + apply(Filter(havingCondition, aggregate)).mapChildren(AnalysisBarrier) case filter @ Filter(havingCondition, aggregate @ Aggregate(grouping, originalAggExprs, child)) if aggregate.resolved => @@ -1459,6 +1467,8 @@ class Analyzer( case ae: AnalysisException => filter } + case sort @ Sort(sortOrder, global, AnalysisBarrier(aggregate: Aggregate)) => + apply(Sort(sortOrder, global, aggregate)).mapChildren(AnalysisBarrier) case sort @ Sort(sortOrder, global, aggregate: Aggregate) if aggregate.resolved => // Try resolving the ordering as though it is in the aggregate clause. @@ -1571,7 +1581,7 @@ class Analyzer( } } - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case Project(projectList, _) if projectList.exists(hasNestedGenerator) => val nestedGenerator = projectList.find(hasNestedGenerator).get throw new AnalysisException("Generators are not supported when it's nested in " + @@ -1629,7 +1639,7 @@ class Analyzer( * that wrap the [[Generator]]. */ object ResolveGenerate extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case g: Generate if !g.child.resolved || !g.generator.resolved => g case g: Generate if !g.resolved => g.copy(generatorOutput = makeGeneratorOutput(g.generator, g.generatorOutput.map(_.name))) @@ -1946,7 +1956,7 @@ class Analyzer( * put them into an inner Project and finally project them away at the outer Project. 
*/ object PullOutNondeterministic extends Rule[LogicalPlan] { - override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + override def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case p if !p.resolved => p // Skip unresolved nodes. case p: Project => p case f: Filter => f @@ -1991,7 +2001,7 @@ class Analyzer( * and we should return null if the input is null. */ object HandleNullInputsForUDF extends Rule[LogicalPlan] { - override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + override def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case p if !p.resolved => p // Skip unresolved nodes. case p => p transformExpressionsUp { @@ -2056,7 +2066,7 @@ class Analyzer( * Then apply a Project on a normal Join to eliminate natural or using join. */ object ResolveNaturalAndUsingJoin extends Rule[LogicalPlan] { - override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + override def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case j @ Join(left, right, UsingJoin(joinType, usingCols), condition) if left.resolved && right.resolved && j.duplicateResolved => commonNaturalJoinProcessing(left, right, joinType, usingCols, None) @@ -2121,7 +2131,7 @@ class Analyzer( * to the given input attributes. */ object ResolveDeserializer extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case p if !p.childrenResolved => p case p if p.resolved => p @@ -2207,7 +2217,7 @@ class Analyzer( * constructed is an inner class. */ object ResolveNewInstance extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case p if !p.childrenResolved => p case p if p.resolved => p @@ -2241,7 +2251,7 @@ class Analyzer( "type of the field in the target object") } - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case p if !p.childrenResolved => p case p if p.resolved => p @@ -2300,7 +2310,7 @@ object CleanupAliases extends Rule[LogicalPlan] { case other => trimAliases(other) } - override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + override def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case Project(projectList, child) => val cleanedProjectList = projectList.map(trimNonTopLevelAliases(_).asInstanceOf[NamedExpression]) @@ -2329,6 +2339,13 @@ object CleanupAliases extends Rule[LogicalPlan] { } } +/** Remove the barrier nodes of analysis */ +object EliminateBarriers extends Rule[LogicalPlan] { + override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown { + case AnalysisBarrier(child) => child + } +} + /** * Ignore event time watermark in batch query, which is only supported in Structured Streaming. * TODO: add this rule into analyzer rule list. @@ -2379,7 +2396,7 @@ object TimeWindowing extends Rule[LogicalPlan] { * @return the logical plan that will generate the time windows using the Expand operator, with * the Filter operator for correctness and Project for usability. 
*/ - def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { case p: LogicalPlan if p.children.size == 1 => val child = p.children.head val windowExpressions = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index b5e8bdd79869e..6894aed15c16f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -78,8 +78,6 @@ trait CheckAnalysis extends PredicateHelper { // We transform up and order the rules so as to catch the first possible failure instead // of the result of cascading resolution failures. plan.foreachUp { - case p if p.analyzed => // Skip already analyzed sub-plans - case u: UnresolvedRelation => u.failAnalysis(s"Table or view not found: ${u.tableIdentifier}") @@ -353,8 +351,6 @@ trait CheckAnalysis extends PredicateHelper { case o if !o.resolved => failAnalysis(s"unresolved operator ${o.simpleString}") case _ => } - - plan.foreach(_.setAnalyzed()) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala index 070bc542e4852..a8100b9b24aac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecision.scala @@ -78,7 +78,7 @@ object DecimalPrecision extends TypeCoercionRule { PromotePrecision(Cast(e, dataType)) } - override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan transformUp { // fix decimal precision for expressions case q => q.transformExpressionsUp( decimalAndDecimal.orElse(integralAndDecimalLiteral).orElse(nondecimalAndDecimal)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala index 7358f9ee36921..a214e59302cd9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveTableValuedFunctions.scala @@ -103,7 +103,7 @@ object ResolveTableValuedFunctions extends Rule[LogicalPlan] { }) ) - override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) => val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match { case Some(tvf) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinals.scala index 860d20f897690..f9fd0df9e4010 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinals.scala @@ -33,7 +33,7 @@ class SubstituteUnresolvedOrdinals(conf: SQLConf) 
extends Rule[LogicalPlan] { case _ => false } - def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case s: Sort if conf.orderByOrdinal && s.order.exists(o => isIntLiteral(o.child)) => val newOrders = s.order.map { case order @ SortOrder(ordinal @ Literal(index: Int, IntegerType), _, _, _) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 1ee2f6e941045..2f306f58b7b80 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -247,9 +247,7 @@ object TypeCoercion { */ object WidenSetOperationTypes extends Rule[LogicalPlan] { - def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case p if p.analyzed => p - + def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case s @ SetOperation(left, right) if s.childrenResolved && left.output.length == right.output.length && !s.resolved => val newChildren: Seq[LogicalPlan] = buildNewChildrenWithWiderTypes(left :: right :: Nil) @@ -321,7 +319,8 @@ object TypeCoercion { } } - override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + override protected def coerceTypes( + plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -380,7 +379,8 @@ object TypeCoercion { } } - override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + override protected def coerceTypes( + plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -439,7 +439,7 @@ object TypeCoercion { private val trueValues = Seq(1.toByte, 1.toShort, 1, 1L, Decimal.ONE) private val falseValues = Seq(0.toByte, 0.toShort, 0, 0L, Decimal.ZERO) - def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -480,7 +480,8 @@ object TypeCoercion { * This ensure that the types for various functions are as expected. */ object FunctionArgumentConversion extends TypeCoercionRule { - override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + override protected def coerceTypes( + plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -570,7 +571,8 @@ object TypeCoercion { * converted to fractional types. */ object Division extends TypeCoercionRule { - override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + override protected def coerceTypes( + plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { // Skip nodes who has not been resolved yet, // as this is an extra rule which should be applied at last. case e if !e.childrenResolved => e @@ -592,7 +594,8 @@ object TypeCoercion { * Coerces the type of different branches of a CASE WHEN statement to a common type. 
*/ object CaseWhenCoercion extends TypeCoercionRule { - override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + override protected def coerceTypes( + plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { case c: CaseWhen if c.childrenResolved && !c.valueTypesEqual => val maybeCommonType = findWiderCommonType(c.valueTypes) maybeCommonType.map { commonType => @@ -622,7 +625,8 @@ object TypeCoercion { * Coerces the type of different branches of If statement to a common type. */ object IfCoercion extends TypeCoercionRule { - override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + override protected def coerceTypes( + plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { case e if !e.childrenResolved => e // Find tightest common type for If, if the true value and false value have different types. case i @ If(pred, left, right) if left.dataType != right.dataType => @@ -662,7 +666,7 @@ object TypeCoercion { private val acceptedTypes = Seq(DateType, TimestampType, StringType) - def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -679,7 +683,8 @@ object TypeCoercion { * Casts types according to the expected input types for [[Expression]]s. */ object ImplicitTypeCasts extends TypeCoercionRule { - override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + override protected def coerceTypes( + plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { // Skip nodes who's children have not been resolved yet. case e if !e.childrenResolved => e @@ -796,7 +801,8 @@ object TypeCoercion { * Cast WindowFrame boundaries to the type they operate upon. 
*/ object WindowFrameCoercion extends TypeCoercionRule { - override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { + override protected def coerceTypes( + plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { case s @ WindowSpecDefinition(_, Seq(order), SpecifiedWindowFrame(RangeFrame, lower, upper)) if order.resolved => s.copy(frameSpecification = SpecifiedWindowFrame( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/timeZoneAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/timeZoneAnalysis.scala index a27aa845bf0ae..af1f9165b0044 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/timeZoneAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/timeZoneAnalysis.scala @@ -38,7 +38,7 @@ case class ResolveTimeZone(conf: SQLConf) extends Rule[LogicalPlan] { } override def apply(plan: LogicalPlan): LogicalPlan = - plan.resolveExpressions(transformTimeZoneExprs) + plan.transformAllExpressions(transformTimeZoneExprs) def resolveTimeZones(e: Expression): Expression = e.transform(transformTimeZoneExprs) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala index ea46dd7282401..3bbe41cf8f15e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala @@ -48,7 +48,7 @@ import org.apache.spark.sql.internal.SQLConf * completely resolved during the batch of Resolution. */ case class AliasViewChild(conf: SQLConf) extends Rule[LogicalPlan] with CastSupport { - override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case v @ View(desc, output, child) if child.resolved && output != child.output => val resolver = conf.resolver val queryColumnNames = desc.viewQueryColumnNames diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index 64b28565eb27c..2673bea648d09 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -270,7 +270,7 @@ object PullupCorrelatedPredicates extends Rule[LogicalPlan] with PredicateHelper /** * Pull up the correlated predicates and rewrite all subqueries in an operator tree.. */ - def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case f @ Filter(_, a: Aggregate) => rewriteSubQueries(f, Seq(a, a.child)) // Only a few unary nodes (Project/Filter/Aggregate) can contain subqueries. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 14188829db2af..a38458add7b5e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -33,58 +33,9 @@ abstract class LogicalPlan with QueryPlanConstraints with Logging { - private var _analyzed: Boolean = false - - /** - * Marks this plan as already analyzed. This should only be called by [[CheckAnalysis]]. - */ - private[catalyst] def setAnalyzed(): Unit = { _analyzed = true } - - /** - * Returns true if this node and its children have already been gone through analysis and - * verification. Note that this is only an optimization used to avoid analyzing trees that - * have already been analyzed, and can be reset by transformations. - */ - def analyzed: Boolean = _analyzed - /** Returns true if this subtree has data from a streaming data source. */ def isStreaming: Boolean = children.exists(_.isStreaming == true) - /** - * Returns a copy of this node where `rule` has been recursively applied first to all of its - * children and then itself (post-order). When `rule` does not apply to a given node, it is left - * unchanged. This function is similar to `transformUp`, but skips sub-trees that have already - * been marked as analyzed. - * - * @param rule the function use to transform this nodes children - */ - def resolveOperators(rule: PartialFunction[LogicalPlan, LogicalPlan]): LogicalPlan = { - if (!analyzed) { - val afterRuleOnChildren = mapChildren(_.resolveOperators(rule)) - if (this fastEquals afterRuleOnChildren) { - CurrentOrigin.withOrigin(origin) { - rule.applyOrElse(this, identity[LogicalPlan]) - } - } else { - CurrentOrigin.withOrigin(origin) { - rule.applyOrElse(afterRuleOnChildren, identity[LogicalPlan]) - } - } - } else { - this - } - } - - /** - * Recursively transforms the expressions of a tree, skipping nodes that have already - * been analyzed. - */ - def resolveExpressions(r: PartialFunction[Expression, Expression]): LogicalPlan = { - this resolveOperators { - case p => p.transformExpressions(r) - } - } - override def verboseStringWithSuffix: String = { super.verboseString + statsCache.map(", " + _.toString).getOrElse("") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index ba5f97d608feb..cd474551622d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -883,3 +883,22 @@ case class Deduplicate( override def output: Seq[Attribute] = child.output } + +/** + * A logical plan for setting a barrier of analysis. + * + * The SQL Analyzer goes through a whole query plan even most part of it is analyzed. This + * increases the time spent on query analysis for long pipelines in ML, especially. + * + * This logical plan wraps an analyzed logical plan to prevent it from analysis again. The barrier + * is applied to the analyzed logical plan in Dataset. It won't change the output of wrapped + * logical plan and just acts as a wrapper to hide it from analyzer. 
New operations on the dataset + * will be put on the barrier, so only the new nodes created will be analyzed. + * + * This analysis barrier will be removed at the end of analysis stage. + */ +case class AnalysisBarrier(child: LogicalPlan) extends LeafNode { + override def output: Seq[Attribute] = child.output + override def isStreaming: Boolean = child.isStreaming + override def doCanonicalize(): LogicalPlan = child.canonicalized +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 109fb32aa4a12..f4514205d3ae0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -543,4 +543,18 @@ class AnalysisSuite extends AnalysisTest with Matchers { checkPartitioning(numPartitions = 10, exprs = SortOrder('a.attr, Ascending), 'b.attr) } } + + test("SPARK-20392: analysis barrier") { + // [[AnalysisBarrier]] will be removed after analysis + checkAnalysis( + Project(Seq(UnresolvedAttribute("tbl.a")), + AnalysisBarrier(SubqueryAlias("tbl", testRelation))), + Project(testRelation.output, SubqueryAlias("tbl", testRelation))) + + // Verify we won't go through a plan wrapped in a barrier. + // Since we wrap an unresolved plan and analyzer won't go through it. It remains unresolved. + val barrier = AnalysisBarrier(Project(Seq(UnresolvedAttribute("tbl.b")), + SubqueryAlias("tbl", testRelation))) + assertAnalysisError(barrier, Seq("cannot resolve '`tbl.b`'")) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala index cdf912df7c76a..14041747fd20e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/plans/LogicalPlanSuite.scala @@ -23,8 +23,8 @@ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types.IntegerType /** - * This suite is used to test [[LogicalPlan]]'s `resolveOperators` and make sure it can correctly - * skips sub-trees that have already been marked as analyzed. + * This suite is used to test [[LogicalPlan]]'s `transformUp/transformDown` plus analysis barrier + * and make sure it can correctly skip sub-trees that have already been analyzed. 
*/ class LogicalPlanSuite extends SparkFunSuite { private var invocationCount = 0 @@ -36,39 +36,53 @@ class LogicalPlanSuite extends SparkFunSuite { private val testRelation = LocalRelation() - test("resolveOperator runs on operators") { + test("transformUp runs on operators") { invocationCount = 0 val plan = Project(Nil, testRelation) - plan resolveOperators function + plan transformUp function assert(invocationCount === 1) + + invocationCount = 0 + plan transformDown function + assert(invocationCount === 1) } - test("resolveOperator runs on operators recursively") { + test("transformUp runs on operators recursively") { invocationCount = 0 val plan = Project(Nil, Project(Nil, testRelation)) - plan resolveOperators function + plan transformUp function assert(invocationCount === 2) + + invocationCount = 0 + plan transformDown function + assert(invocationCount === 2) } - test("resolveOperator skips all ready resolved plans") { + test("transformUp skips all ready resolved plans wrapped in analysis barrier") { invocationCount = 0 - val plan = Project(Nil, Project(Nil, testRelation)) - plan.foreach(_.setAnalyzed()) - plan resolveOperators function + val plan = AnalysisBarrier(Project(Nil, Project(Nil, testRelation))) + plan transformUp function assert(invocationCount === 0) + + invocationCount = 0 + plan transformDown function + assert(invocationCount === 0) } - test("resolveOperator skips partially resolved plans") { + test("transformUp skips partially resolved plans wrapped in analysis barrier") { invocationCount = 0 - val plan1 = Project(Nil, testRelation) + val plan1 = AnalysisBarrier(Project(Nil, testRelation)) val plan2 = Project(Nil, plan1) - plan1.foreach(_.setAnalyzed()) - plan2 resolveOperators function + plan2 transformUp function assert(invocationCount === 1) + + invocationCount = 0 + plan2 transformDown function + assert(invocationCount === 1) } test("isStreaming") { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 167c9d050c3c4..c34cf0a7a7718 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -191,6 +191,9 @@ class Dataset[T] private[sql]( } } + // Wraps analyzed logical plans with an analysis barrier so we won't traverse/resolve it again. + @transient private val planWithBarrier = AnalysisBarrier(logicalPlan) + /** * Currently [[ExpressionEncoder]] is the only implementation of [[Encoder]], here we turn the * passed in encoder to [[ExpressionEncoder]] explicitly, and mark it implicit so that we can use @@ -403,7 +406,7 @@ class Dataset[T] private[sql]( */ @Experimental @InterfaceStability.Evolving - def as[U : Encoder]: Dataset[U] = Dataset[U](sparkSession, logicalPlan) + def as[U : Encoder]: Dataset[U] = Dataset[U](sparkSession, planWithBarrier) /** * Converts this strongly typed collection of data to generic `DataFrame` with columns renamed. 
@@ -604,7 +607,7 @@ class Dataset[T] private[sql]( require(parsedDelay.milliseconds >= 0 && parsedDelay.months >= 0, s"delay threshold ($delayThreshold) should not be negative.") EliminateEventTimeWatermark( - EventTimeWatermark(UnresolvedAttribute(eventTime), parsedDelay, logicalPlan)) + EventTimeWatermark(UnresolvedAttribute(eventTime), parsedDelay, planWithBarrier)) } /** @@ -777,7 +780,7 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ def join(right: Dataset[_]): DataFrame = withPlan { - Join(logicalPlan, right.logicalPlan, joinType = Inner, None) + Join(planWithBarrier, right.planWithBarrier, joinType = Inner, None) } /** @@ -855,7 +858,7 @@ class Dataset[T] private[sql]( // Analyze the self join. The assumption is that the analyzer will disambiguate left vs right // by creating a new instance for one of the branch. val joined = sparkSession.sessionState.executePlan( - Join(logicalPlan, right.logicalPlan, joinType = JoinType(joinType), None)) + Join(planWithBarrier, right.planWithBarrier, joinType = JoinType(joinType), None)) .analyzed.asInstanceOf[Join] withPlan { @@ -916,7 +919,7 @@ class Dataset[T] private[sql]( // Trigger analysis so in the case of self-join, the analyzer will clone the plan. // After the cloning, left and right side will have distinct expression ids. val plan = withPlan( - Join(logicalPlan, right.logicalPlan, JoinType(joinType), Some(joinExprs.expr))) + Join(planWithBarrier, right.planWithBarrier, JoinType(joinType), Some(joinExprs.expr))) .queryExecution.analyzed.asInstanceOf[Join] // If auto self join alias is disabled, return the plan. @@ -925,8 +928,8 @@ class Dataset[T] private[sql]( } // If left/right have no output set intersection, return the plan. - val lanalyzed = withPlan(this.logicalPlan).queryExecution.analyzed - val ranalyzed = withPlan(right.logicalPlan).queryExecution.analyzed + val lanalyzed = withPlan(this.planWithBarrier).queryExecution.analyzed + val ranalyzed = withPlan(right.planWithBarrier).queryExecution.analyzed if (lanalyzed.outputSet.intersect(ranalyzed.outputSet).isEmpty) { return withPlan(plan) } @@ -958,7 +961,7 @@ class Dataset[T] private[sql]( * @since 2.1.0 */ def crossJoin(right: Dataset[_]): DataFrame = withPlan { - Join(logicalPlan, right.logicalPlan, joinType = Cross, None) + Join(planWithBarrier, right.planWithBarrier, joinType = Cross, None) } /** @@ -990,8 +993,8 @@ class Dataset[T] private[sql]( // etc. 
val joined = sparkSession.sessionState.executePlan( Join( - this.logicalPlan, - other.logicalPlan, + this.planWithBarrier, + other.planWithBarrier, JoinType(joinType), Some(condition.expr))).analyzed.asInstanceOf[Join] @@ -1212,7 +1215,7 @@ class Dataset[T] private[sql]( * @since 1.6.0 */ def as(alias: String): Dataset[T] = withTypedPlan { - SubqueryAlias(alias, logicalPlan) + SubqueryAlias(alias, planWithBarrier) } /** @@ -1250,7 +1253,7 @@ class Dataset[T] private[sql]( */ @scala.annotation.varargs def select(cols: Column*): DataFrame = withPlan { - Project(cols.map(_.named), logicalPlan) + Project(cols.map(_.named), planWithBarrier) } /** @@ -1305,8 +1308,8 @@ class Dataset[T] private[sql]( @InterfaceStability.Evolving def select[U1](c1: TypedColumn[T, U1]): Dataset[U1] = { implicit val encoder = c1.encoder - val project = Project(c1.withInputType(exprEnc, logicalPlan.output).named :: Nil, - logicalPlan) + val project = Project(c1.withInputType(exprEnc, planWithBarrier.output).named :: Nil, + planWithBarrier) if (encoder.flat) { new Dataset[U1](sparkSession, project, encoder) @@ -1324,8 +1327,8 @@ class Dataset[T] private[sql]( protected def selectUntyped(columns: TypedColumn[_, _]*): Dataset[_] = { val encoders = columns.map(_.encoder) val namedColumns = - columns.map(_.withInputType(exprEnc, logicalPlan.output).named) - val execution = new QueryExecution(sparkSession, Project(namedColumns, logicalPlan)) + columns.map(_.withInputType(exprEnc, planWithBarrier.output).named) + val execution = new QueryExecution(sparkSession, Project(namedColumns, planWithBarrier)) new Dataset(sparkSession, execution, ExpressionEncoder.tuple(encoders)) } @@ -1401,7 +1404,7 @@ class Dataset[T] private[sql]( * @since 1.6.0 */ def filter(condition: Column): Dataset[T] = withTypedPlan { - Filter(condition.expr, logicalPlan) + Filter(condition.expr, planWithBarrier) } /** @@ -1578,7 +1581,7 @@ class Dataset[T] private[sql]( @Experimental @InterfaceStability.Evolving def groupByKey[K: Encoder](func: T => K): KeyValueGroupedDataset[K, T] = { - val inputPlan = logicalPlan + val inputPlan = planWithBarrier val withGroupingKey = AppendColumns(func, inputPlan) val executed = sparkSession.sessionState.executePlan(withGroupingKey) @@ -1724,7 +1727,7 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ def limit(n: Int): Dataset[T] = withTypedPlan { - Limit(Literal(n), logicalPlan) + Limit(Literal(n), planWithBarrier) } /** @@ -1774,7 +1777,7 @@ class Dataset[T] private[sql]( def union(other: Dataset[T]): Dataset[T] = withSetOperator { // This breaks caching, but it's usually ok because it addresses a very specific use case: // using union to union many files or partitions. - CombineUnions(Union(logicalPlan, other.logicalPlan)) + CombineUnions(Union(logicalPlan, other.logicalPlan)).mapChildren(AnalysisBarrier) } /** @@ -1833,7 +1836,7 @@ class Dataset[T] private[sql]( // This breaks caching, but it's usually ok because it addresses a very specific use case: // using union to union many files or partitions. 
- CombineUnions(Union(logicalPlan, rightChild)) + CombineUnions(Union(logicalPlan, rightChild)).mapChildren(AnalysisBarrier) } /** @@ -1847,7 +1850,7 @@ class Dataset[T] private[sql]( * @since 1.6.0 */ def intersect(other: Dataset[T]): Dataset[T] = withSetOperator { - Intersect(logicalPlan, other.logicalPlan) + Intersect(planWithBarrier, other.planWithBarrier) } /** @@ -1861,7 +1864,7 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ def except(other: Dataset[T]): Dataset[T] = withSetOperator { - Except(logicalPlan, other.logicalPlan) + Except(planWithBarrier, other.planWithBarrier) } /** @@ -1912,7 +1915,7 @@ class Dataset[T] private[sql]( */ def sample(withReplacement: Boolean, fraction: Double, seed: Long): Dataset[T] = { withTypedPlan { - Sample(0.0, fraction, withReplacement, seed, logicalPlan) + Sample(0.0, fraction, withReplacement, seed, planWithBarrier) } } @@ -1954,15 +1957,15 @@ class Dataset[T] private[sql]( // overlapping splits. To prevent this, we explicitly sort each input partition to make the // ordering deterministic. Note that MapTypes cannot be sorted and are explicitly pruned out // from the sort order. - val sortOrder = logicalPlan.output + val sortOrder = planWithBarrier.output .filter(attr => RowOrdering.isOrderable(attr.dataType)) .map(SortOrder(_, Ascending)) val plan = if (sortOrder.nonEmpty) { - Sort(sortOrder, global = false, logicalPlan) + Sort(sortOrder, global = false, planWithBarrier) } else { // SPARK-12662: If sort order is empty, we materialize the dataset to guarantee determinism cache() - logicalPlan + planWithBarrier } val sum = weights.sum val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _) @@ -2046,7 +2049,7 @@ class Dataset[T] private[sql]( withPlan { Generate(generator, join = true, outer = false, - qualifier = None, generatorOutput = Nil, logicalPlan) + qualifier = None, generatorOutput = Nil, planWithBarrier) } } @@ -2087,7 +2090,7 @@ class Dataset[T] private[sql]( withPlan { Generate(generator, join = true, outer = false, - qualifier = None, generatorOutput = Nil, logicalPlan) + qualifier = None, generatorOutput = Nil, planWithBarrier) } } @@ -2235,7 +2238,7 @@ class Dataset[T] private[sql]( u.name, sparkSession.sessionState.analyzer.resolver).getOrElse(u) case Column(expr: Expression) => expr } - val attrs = this.logicalPlan.output + val attrs = this.planWithBarrier.output val colsAfterDrop = attrs.filter { attr => attr != expression }.map(attr => Column(attr)) @@ -2283,7 +2286,7 @@ class Dataset[T] private[sql]( } cols } - Deduplicate(groupCols, logicalPlan) + Deduplicate(groupCols, planWithBarrier) } /** @@ -2465,7 +2468,7 @@ class Dataset[T] private[sql]( @Experimental @InterfaceStability.Evolving def filter(func: T => Boolean): Dataset[T] = { - withTypedPlan(TypedFilter(func, logicalPlan)) + withTypedPlan(TypedFilter(func, planWithBarrier)) } /** @@ -2479,7 +2482,7 @@ class Dataset[T] private[sql]( @Experimental @InterfaceStability.Evolving def filter(func: FilterFunction[T]): Dataset[T] = { - withTypedPlan(TypedFilter(func, logicalPlan)) + withTypedPlan(TypedFilter(func, planWithBarrier)) } /** @@ -2493,7 +2496,7 @@ class Dataset[T] private[sql]( @Experimental @InterfaceStability.Evolving def map[U : Encoder](func: T => U): Dataset[U] = withTypedPlan { - MapElements[T, U](func, logicalPlan) + MapElements[T, U](func, planWithBarrier) } /** @@ -2508,7 +2511,7 @@ class Dataset[T] private[sql]( @InterfaceStability.Evolving def map[U](func: MapFunction[T, U], encoder: Encoder[U]): Dataset[U] = { implicit val uEnc = encoder 
- withTypedPlan(MapElements[T, U](func, logicalPlan)) + withTypedPlan(MapElements[T, U](func, planWithBarrier)) } /** @@ -2524,7 +2527,7 @@ class Dataset[T] private[sql]( def mapPartitions[U : Encoder](func: Iterator[T] => Iterator[U]): Dataset[U] = { new Dataset[U]( sparkSession, - MapPartitions[T, U](func, logicalPlan), + MapPartitions[T, U](func, planWithBarrier), implicitly[Encoder[U]]) } @@ -2555,7 +2558,7 @@ class Dataset[T] private[sql]( val rowEncoder = encoder.asInstanceOf[ExpressionEncoder[Row]] Dataset.ofRows( sparkSession, - MapPartitionsInR(func, packageNames, broadcastVars, schema, rowEncoder, logicalPlan)) + MapPartitionsInR(func, packageNames, broadcastVars, schema, rowEncoder, planWithBarrier)) } /** @@ -2719,7 +2722,7 @@ class Dataset[T] private[sql]( * @since 1.6.0 */ def repartition(numPartitions: Int): Dataset[T] = withTypedPlan { - Repartition(numPartitions, shuffle = true, logicalPlan) + Repartition(numPartitions, shuffle = true, planWithBarrier) } /** @@ -2742,7 +2745,7 @@ class Dataset[T] private[sql]( |For range partitioning use repartitionByRange(...) instead. """.stripMargin) withTypedPlan { - RepartitionByExpression(partitionExprs.map(_.expr), logicalPlan, numPartitions) + RepartitionByExpression(partitionExprs.map(_.expr), planWithBarrier, numPartitions) } } @@ -2779,7 +2782,7 @@ class Dataset[T] private[sql]( case expr: Expression => SortOrder(expr, Ascending) }) withTypedPlan { - RepartitionByExpression(sortOrder, logicalPlan, numPartitions) + RepartitionByExpression(sortOrder, planWithBarrier, numPartitions) } } @@ -2817,7 +2820,7 @@ class Dataset[T] private[sql]( * @since 1.6.0 */ def coalesce(numPartitions: Int): Dataset[T] = withTypedPlan { - Repartition(numPartitions, shuffle = false, logicalPlan) + Repartition(numPartitions, shuffle = false, planWithBarrier) } /** @@ -2900,7 +2903,7 @@ class Dataset[T] private[sql]( // Represents the `QueryExecution` used to produce the content of the Dataset as an `RDD`. 
@transient private lazy val rddQueryExecution: QueryExecution = { - val deserialized = CatalystSerde.deserialize[T](logicalPlan) + val deserialized = CatalystSerde.deserialize[T](planWithBarrier) sparkSession.sessionState.executePlan(deserialized) } @@ -3026,7 +3029,7 @@ class Dataset[T] private[sql]( comment = None, properties = Map.empty, originalText = None, - child = logicalPlan, + child = planWithBarrier, allowExisting = false, replace = replace, viewType = viewType) @@ -3226,7 +3229,7 @@ class Dataset[T] private[sql]( } } withTypedPlan { - Sort(sortOrder, global = global, logicalPlan) + Sort(sortOrder, global = global, planWithBarrier) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 6e08df75b8a4c..f64e079539c4f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -39,7 +39,7 @@ class ResolveSQLOnFile(sparkSession: SparkSession) extends Rule[LogicalPlan] { sparkSession.sessionState.conf.runSQLonFile && u.tableIdentifier.database.isDefined } - def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case u: UnresolvedRelation if maybeSQLFile(u) => try { val dataSource = DataSource( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index c25c90d0c70e2..b50642d275ba8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -241,7 +241,7 @@ class PlannerSuite extends SharedSQLContext { test("collapse adjacent repartitions") { val doubleRepartitioned = testData.repartition(10).repartition(20).coalesce(5) def countRepartitions(plan: LogicalPlan): Int = plan.collect { case r: Repartition => r }.length - assert(countRepartitions(doubleRepartitioned.queryExecution.logical) === 3) + assert(countRepartitions(doubleRepartitioned.queryExecution.analyzed) === 3) assert(countRepartitions(doubleRepartitioned.queryExecution.optimizedPlan) === 2) doubleRepartitioned.queryExecution.optimizedPlan match { case Repartition (numPartitions, shuffle, Repartition(_, shuffleChild, _)) => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala index 3018c0642f062..a7961c757efa8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala @@ -87,7 +87,7 @@ class ResolveHiveSerdeTable(session: SparkSession) extends Rule[LogicalPlan] { } } - override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case c @ CreateTable(t, _, query) if DDLUtils.isHiveTable(t) => // Finds the database name if the name does not exist. 
val dbName = t.identifier.database.getOrElse(session.catalog.currentDatabase) @@ -114,7 +114,7 @@ class ResolveHiveSerdeTable(session: SparkSession) extends Rule[LogicalPlan] { } class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] { - override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case relation: HiveTableRelation if DDLUtils.isHiveTable(relation.tableMeta) && relation.tableMeta.stats.isEmpty => val table = relation.tableMeta @@ -145,7 +145,7 @@ class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] { * `PreprocessTableInsertion`. */ object HiveAnalysis extends Rule[LogicalPlan] { - override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { + override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case InsertIntoTable(r: HiveTableRelation, partSpec, query, overwrite, ifPartitionNotExists) if DDLUtils.isHiveTable(r.tableMeta) => InsertIntoHiveTable(r.tableMeta, partSpec, query, overwrite, ifPartitionNotExists) From fb6a9227516f893815fb0b6d26b578e21badd664 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 6 Dec 2017 20:20:20 +0900 Subject: [PATCH 047/356] [SPARK-20728][SQL][FOLLOWUP] Use an actionable exception message ## What changes were proposed in this pull request? This is a follow-up of https://github.com/apache/spark/pull/19871 to improve an exception message. ## How was this patch tested? Pass the Jenkins. Author: Dongjoon Hyun Closes #19903 from dongjoon-hyun/orc_exception. --- .../spark/sql/execution/datasources/DataSource.scala | 8 ++++---- .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala index 5f12d5f93a35c..b676672b38cd0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala @@ -598,11 +598,11 @@ object DataSource extends Logging { // Found the data source using fully qualified path dataSource case Failure(error) => - if (provider1.toLowerCase(Locale.ROOT) == "orc" || - provider1.startsWith("org.apache.spark.sql.hive.orc")) { + if (provider1.startsWith("org.apache.spark.sql.hive.orc")) { throw new AnalysisException( - "Hive-based ORC data source must be used with Hive support enabled. " + - "Please use native ORC data source instead") + "Hive built-in ORC data source must be used with Hive support enabled. 
" + + "Please use the native ORC data source by setting 'spark.sql.orc.impl' to " + + "'native'") } else if (provider1.toLowerCase(Locale.ROOT) == "avro" || provider1 == "com.databricks.spark.avro") { throw new AnalysisException( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 86bd9b95bca6a..8ddddbeee598f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1666,7 +1666,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { e = intercept[AnalysisException] { sql(s"select id from `org.apache.spark.sql.hive.orc`.`file_path`") } - assert(e.message.contains("Hive-based ORC data source must be used with Hive support")) + assert(e.message.contains("Hive built-in ORC data source must be used with Hive support")) e = intercept[AnalysisException] { sql(s"select id from `com.databricks.spark.avro`.`file_path`") @@ -2790,7 +2790,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { val e = intercept[AnalysisException] { sql("CREATE TABLE spark_20728(a INT) USING ORC") } - assert(e.message.contains("Hive-based ORC data source must be used with Hive support")) + assert(e.message.contains("Hive built-in ORC data source must be used with Hive support")) } withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "native") { From 6f41c593bbefa946d13b62ecf4e85074fd3c1541 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 6 Dec 2017 08:27:17 -0800 Subject: [PATCH 048/356] [SPARK-22690][ML] Imputer inherit HasOutputCols ## What changes were proposed in this pull request? make `Imputer` inherit `HasOutputCols` ## How was this patch tested? existing tests Author: Zheng RuiFeng Closes #19889 from zhengruifeng/using_HasOutputCols. --- .../org/apache/spark/ml/feature/Imputer.scala | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala index 4663f16b5f5dc..730ee9fc08db8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Imputer.scala @@ -23,7 +23,7 @@ import org.apache.spark.SparkException import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.HasInputCols +import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCols} import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ @@ -32,7 +32,7 @@ import org.apache.spark.sql.types._ /** * Params for [[Imputer]] and [[ImputerModel]]. */ -private[feature] trait ImputerParams extends Params with HasInputCols { +private[feature] trait ImputerParams extends Params with HasInputCols with HasOutputCols { /** * The imputation strategy. Currently only "mean" and "median" are supported. @@ -63,16 +63,6 @@ private[feature] trait ImputerParams extends Params with HasInputCols { /** @group getParam */ def getMissingValue: Double = $(missingValue) - /** - * Param for output column names. 
- * @group param - */ - final val outputCols: StringArrayParam = new StringArrayParam(this, "outputCols", - "output column names") - - /** @group getParam */ - final def getOutputCols: Array[String] = $(outputCols) - /** Validates and transforms the input schema. */ protected def validateAndTransformSchema(schema: StructType): StructType = { require($(inputCols).length == $(inputCols).distinct.length, s"inputCols contains" + From 813c0f945d7f03800975eaed26b86a1f30e513c9 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 7 Dec 2017 00:45:51 +0800 Subject: [PATCH 049/356] [SPARK-22704][SQL] Least and Greatest use less global variables ## What changes were proposed in this pull request? This PR accomplishes the following two items. 1. Reduce # of global variables from two to one 2. Make lifetime of global variable local within an operation Item 1. reduces # of constant pool entries in a Java class. Item 2. ensures that an variable is not passed to arguments in a method split by `CodegenContext.splitExpressions()`, which is addressed by #19865. ## How was this patch tested? Added new test into `ArithmeticExpressionSuite` Author: Kazuaki Ishizaki Closes #19899 from kiszk/SPARK-22704. --- .../sql/catalyst/expressions/arithmetic.scala | 94 ++++++++++++------- .../ArithmeticExpressionSuite.scala | 11 +++ 2 files changed, 73 insertions(+), 32 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index 739bd13c5078d..1893eec22b65d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -602,23 +602,38 @@ case class Least(children: Seq[Expression]) extends Expression { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val evalChildren = children.map(_.genCode(ctx)) - ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.isNull) - ctx.addMutableState(ctx.javaType(dataType), ev.value) - def updateEval(eval: ExprCode): String = { + val tmpIsNull = ctx.freshName("leastTmpIsNull") + ctx.addMutableState(ctx.JAVA_BOOLEAN, tmpIsNull) + val evals = evalChildren.map(eval => s""" - ${eval.code} - if (!${eval.isNull} && (${ev.isNull} || - ${ctx.genGreater(dataType, ev.value, eval.value)})) { - ${ev.isNull} = false; - ${ev.value} = ${eval.value}; - } - """ - } - val codes = ctx.splitExpressionsWithCurrentInputs(evalChildren.map(updateEval)) - ev.copy(code = s""" - ${ev.isNull} = true; - ${ev.value} = ${ctx.defaultValue(dataType)}; - $codes""") + |${eval.code} + |if (!${eval.isNull} && ($tmpIsNull || + | ${ctx.genGreater(dataType, ev.value, eval.value)})) { + | $tmpIsNull = false; + | ${ev.value} = ${eval.value}; + |} + """.stripMargin + ) + + val resultType = ctx.javaType(dataType) + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = evals, + funcName = "least", + extraArguments = Seq(resultType -> ev.value), + returnType = resultType, + makeSplitFunction = body => + s""" + |$body + |return ${ev.value}; + """.stripMargin, + foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) + ev.copy(code = + s""" + |$tmpIsNull = true; + |${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; + |$codes + |final boolean ${ev.isNull} = $tmpIsNull; + """.stripMargin) } } @@ -668,22 +683,37 @@ case class Greatest(children: Seq[Expression]) extends Expression { override def 
doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val evalChildren = children.map(_.genCode(ctx)) - ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.isNull) - ctx.addMutableState(ctx.javaType(dataType), ev.value) - def updateEval(eval: ExprCode): String = { + val tmpIsNull = ctx.freshName("greatestTmpIsNull") + ctx.addMutableState(ctx.JAVA_BOOLEAN, tmpIsNull) + val evals = evalChildren.map(eval => s""" - ${eval.code} - if (!${eval.isNull} && (${ev.isNull} || - ${ctx.genGreater(dataType, eval.value, ev.value)})) { - ${ev.isNull} = false; - ${ev.value} = ${eval.value}; - } - """ - } - val codes = ctx.splitExpressionsWithCurrentInputs(evalChildren.map(updateEval)) - ev.copy(code = s""" - ${ev.isNull} = true; - ${ev.value} = ${ctx.defaultValue(dataType)}; - $codes""") + |${eval.code} + |if (!${eval.isNull} && ($tmpIsNull || + | ${ctx.genGreater(dataType, eval.value, ev.value)})) { + | $tmpIsNull = false; + | ${ev.value} = ${eval.value}; + |} + """.stripMargin + ) + + val resultType = ctx.javaType(dataType) + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = evals, + funcName = "greatest", + extraArguments = Seq(resultType -> ev.value), + returnType = resultType, + makeSplitFunction = body => + s""" + |$body + |return ${ev.value}; + """.stripMargin, + foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) + ev.copy(code = + s""" + |$tmpIsNull = true; + |${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; + |$codes + |final boolean ${ev.isNull} = $tmpIsNull; + """.stripMargin) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala index fb759eba6a9e2..be638d80e45d8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala @@ -23,6 +23,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.types._ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { @@ -343,4 +344,14 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper checkEvaluation(Least(inputsExpr), "s" * 1, EmptyRow) checkEvaluation(Greatest(inputsExpr), "s" * N, EmptyRow) } + + test("SPARK-22704: Least and greatest use less global variables") { + val ctx1 = new CodegenContext() + Least(Seq(Literal(1), Literal(1))).genCode(ctx1) + assert(ctx1.mutableStates.size == 1) + + val ctx2 = new CodegenContext() + Greatest(Seq(Literal(1), Literal(1))).genCode(ctx2) + assert(ctx2.mutableStates.size == 1) + } } From e98f9647f44d1071a6b070db070841b8cda6bd7a Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Thu, 7 Dec 2017 00:50:49 +0800 Subject: [PATCH 050/356] [SPARK-22695][SQL] ScalaUDF should not use global variables ## What changes were proposed in this pull request? ScalaUDF is using global variables which are not needed. This can generate some unneeded entries in the constant pool. The PR replaces the unneeded global variables with local variables. ## How was this patch tested? 
added UT Author: Marco Gaido Author: Marco Gaido Closes #19900 from mgaido91/SPARK-22695. --- .../sql/catalyst/expressions/ScalaUDF.scala | 88 ++++++++++--------- .../catalyst/expressions/ScalaUDFSuite.scala | 6 ++ 2 files changed, 51 insertions(+), 43 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala index 179853032035e..4d26d9819321b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala @@ -982,35 +982,28 @@ case class ScalaUDF( // scalastyle:on line.size.limit - // Generate codes used to convert the arguments to Scala type for user-defined functions - private[this] def genCodeForConverter(ctx: CodegenContext, index: Int): String = { - val converterClassName = classOf[Any => Any].getName - val typeConvertersClassName = CatalystTypeConverters.getClass.getName + ".MODULE$" - val expressionClassName = classOf[Expression].getName - val scalaUDFClassName = classOf[ScalaUDF].getName + private val converterClassName = classOf[Any => Any].getName + private val scalaUDFClassName = classOf[ScalaUDF].getName + private val typeConvertersClassName = CatalystTypeConverters.getClass.getName + ".MODULE$" + // Generate codes used to convert the arguments to Scala type for user-defined functions + private[this] def genCodeForConverter(ctx: CodegenContext, index: Int): (String, String) = { val converterTerm = ctx.freshName("converter") val expressionIdx = ctx.references.size - 1 - ctx.addMutableState(converterClassName, converterTerm, - s"$converterTerm = ($converterClassName)$typeConvertersClassName" + - s".createToScalaConverter(((${expressionClassName})((($scalaUDFClassName)" + - s"references[$expressionIdx]).getChildren().apply($index))).dataType());") - converterTerm + (converterTerm, + s"$converterClassName $converterTerm = ($converterClassName)$typeConvertersClassName" + + s".createToScalaConverter(((Expression)((($scalaUDFClassName)" + + s"references[$expressionIdx]).getChildren().apply($index))).dataType());") } override def doGenCode( ctx: CodegenContext, ev: ExprCode): ExprCode = { + val scalaUDF = ctx.freshName("scalaUDF") + val scalaUDFRef = ctx.addReferenceMinorObj(this, scalaUDFClassName) - val scalaUDF = ctx.addReferenceObj("scalaUDF", this) - val converterClassName = classOf[Any => Any].getName - val typeConvertersClassName = CatalystTypeConverters.getClass.getName + ".MODULE$" - - // Generate codes used to convert the returned value of user-defined functions to Catalyst type + // Object to convert the returned value of user-defined functions to Catalyst type val catalystConverterTerm = ctx.freshName("catalystConverter") - ctx.addMutableState(converterClassName, catalystConverterTerm, - s"$catalystConverterTerm = ($converterClassName)$typeConvertersClassName" + - s".createToCatalystConverter($scalaUDF.dataType());") val resultTerm = ctx.freshName("result") @@ -1022,8 +1015,6 @@ case class ScalaUDF( val funcClassName = s"scala.Function${children.size}" val funcTerm = ctx.freshName("udf") - ctx.addMutableState(funcClassName, funcTerm, - s"$funcTerm = ($funcClassName)$scalaUDF.userDefinedFunc();") // codegen for children expressions val evals = children.map(_.genCode(ctx)) @@ -1033,34 +1024,45 @@ case class ScalaUDF( // such as IntegerType, its javaType is `int` and the returned type of user-defined // function is 
Object. Trying to convert an Object to `int` will cause casting exception. val evalCode = evals.map(_.code).mkString - val (converters, funcArguments) = converterTerms.zipWithIndex.map { case (converter, i) => - val eval = evals(i) - val argTerm = ctx.freshName("arg") - val convert = s"Object $argTerm = ${eval.isNull} ? null : $converter.apply(${eval.value});" - (convert, argTerm) + val (converters, funcArguments) = converterTerms.zipWithIndex.map { + case ((convName, convInit), i) => + val eval = evals(i) + val argTerm = ctx.freshName("arg") + val convert = + s""" + |$convInit + |Object $argTerm = ${eval.isNull} ? null : $convName.apply(${eval.value}); + """.stripMargin + (convert, argTerm) }.unzip val getFuncResult = s"$funcTerm.apply(${funcArguments.mkString(", ")})" val callFunc = s""" - ${ctx.boxedType(dataType)} $resultTerm = null; - try { - $resultTerm = (${ctx.boxedType(dataType)})$catalystConverterTerm.apply($getFuncResult); - } catch (Exception e) { - throw new org.apache.spark.SparkException($scalaUDF.udfErrorMessage(), e); - } - """ + |${ctx.boxedType(dataType)} $resultTerm = null; + |$scalaUDFClassName $scalaUDF = $scalaUDFRef; + |try { + | $funcClassName $funcTerm = ($funcClassName)$scalaUDF.userDefinedFunc(); + | $converterClassName $catalystConverterTerm = ($converterClassName) + | $typeConvertersClassName.createToCatalystConverter($scalaUDF.dataType()); + | $resultTerm = (${ctx.boxedType(dataType)})$catalystConverterTerm.apply($getFuncResult); + |} catch (Exception e) { + | throw new org.apache.spark.SparkException($scalaUDF.udfErrorMessage(), e); + |} + """.stripMargin - ev.copy(code = s""" - $evalCode - ${converters.mkString("\n")} - $callFunc - - boolean ${ev.isNull} = $resultTerm == null; - ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; - if (!${ev.isNull}) { - ${ev.value} = $resultTerm; - }""") + ev.copy(code = + s""" + |$evalCode + |${converters.mkString("\n")} + |$callFunc + | + |boolean ${ev.isNull} = $resultTerm == null; + |${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; + |if (!${ev.isNull}) { + | ${ev.value} = $resultTerm; + |} + """.stripMargin) } private[this] val converter = CatalystTypeConverters.createToCatalystConverter(dataType) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala index 13bd363c8b692..70dea4b39d55d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions import java.util.Locale import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { @@ -47,4 +48,9 @@ class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { assert(e2.getMessage.contains("Failed to execute user defined function")) } + test("SPARK-22695: ScalaUDF should not use global variables") { + val ctx = new CodegenContext + ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil).genCode(ctx) + assert(ctx.mutableStates.isEmpty) + } } From 4286cba7dacf4b457fff91da3743ac2518699945 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 6 Dec 2017 10:11:25 -0800 Subject: [PATCH 
051/356] [SPARK-22710] ConfigBuilder.fallbackConf should trigger onCreate function ## What changes were proposed in this pull request? I was looking at the config code today and found that configs defined using ConfigBuilder.fallbackConf didn't trigger onCreate function. This patch fixes it. This doesn't require backporting since we currently have no configs that use it. ## How was this patch tested? Added a test case for all the config final creator functions in ConfigEntrySuite. Author: Reynold Xin Closes #19905 from rxin/SPARK-22710. --- .../spark/internal/config/ConfigBuilder.scala | 4 +++- .../internal/config/ConfigEntrySuite.scala | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala index 8f4c1b60920db..b0cd7110a3b47 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigBuilder.scala @@ -235,7 +235,9 @@ private[spark] case class ConfigBuilder(key: String) { } def fallbackConf[T](fallback: ConfigEntry[T]): ConfigEntry[T] = { - new FallbackConfigEntry(key, _alternatives, _doc, _public, fallback) + val entry = new FallbackConfigEntry(key, _alternatives, _doc, _public, fallback) + _onCreate.foreach(_(entry)) + entry } def regexConf: TypedConfigBuilder[Regex] = { diff --git a/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala b/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala index bf08276dbf971..02514dc7daef4 100644 --- a/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/config/ConfigEntrySuite.scala @@ -288,4 +288,24 @@ class ConfigEntrySuite extends SparkFunSuite { conf.remove(testKey("b")) assert(conf.get(iConf) === 3) } + + test("onCreate") { + var onCreateCalled = false + ConfigBuilder(testKey("oc1")).onCreate(_ => onCreateCalled = true).intConf.createWithDefault(1) + assert(onCreateCalled) + + onCreateCalled = false + ConfigBuilder(testKey("oc2")).onCreate(_ => onCreateCalled = true).intConf.createOptional + assert(onCreateCalled) + + onCreateCalled = false + ConfigBuilder(testKey("oc3")).onCreate(_ => onCreateCalled = true).intConf + .createWithDefaultString("1.0") + assert(onCreateCalled) + + val fallback = ConfigBuilder(testKey("oc4")).intConf.createWithDefault(1) + onCreateCalled = false + ConfigBuilder(testKey("oc5")).onCreate(_ => onCreateCalled = true).fallbackConf(fallback) + assert(onCreateCalled) + } } From 51066b437b750933da03bb401c8356f4b48aefac Mon Sep 17 00:00:00 2001 From: Devaraj K Date: Wed, 6 Dec 2017 10:39:15 -0800 Subject: [PATCH 052/356] [SPARK-14228][CORE][YARN] Lost executor of RPC disassociated, and occurs exception: Could not find CoarseGrainedScheduler or it has been stopped ## What changes were proposed in this pull request? I see the two instances where the exception is occurring. **Instance 1:** ``` 17/11/10 15:49:32 ERROR util.Utils: Uncaught exception in thread driver-revive-thread org.apache.spark.SparkException: Could not find CoarseGrainedScheduler. 
at org.apache.spark.rpc.netty.Dispatcher.postMessage(Dispatcher.scala:160) at org.apache.spark.rpc.netty.Dispatcher.postOneWayMessage(Dispatcher.scala:140) at org.apache.spark.rpc.netty.NettyRpcEnv.send(NettyRpcEnv.scala:187) at org.apache.spark.rpc.netty.NettyRpcEndpointRef.send(NettyRpcEnv.scala:521) at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend$DriverEndpoint$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(CoarseGrainedSchedulerBackend.scala:125) at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend$DriverEndpoint$$anon$1$$anonfun$run$1$$anonfun$apply$mcV$sp$1.apply(CoarseGrainedSchedulerBackend.scala:125) at scala.Option.foreach(Option.scala:257) at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend$DriverEndpoint$$anon$1$$anonfun$run$1.apply$mcV$sp(CoarseGrainedSchedulerBackend.scala:125) at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1344) at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend$DriverEndpoint$$anon$1.run(CoarseGrainedSchedulerBackend.scala:124) at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:308) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:180) at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:294) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) ``` In CoarseGrainedSchedulerBackend.scala, driver-revive-thread starts with DriverEndpoint.onStart() and keeps sending the ReviveOffers messages periodically till it gets shutdown as part DriverEndpoint.onStop(). There is no proper coordination between the driver-revive-thread(shutdown) and the RpcEndpoint unregister, RpcEndpoint unregister happens first and then driver-revive-thread shuts down as part of DriverEndpoint.onStop(), In-between driver-revive-thread may try to send the ReviveOffers message which is leading to the above exception. To fix this issue, this PR moves the shutting down of driver-revive-thread to CoarseGrainedSchedulerBackend.stop() which executes before the DriverEndpoint unregister. **Instance 2:** ``` 17/11/10 16:31:38 ERROR cluster.YarnSchedulerBackend$YarnSchedulerEndpoint: Error requesting driver to remove executor 1 for reason Executor for container container_1508535467865_0226_01_000002 exited because of a YARN event (e.g., pre-emption) and not because of an error in the running job. org.apache.spark.SparkException: Could not find CoarseGrainedScheduler. 
at org.apache.spark.rpc.netty.Dispatcher.postMessage(Dispatcher.scala:160) at org.apache.spark.rpc.netty.Dispatcher.postLocalMessage(Dispatcher.scala:135) at org.apache.spark.rpc.netty.NettyRpcEnv.ask(NettyRpcEnv.scala:229) at org.apache.spark.rpc.netty.NettyRpcEndpointRef.ask(NettyRpcEnv.scala:516) at org.apache.spark.rpc.RpcEndpointRef.ask(RpcEndpointRef.scala:63) at org.apache.spark.scheduler.cluster.YarnSchedulerBackend$YarnSchedulerEndpoint$$anonfun$receive$1.applyOrElse(YarnSchedulerBackend.scala:269) at org.apache.spark.rpc.netty.Inbox$$anonfun$process$1.apply$mcV$sp(Inbox.scala:117) at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:205) at org.apache.spark.rpc.netty.Inbox.process(Inbox.scala:101) at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:221) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) ``` Here YarnDriverEndpoint tries to send remove executor messages after the Yarn scheduler backend service stop, which is leading to the above exception. To avoid the above exception, 1) We may add a condition(which checks whether service has stopped or not) before sending executor remove message 2) Add a warn log message in onFailure case when the service is already stopped In this PR, chosen the 2) option which adds a log message in the case of onFailure without the exception stack trace since the option 1) would need to to go through for every remove executor message. ## How was this patch tested? I verified it manually, I don't see these exceptions with the PR changes. Author: Devaraj K Closes #19741 from devaraj-kavali/SPARK-14228. --- .../CoarseGrainedExecutorBackend.scala | 6 +--- .../CoarseGrainedSchedulerBackend.scala | 30 ++++++++----------- ...bernetesClusterSchedulerBackendSuite.scala | 6 ++-- .../cluster/YarnSchedulerBackend.scala | 20 ++++--------- 4 files changed, 22 insertions(+), 40 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index 4c1f92a1bcbf2..9b62e4b1b7150 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -165,11 +165,7 @@ private[spark] class CoarseGrainedExecutorBackend( } if (notifyDriver && driver.nonEmpty) { - driver.get.ask[Boolean]( - RemoveExecutor(executorId, new ExecutorLossReason(reason)) - ).failed.foreach(e => - logWarning(s"Unable to notify the driver due to " + e.getMessage, e) - )(ThreadUtils.sameThread) + driver.get.send(RemoveExecutor(executorId, new ExecutorLossReason(reason))) } System.exit(code) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 7bfb4d53c1834..e6982f333d521 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -95,6 +95,9 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // The num of current max ExecutorId used to re-register appMaster @volatile protected var currentExecutorIdCounter = 0 + private val reviveThread = + 
ThreadUtils.newDaemonSingleThreadScheduledExecutor("driver-revive-thread") + class DriverEndpoint(override val rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)]) extends ThreadSafeRpcEndpoint with Logging { @@ -103,9 +106,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp protected val addressToExecutorId = new HashMap[RpcAddress, String] - private val reviveThread = - ThreadUtils.newDaemonSingleThreadScheduledExecutor("driver-revive-thread") - override def onStart() { // Periodically revive offers to allow delay scheduling to work val reviveIntervalMs = conf.getTimeAsMs("spark.scheduler.revive.interval", "1s") @@ -154,6 +154,13 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp executorDataMap.values.foreach { ed => ed.executorEndpoint.send(UpdateDelegationTokens(newDelegationTokens)) } + + case RemoveExecutor(executorId, reason) => + // We will remove the executor's state and cannot restore it. However, the connection + // between the driver and the executor may be still alive so that the executor won't exit + // automatically, so try to tell the executor to stop itself. See SPARK-13519. + executorDataMap.get(executorId).foreach(_.executorEndpoint.send(StopExecutor)) + removeExecutor(executorId, reason) } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { @@ -215,14 +222,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } context.reply(true) - case RemoveExecutor(executorId, reason) => - // We will remove the executor's state and cannot restore it. However, the connection - // between the driver and the executor may be still alive so that the executor won't exit - // automatically, so try to tell the executor to stop itself. See SPARK-13519. - executorDataMap.get(executorId).foreach(_.executorEndpoint.send(StopExecutor)) - removeExecutor(executorId, reason) - context.reply(true) - case RemoveWorker(workerId, host, message) => removeWorker(workerId, host, message) context.reply(true) @@ -373,10 +372,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp shouldDisable } - - override def onStop() { - reviveThread.shutdownNow() - } } var driverEndpoint: RpcEndpointRef = null @@ -417,6 +412,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp } override def stop() { + reviveThread.shutdownNow() stopExecutors() try { if (driverEndpoint != null) { @@ -465,9 +461,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp * at once. */ protected def removeExecutor(executorId: String, reason: ExecutorLossReason): Unit = { - // Only log the failure since we don't care about the result. 
- driverEndpoint.ask[Boolean](RemoveExecutor(executorId, reason)).failed.foreach(t => - logError(t.getMessage, t))(ThreadUtils.sameThread) + driverEndpoint.send(RemoveExecutor(executorId, reason)) } protected def removeWorker(workerId: String, host: String, message: String): Unit = { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala index 3febb2f47cfd4..13c09033a50ee 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala @@ -282,7 +282,7 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn // No more deletion attempts of the executors. // This is graceful termination and should not be detected as a failure. verify(podOperations, times(1)).delete(resolvedPod) - verify(driverEndpointRef, times(1)).ask[Boolean]( + verify(driverEndpointRef, times(1)).send( RemoveExecutor("1", ExecutorExited( 0, exitCausedByApp = false, @@ -318,7 +318,7 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn requestExecutorRunnable.getValue.run() allocatorRunnable.getAllValues.asScala.last.run() verify(podOperations, never()).delete(firstResolvedPod) - verify(driverEndpointRef).ask[Boolean]( + verify(driverEndpointRef).send( RemoveExecutor("1", ExecutorExited( 1, exitCausedByApp = true, @@ -356,7 +356,7 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn val recreatedResolvedPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) allocatorRunnable.getValue.run() verify(podOperations).delete(firstResolvedPod) - verify(driverEndpointRef).ask[Boolean]( + verify(driverEndpointRef).send( RemoveExecutor("1", SlaveLost("Executor lost for unknown reasons."))) } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala index 415a29fd887e8..bb615c36cd97f 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala @@ -20,6 +20,7 @@ package org.apache.spark.scheduler.cluster import java.util.concurrent.atomic.{AtomicBoolean} import scala.concurrent.{ExecutionContext, Future} +import scala.concurrent.ExecutionContext.Implicits.global import scala.util.{Failure, Success} import scala.util.control.NonFatal @@ -245,14 +246,7 @@ private[spark] abstract class YarnSchedulerBackend( Future.successful(RemoveExecutor(executorId, SlaveLost("AM is not yet registered."))) } - removeExecutorMessage - .flatMap { message => - driverEndpoint.ask[Boolean](message) - }(ThreadUtils.sameThread) - .onFailure { - case NonFatal(e) => logError( - s"Error requesting driver to remove executor $executorId after disconnection.", e) - }(ThreadUtils.sameThread) + removeExecutorMessage.foreach { message => driverEndpoint.send(message) } } override def receive: PartialFunction[Any, Unit] = { @@ -265,12 +259,10 @@ private[spark] abstract class YarnSchedulerBackend( addWebUIFilter(filterName, filterParams, 
proxyBase) case r @ RemoveExecutor(executorId, reason) => - logWarning(reason.toString) - driverEndpoint.ask[Boolean](r).onFailure { - case e => - logError("Error requesting driver to remove executor" + - s" $executorId for reason $reason", e) - }(ThreadUtils.sameThread) + if (!stopped.get) { + logWarning(s"Requesting driver to remove executor $executorId for reason $reason") + driverEndpoint.send(r) + } } From effca9868e3feae16c5722c36878b23e616d01a2 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 6 Dec 2017 13:11:38 -0800 Subject: [PATCH 053/356] [SPARK-22720][SS] Make EventTimeWatermark Extend UnaryNode ## What changes were proposed in this pull request? Our Analyzer and Optimizer have multiple rules for `UnaryNode`. After making `EventTimeWatermark` extend `UnaryNode`, we do not need a special handling for `EventTimeWatermark`. ## How was this patch tested? The existing tests Author: gatorsmile Closes #19913 from gatorsmile/eventtimewatermark. --- .../spark/sql/catalyst/plans/logical/EventTimeWatermark.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala index 06196b5afb031..7a927e1e083b5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/EventTimeWatermark.scala @@ -38,7 +38,7 @@ object EventTimeWatermark { case class EventTimeWatermark( eventTime: Attribute, delay: CalendarInterval, - child: LogicalPlan) extends LogicalPlan { + child: LogicalPlan) extends UnaryNode { // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => @@ -60,6 +60,4 @@ case class EventTimeWatermark( a } } - - override val children: Seq[LogicalPlan] = child :: Nil } From 9948b860aca42bbc6478ddfbc0ff590adb00c2f3 Mon Sep 17 00:00:00 2001 From: smurakozi Date: Wed, 6 Dec 2017 13:22:08 -0800 Subject: [PATCH 054/356] [SPARK-22516][SQL] Bump up Univocity version to 2.5.9 ## What changes were proposed in this pull request? There was a bug in Univocity Parser that causes the issue in SPARK-22516. This was fixed by upgrading from 2.5.4 to 2.5.9 version of the library : **Executing** ``` spark.read.option("header","true").option("inferSchema", "true").option("multiLine", "true").option("comment", "g").csv("test_file_without_eof_char.csv").show() ``` **Before** ``` ERROR Executor: Exception in task 0.0 in stage 6.0 (TID 6) com.univocity.parsers.common.TextParsingException: java.lang.IllegalArgumentException - Unable to skip 1 lines from line 2. End of input reached ... Internal state when error was thrown: line=3, column=0, record=2, charIndex=31 at com.univocity.parsers.common.AbstractParser.handleException(AbstractParser.java:339) at com.univocity.parsers.common.AbstractParser.parseNext(AbstractParser.java:475) at org.apache.spark.sql.execution.datasources.csv.UnivocityParser$$anon$1.next(UnivocityParser.scala:281) at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) ``` **After** ``` +-------+-------+ |column1|column2| +-------+-------+ | abc| def| +-------+-------+ ``` ## How was this patch tested? The already existing `CSVSuite.commented lines in CSV data` test was extended to parse the file also in multiline mode. 
The test input file was modified to also include a comment in the last line. Author: smurakozi Closes #19906 from smurakozi/SPARK-22516. --- dev/deps/spark-deps-hadoop-2.6 | 2 +- dev/deps/spark-deps-hadoop-2.7 | 2 +- sql/core/pom.xml | 2 +- .../src/test/resources/test-data/comments.csv | 1 + .../execution/datasources/csv/CSVSuite.scala | 23 +++++++++++-------- 5 files changed, 17 insertions(+), 13 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 2c68b73095c4d..3b5a6945d8c4b 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -180,7 +180,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.5.4.jar +univocity-parsers-2.5.9.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 2aaac600b3ec3..64136baa01326 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -181,7 +181,7 @@ stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar super-csv-2.2.0.jar -univocity-parsers-2.5.4.jar +univocity-parsers-2.5.9.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 4db3fea008ee9..93010c606cf45 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -38,7 +38,7 @@ com.univocity univocity-parsers - 2.5.4 + 2.5.9 jar diff --git a/sql/core/src/test/resources/test-data/comments.csv b/sql/core/src/test/resources/test-data/comments.csv index 6275be7285b36..c0ace46db8c00 100644 --- a/sql/core/src/test/resources/test-data/comments.csv +++ b/sql/core/src/test/resources/test-data/comments.csv @@ -4,3 +4,4 @@ 6,7,8,9,0,2015-08-21 16:58:01 ~0,9,8,7,6,2015-08-22 17:59:02 1,2,3,4,5,2015-08-23 18:00:42 +~ comment in last line to test SPARK-22516 - do not add empty line at the end of this file! 
\ No newline at end of file diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index e439699605abb..4fe45420b4e77 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -483,18 +483,21 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } test("commented lines in CSV data") { - val results = spark.read - .format("csv") - .options(Map("comment" -> "~", "header" -> "false")) - .load(testFile(commentsFile)) - .collect() + Seq("false", "true").foreach { multiLine => - val expected = - Seq(Seq("1", "2", "3", "4", "5.01", "2015-08-20 15:57:00"), - Seq("6", "7", "8", "9", "0", "2015-08-21 16:58:01"), - Seq("1", "2", "3", "4", "5", "2015-08-23 18:00:42")) + val results = spark.read + .format("csv") + .options(Map("comment" -> "~", "header" -> "false", "multiLine" -> multiLine)) + .load(testFile(commentsFile)) + .collect() - assert(results.toSeq.map(_.toSeq) === expected) + val expected = + Seq(Seq("1", "2", "3", "4", "5.01", "2015-08-20 15:57:00"), + Seq("6", "7", "8", "9", "0", "2015-08-21 16:58:01"), + Seq("1", "2", "3", "4", "5", "2015-08-23 18:00:42")) + + assert(results.toSeq.map(_.toSeq) === expected) + } } test("inferring schema with commented lines in CSV data") { From f110a7f884cb09f01a20462038328ddc5662b46f Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 6 Dec 2017 14:12:16 -0800 Subject: [PATCH 055/356] [SPARK-22693][SQL] CreateNamedStruct and InSet should not use global variables ## What changes were proposed in this pull request? CreateNamedStruct and InSet are using a global variable which is not needed. This can generate some unneeded entries in the constant pool. The PR removes the unnecessary mutable states and makes them local variables. ## How was this patch tested? added UT Author: Marco Gaido Author: Marco Gaido Closes #19896 from mgaido91/SPARK-22693. 
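For readers who want to reproduce the check locally, the sketch below shows the verification pattern the new unit tests rely on: generate code for an expression and assert that no global mutable state was registered on the `CodegenContext` (each such state becomes a class-level field and adds constant pool entries). The object name `NoGlobalStateCheck` and its standalone `main` are illustrative only and not part of this patch; the real tests live in `ComplexTypeSuite` and `PredicateSuite` in the diff below, and the sketch assumes the catalyst internals it touches (`CodegenContext`, `mutableStates`) are accessible from your code, as they are from Spark's own test suites.

```
import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, Expression, InSet, Literal}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext

// Illustrative sketch only -- not part of this patch.
object NoGlobalStateCheck {
  // Count the class-level mutable states an expression registers during codegen.
  private def mutableStateCount(expr: Expression): Int = {
    val ctx = new CodegenContext
    expr.genCode(ctx)          // triggers doGenCode for the expression
    ctx.mutableStates.size     // each entry becomes a field in the generated class
  }

  def main(args: Array[String]): Unit = {
    // After this patch, neither expression should register any global variable.
    assert(mutableStateCount(CreateNamedStruct(Seq(Literal("a"), Literal("x"), Literal("b"), Literal(2.0)))) == 0)
    assert(mutableStateCount(InSet(Literal(1), Set(1, 2, 3, 4))) == 0)
  }
}
```

The same `ctx.mutableStates` check is what the SPARK-22704 and SPARK-22695 tests above use, so the pattern generalizes to any expression whose codegen is being trimmed of global variables.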
--- .../expressions/complexTypeCreator.scala | 27 ++++++++++--------- .../sql/catalyst/expressions/predicates.scala | 22 +++++++-------- .../expressions/ComplexTypeSuite.scala | 7 +++++ .../catalyst/expressions/PredicateSuite.scala | 7 +++++ 4 files changed, 40 insertions(+), 23 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala index 087b21043b309..3dc2ee03a86e3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala @@ -356,22 +356,25 @@ case class CreateNamedStruct(children: Seq[Expression]) extends CreateNamedStruc override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val rowClass = classOf[GenericInternalRow].getName val values = ctx.freshName("values") - ctx.addMutableState("Object[]", values, s"$values = null;") + val valCodes = valExprs.zipWithIndex.map { case (e, i) => + val eval = e.genCode(ctx) + s""" + |${eval.code} + |if (${eval.isNull}) { + | $values[$i] = null; + |} else { + | $values[$i] = ${eval.value}; + |} + """.stripMargin + } val valuesCode = ctx.splitExpressionsWithCurrentInputs( - valExprs.zipWithIndex.map { case (e, i) => - val eval = e.genCode(ctx) - s""" - ${eval.code} - if (${eval.isNull}) { - $values[$i] = null; - } else { - $values[$i] = ${eval.value}; - }""" - }) + expressions = valCodes, + funcName = "createNamedStruct", + extraArguments = "Object[]" -> values :: Nil) ev.copy(code = s""" - |$values = new Object[${valExprs.size}]; + |Object[] $values = new Object[${valExprs.size}]; |$valuesCode |final InternalRow ${ev.value} = new $rowClass($values); |$values = null; diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 04e669492ec6d..a42dd7ecf57de 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -344,17 +344,17 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with } else { "" } - ctx.addMutableState(setName, setTerm, - s"$setTerm = (($InSetName)references[${ctx.references.size - 1}]).getSet();") - ev.copy(code = s""" - ${childGen.code} - boolean ${ev.isNull} = ${childGen.isNull}; - boolean ${ev.value} = false; - if (!${ev.isNull}) { - ${ev.value} = $setTerm.contains(${childGen.value}); - $setNull - } - """) + ev.copy(code = + s""" + |${childGen.code} + |${ctx.JAVA_BOOLEAN} ${ev.isNull} = ${childGen.isNull}; + |${ctx.JAVA_BOOLEAN} ${ev.value} = false; + |if (!${ev.isNull}) { + | $setName $setTerm = (($InSetName)references[${ctx.references.size - 1}]).getSet(); + | ${ev.value} = $setTerm.contains(${childGen.value}); + | $setNull + |} + """.stripMargin) } override def sql: String = { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala index b0eaad1c80f89..6dfca7d73a3df 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala @@ 
-20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.UnresolvedExtractValue import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -299,4 +300,10 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper { new StringToMap(Literal("a=1_b=2_c=3"), Literal("_"), NonFoldableLiteral("=")) .checkInputDataTypes().isFailure) } + + test("SPARK-22693: CreateNamedStruct should not use global variables") { + val ctx = new CodegenContext + CreateNamedStruct(Seq("a", "x", "b", 2.0)).genCode(ctx) + assert(ctx.mutableStates.isEmpty) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 0079e4e8d6f74..95a0dfa057563 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -25,6 +25,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExamplePointUDT +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData} import org.apache.spark.sql.types._ @@ -429,4 +430,10 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { val infinity = Literal(Double.PositiveInfinity) checkEvaluation(EqualTo(infinity, infinity), true) } + + test("SPARK-22693: InSet should not use global variables") { + val ctx = new CodegenContext + InSet(Literal(1), Set(1, 2, 3, 4)).genCode(ctx) + assert(ctx.mutableStates.isEmpty) + } } From 8ae004b4602266d1f210e4c1564246d590412c06 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 6 Dec 2017 16:15:25 -0800 Subject: [PATCH 056/356] [SPARK-22688][SQL] Upgrade Janino version to 3.0.8 ## What changes were proposed in this pull request? This PR upgrade Janino version to 3.0.8. [Janino 3.0.8](https://janino-compiler.github.io/janino/changelog.html) includes an important fix to reduce the number of constant pool entries by using 'sipush' java bytecode. * SIPUSH bytecode is not used for short integer constant [#33](https://github.com/janino-compiler/janino/issues/33). Please see detail in [this discussion thread](https://github.com/apache/spark/pull/19518#issuecomment-346674976). ## How was this patch tested? Existing tests Author: Kazuaki Ishizaki Closes #19890 from kiszk/SPARK-22688. 
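As a quick illustration of the API rename that comes with this upgrade, the standalone sketch below (not Spark code; everything outside the Janino calls is made up for the example) compiles a small class body directly with Janino 3.0.8 and catches `InternalCompilerException`, which replaces the `JaninoRuntimeException` previously handled in `CodeGenerator` and `SparkPlan`:

```
import org.codehaus.commons.compiler.CompileException
import org.codehaus.janino.{ClassBodyEvaluator, InternalCompilerException}

// Illustrative sketch only -- exercises the Janino 3.0.8 API surface touched below.
object JaninoCompileSketch {
  def main(args: Array[String]): Unit = {
    val evaluator = new ClassBodyEvaluator()
    val classBody =
      """
        |public int answer() {
        |  // 20000 fits in a short, so Janino 3.0.8 can emit SIPUSH for it instead of
        |  // an LDC that would consume a constant pool entry (janino issue #33).
        |  return 20000;
        |}
      """.stripMargin
    try {
      evaluator.cook("generated.java", classBody)
      println("compiled " + evaluator.getClazz.getName)
    } catch {
      case e: InternalCompilerException => println("Janino internal error: " + e)
      case e: CompileException => println("failed to compile: " + e)
    }
  }
}
```

With the 3.0.7 artifacts on the classpath, the same catch clause would have named `JaninoRuntimeException` instead, which is exactly the substitution made in the diff below.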
--- dev/deps/spark-deps-hadoop-2.6 | 4 ++-- dev/deps/spark-deps-hadoop-2.7 | 4 ++-- pom.xml | 2 +- .../sql/catalyst/expressions/codegen/CodeGenerator.scala | 6 +++--- .../scala/org/apache/spark/sql/execution/SparkPlan.scala | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 3b5a6945d8c4b..1831f3378e852 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -35,7 +35,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-3.0.7.jar +commons-compiler-3.0.8.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar @@ -96,7 +96,7 @@ jackson-mapper-asl-1.9.13.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar -janino-3.0.7.jar +janino-3.0.8.jar java-xmlbuilder-1.1.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 64136baa01326..fe14c05987327 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -35,7 +35,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-3.0.7.jar +commons-compiler-3.0.8.jar commons-compress-1.4.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar @@ -96,7 +96,7 @@ jackson-mapper-asl-1.9.13.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar -janino-3.0.7.jar +janino-3.0.8.jar java-xmlbuilder-1.1.jar javassist-3.18.1-GA.jar javax.annotation-api-1.2.jar diff --git a/pom.xml b/pom.xml index 07bca9d267da0..52db79eaf036b 100644 --- a/pom.xml +++ b/pom.xml @@ -170,7 +170,7 @@ 3.5 3.2.10 - 3.0.7 + 3.0.8 2.22.2 2.9.3 3.5.2 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 670c82eff9286..5c9e604a8d293 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -29,7 +29,7 @@ import scala.util.control.NonFatal import com.google.common.cache.{CacheBuilder, CacheLoader} import com.google.common.util.concurrent.{ExecutionError, UncheckedExecutionException} import org.codehaus.commons.compiler.CompileException -import org.codehaus.janino.{ByteArrayClassLoader, ClassBodyEvaluator, JaninoRuntimeException, SimpleCompiler} +import org.codehaus.janino.{ByteArrayClassLoader, ClassBodyEvaluator, InternalCompilerException, SimpleCompiler} import org.codehaus.janino.util.ClassFile import org.apache.spark.{SparkEnv, TaskContext, TaskKilledException} @@ -1240,12 +1240,12 @@ object CodeGenerator extends Logging { evaluator.cook("generated.java", code.body) updateAndGetCompilationStats(evaluator) } catch { - case e: JaninoRuntimeException => + case e: InternalCompilerException => val msg = s"failed to compile: $e" logError(msg, e) val maxLines = SQLConf.get.loggingMaxLinesForCodegen logInfo(s"\n${CodeFormatter.format(code, maxLines)}") - throw new JaninoRuntimeException(msg, e) + throw new InternalCompilerException(msg, e) case e: CompileException => val msg = s"failed to compile: $e" logError(msg, e) diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 657b265260135..787c1cfbfb3d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -23,7 +23,7 @@ import scala.collection.mutable.ArrayBuffer import scala.concurrent.ExecutionContext import org.codehaus.commons.compiler.CompileException -import org.codehaus.janino.JaninoRuntimeException +import org.codehaus.janino.InternalCompilerException import org.apache.spark.{broadcast, SparkEnv} import org.apache.spark.internal.Logging @@ -385,7 +385,7 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ try { GeneratePredicate.generate(expression, inputSchema) } catch { - case _ @ (_: JaninoRuntimeException | _: CompileException) if codeGenFallBack => + case _ @ (_: InternalCompilerException | _: CompileException) if codeGenFallBack => genInterpretedPredicate(expression, inputSchema) } } From d32337b1effd5359e1ce7e46893767c908d4b16a Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Thu, 7 Dec 2017 13:05:59 +0100 Subject: [PATCH 057/356] [SPARK-22721] BytesToBytesMap peak memory usage not accurate after reset() ## What changes were proposed in this pull request? BytesToBytesMap doesn't update peak memory usage before shrinking back to initial capacity in reset(), so after a disk spill one never knows what was the size of hash table was before spilling. ## How was this patch tested? Checked manually. Author: Juliusz Sompolski Closes #19915 from juliuszsompolski/SPARK-22721. --- .../main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 4fadfe36cd716..7fdcf22c45f73 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -879,6 +879,7 @@ public LongArray getArray() { * Reset this map to initialized state. */ public void reset() { + updatePeakMemoryUsed(); numKeys = 0; numValues = 0; freeArray(longArray); From c1e5688d1a44c152d27dcb9a04da22993d7ab826 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 7 Dec 2017 20:42:46 +0800 Subject: [PATCH 058/356] [SPARK-22672][SQL][TEST] Refactor ORC Tests ## What changes were proposed in this pull request? Since SPARK-20682, we have two `OrcFileFormat`s. This PR refactors ORC tests with three principles (with a few exceptions) 1. Move test suite into `sql/core`. 2. Create `HiveXXX` test suite in `sql/hive` by reusing `sql/core` test suite. 3. `OrcTest` will provide common helper functions and `val orcImp: String`. **Test Suites** *Native OrcFileFormat* - org.apache.spark.sql.hive.orc - OrcFilterSuite - OrcPartitionDiscoverySuite - OrcQuerySuite - OrcSourceSuite - o.a.s.sql.hive.orc - OrcHadoopFsRelationSuite *Hive built-in OrcFileFormat* - o.a.s.sql.hive.orc - HiveOrcFilterSuite - HiveOrcPartitionDiscoverySuite - HiveOrcQuerySuite - HiveOrcSourceSuite - HiveOrcHadoopFsRelationSuite **Hierarchy** ``` OrcTest -> OrcSuite -> OrcSourceSuite -> OrcQueryTest -> OrcQuerySuite -> OrcPartitionDiscoveryTest -> OrcPartitionDiscoverySuite -> OrcFilterSuite HadoopFsRelationTest -> OrcHadoopFsRelationSuite -> HiveOrcHadoopFsRelationSuite ``` Please note the followings. 
- Unlike the other test suites, `OrcHadoopFsRelationSuite` doesn't inherit `OrcTest`. It is inside `sql/hive` like `ParquetHadoopFsRelationSuite` due to the dependencies and follows the existing convention to use `val dataSourceName: String` - `OrcFilterSuite`s cannot reuse test cases due to the different function signatures using Hive 1.2.1 ORC classes and Apache ORC 1.4.1 classes. ## How was this patch tested? Pass the Jenkins tests with reorganized test suites. Author: Dongjoon Hyun Closes #19882 from dongjoon-hyun/SPARK-22672. --- .../org/apache/spark/sql/SQLQuerySuite.scala | 28 -- .../datasources}/orc/OrcFilterSuite.scala | 74 ++-- .../orc/OrcPartitionDiscoverySuite.scala | 48 +-- .../datasources}/orc/OrcQuerySuite.scala | 350 ++++++++-------- .../datasources}/orc/OrcSourceSuite.scala | 137 ++----- .../execution/datasources}/orc/OrcTest.scala | 37 +- .../sql/hive/orc/HiveOrcFilterSuite.scala | 387 ++++++++++++++++++ .../orc/HiveOrcPartitionDiscoverySuite.scala | 25 ++ .../sql/hive/orc/HiveOrcQuerySuite.scala | 165 ++++++++ .../sql/hive/orc/HiveOrcSourceSuite.scala | 107 +++++ .../hive/orc/OrcHadoopFsRelationSuite.scala | 8 +- 11 files changed, 971 insertions(+), 395 deletions(-) rename sql/{hive/src/test/scala/org/apache/spark/sql/hive => core/src/test/scala/org/apache/spark/sql/execution/datasources}/orc/OrcFilterSuite.scala (87%) rename sql/{hive/src/test/scala/org/apache/spark/sql/hive => core/src/test/scala/org/apache/spark/sql/execution/datasources}/orc/OrcPartitionDiscoverySuite.scala (82%) rename sql/{hive/src/test/scala/org/apache/spark/sql/hive => core/src/test/scala/org/apache/spark/sql/execution/datasources}/orc/OrcQuerySuite.scala (68%) rename sql/{hive/src/test/scala/org/apache/spark/sql/hive => core/src/test/scala/org/apache/spark/sql/execution/datasources}/orc/OrcSourceSuite.scala (63%) rename sql/{hive/src/test/scala/org/apache/spark/sql/hive => core/src/test/scala/org/apache/spark/sql/execution/datasources}/orc/OrcTest.scala (72%) create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcFilterSuite.scala create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcPartitionDiscoverySuite.scala create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 8ddddbeee598f..5e077285ade55 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -2775,32 +2775,4 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } } - - test("SPARK-21791 ORC should support column names with dot") { - val orc = classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName - withTempDir { dir => - val path = new File(dir, "orc").getCanonicalPath - Seq(Some(1), None).toDF("col.dots").write.format(orc).save(path) - assert(spark.read.format(orc).load(path).collect().length == 2) - } - } - - test("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") { - withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "hive") { - val e = intercept[AnalysisException] { - sql("CREATE TABLE spark_20728(a INT) USING ORC") - } - assert(e.message.contains("Hive built-in ORC data source must be used with Hive support")) - } - - 
withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "native") { - withTable("spark_20728") { - sql("CREATE TABLE spark_20728(a INT) USING ORC") - val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst { - case l: LogicalRelation => l.relation.asInstanceOf[HadoopFsRelation].fileFormat.getClass - } - assert(fileFormat == Some(classOf[OrcFileFormat])) - } - } - } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala similarity index 87% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala index de6f0d67f1734..a5f6b68ee862e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala @@ -15,25 +15,32 @@ * limitations under the License. */ -package org.apache.spark.sql.hive.orc +package org.apache.spark.sql.execution.datasources.orc import java.nio.charset.StandardCharsets import java.sql.{Date, Timestamp} import scala.collection.JavaConverters._ -import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument} +import org.apache.orc.storage.ql.io.sarg.{PredicateLeaf, SearchArgument} -import org.apache.spark.sql.{Column, DataFrame, QueryTest} +import org.apache.spark.sql.{Column, DataFrame} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types._ /** - * A test suite that tests ORC filter API based filter pushdown optimization. + * A test suite that tests Apache ORC filter API based filter pushdown optimization. + * OrcFilterSuite and HiveOrcFilterSuite is logically duplicated to provide the same test coverage. + * The difference are the packages containing 'Predicate' and 'SearchArgument' classes. + * - OrcFilterSuite uses 'org.apache.orc.storage.ql.io.sarg' package. + * - HiveOrcFilterSuite uses 'org.apache.hadoop.hive.ql.io.sarg' package. 
*/ -class OrcFilterSuite extends QueryTest with OrcTest { +class OrcFilterSuite extends OrcTest with SharedSQLContext { + private def checkFilterPredicate( df: DataFrame, predicate: Predicate, @@ -55,7 +62,7 @@ class OrcFilterSuite extends QueryTest with OrcTest { DataSourceStrategy.selectFilters(maybeRelation.get, maybeAnalyzedPredicate.toSeq) assert(selectedFilters.nonEmpty, "No filter is pushed down") - val maybeFilter = OrcFilters.createFilter(query.schema, selectedFilters.toArray) + val maybeFilter = OrcFilters.createFilter(query.schema, selectedFilters) assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $selectedFilters") checker(maybeFilter.get) } @@ -99,7 +106,7 @@ class OrcFilterSuite extends QueryTest with OrcTest { DataSourceStrategy.selectFilters(maybeRelation.get, maybeAnalyzedPredicate.toSeq) assert(selectedFilters.nonEmpty, "No filter is pushed down") - val maybeFilter = OrcFilters.createFilter(query.schema, selectedFilters.toArray) + val maybeFilter = OrcFilters.createFilter(query.schema, selectedFilters) assert(maybeFilter.isEmpty, s"Could generate filter predicate for $selectedFilters") } @@ -284,40 +291,27 @@ class OrcFilterSuite extends QueryTest with OrcTest { test("filter pushdown - combinations with logical operators") { withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => - // Because `ExpressionTree` is not accessible at Hive 1.2.x, this should be checked - // in string form in order to check filter creation including logical operators - // such as `and`, `or` or `not`. So, this function uses `SearchArgument.toString()` - // to produce string expression and then compare it to given string expression below. - // This might have to be changed after Hive version is upgraded. checkFilterPredicate( '_1.isNotNull, - """leaf-0 = (IS_NULL _1) - |expr = (not leaf-0)""".stripMargin.trim + "leaf-0 = (IS_NULL _1), expr = (not leaf-0)" ) checkFilterPredicate( '_1 =!= 1, - """leaf-0 = (IS_NULL _1) - |leaf-1 = (EQUALS _1 1) - |expr = (and (not leaf-0) (not leaf-1))""".stripMargin.trim + "leaf-0 = (IS_NULL _1), leaf-1 = (EQUALS _1 1), expr = (and (not leaf-0) (not leaf-1))" ) checkFilterPredicate( !('_1 < 4), - """leaf-0 = (IS_NULL _1) - |leaf-1 = (LESS_THAN _1 4) - |expr = (and (not leaf-0) (not leaf-1))""".stripMargin.trim + "leaf-0 = (IS_NULL _1), leaf-1 = (LESS_THAN _1 4), expr = (and (not leaf-0) (not leaf-1))" ) checkFilterPredicate( '_1 < 2 || '_1 > 3, - """leaf-0 = (LESS_THAN _1 2) - |leaf-1 = (LESS_THAN_EQUALS _1 3) - |expr = (or leaf-0 (not leaf-1))""".stripMargin.trim + "leaf-0 = (LESS_THAN _1 2), leaf-1 = (LESS_THAN_EQUALS _1 3), " + + "expr = (or leaf-0 (not leaf-1))" ) checkFilterPredicate( '_1 < 2 && '_1 > 3, - """leaf-0 = (IS_NULL _1) - |leaf-1 = (LESS_THAN _1 2) - |leaf-2 = (LESS_THAN_EQUALS _1 3) - |expr = (and (not leaf-0) leaf-1 (not leaf-2))""".stripMargin.trim + "leaf-0 = (IS_NULL _1), leaf-1 = (LESS_THAN _1 2), leaf-2 = (LESS_THAN_EQUALS _1 3), " + + "expr = (and (not leaf-0) leaf-1 (not leaf-2))" ) } } @@ -344,4 +338,30 @@ class OrcFilterSuite extends QueryTest with OrcTest { checkNoFilterPredicate('_1.isNotNull) } } + + test("SPARK-12218 Converting conjunctions into ORC SearchArguments") { + import org.apache.spark.sql.sources._ + // The `LessThan` should be converted while the `StringContains` shouldn't + val schema = new StructType( + Array( + StructField("a", IntegerType, nullable = true), + StructField("b", StringType, nullable = true))) + assertResult("leaf-0 = (LESS_THAN a 10), expr = leaf-0") { + 
OrcFilters.createFilter(schema, Array( + LessThan("a", 10), + StringContains("b", "prefix") + )).get.toString + } + + // The `LessThan` should be converted while the whole inner `And` shouldn't + assertResult("leaf-0 = (LESS_THAN a 10), expr = leaf-0") { + OrcFilters.createFilter(schema, Array( + LessThan("a", 10), + Not(And( + GreaterThan("a", 1), + StringContains("b", "prefix") + )) + )).get.toString + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcPartitionDiscoverySuite.scala similarity index 82% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcPartitionDiscoverySuite.scala index d1ce3f1e2f058..d1911ea7f32a9 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcPartitionDiscoverySuite.scala @@ -15,19 +15,12 @@ * limitations under the License. */ -package org.apache.spark.sql.hive.orc +package org.apache.spark.sql.execution.datasources.orc import java.io.File -import scala.reflect.ClassTag -import scala.reflect.runtime.universe.TypeTag - -import org.apache.hadoop.hive.conf.HiveConf.ConfVars -import org.scalatest.BeforeAndAfterAll - import org.apache.spark.sql._ -import org.apache.spark.sql.hive.test.TestHiveSingleton -import org.apache.spark.util.Utils +import org.apache.spark.sql.test.SharedSQLContext // The data where the partitioning key exists only in the directory structure. case class OrcParData(intField: Int, stringField: String) @@ -35,28 +28,8 @@ case class OrcParData(intField: Int, stringField: String) // The data that also includes the partitioning key case class OrcParDataWithKey(intField: Int, pi: Int, stringField: String, ps: String) -// TODO This test suite duplicates ParquetPartitionDiscoverySuite a lot -class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll { - import spark._ - import spark.implicits._ - - val defaultPartitionName = ConfVars.DEFAULTPARTITIONNAME.defaultStrVal - - def withTempDir(f: File => Unit): Unit = { - val dir = Utils.createTempDir().getCanonicalFile - try f(dir) finally Utils.deleteRecursively(dir) - } - - def makeOrcFile[T <: Product: ClassTag: TypeTag]( - data: Seq[T], path: File): Unit = { - data.toDF().write.mode("overwrite").orc(path.getCanonicalPath) - } - - - def makeOrcFile[T <: Product: ClassTag: TypeTag]( - df: DataFrame, path: File): Unit = { - df.write.mode("overwrite").orc(path.getCanonicalPath) - } +abstract class OrcPartitionDiscoveryTest extends OrcTest { + val defaultPartitionName = "__HIVE_DEFAULT_PARTITION__" protected def withTempTable(tableName: String)(f: => Unit): Unit = { try f finally spark.catalog.dropTempView(tableName) @@ -90,7 +63,7 @@ class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with B makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps)) } - read.orc(base.getCanonicalPath).createOrReplaceTempView("t") + spark.read.orc(base.getCanonicalPath).createOrReplaceTempView("t") withTempTable("t") { checkAnswer( @@ -137,7 +110,7 @@ class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with B makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps)) } - read.orc(base.getCanonicalPath).createOrReplaceTempView("t") + 
spark.read.orc(base.getCanonicalPath).createOrReplaceTempView("t") withTempTable("t") { checkAnswer( @@ -186,8 +159,8 @@ class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with B makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps)) } - read - .option(ConfVars.DEFAULTPARTITIONNAME.varname, defaultPartitionName) + spark.read + .option("hive.exec.default.partition.name", defaultPartitionName) .orc(base.getCanonicalPath) .createOrReplaceTempView("t") @@ -228,8 +201,8 @@ class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with B makePartitionDir(base, defaultPartitionName, "pi" -> pi, "ps" -> ps)) } - read - .option(ConfVars.DEFAULTPARTITIONNAME.varname, defaultPartitionName) + spark.read + .option("hive.exec.default.partition.name", defaultPartitionName) .orc(base.getCanonicalPath) .createOrReplaceTempView("t") @@ -253,3 +226,4 @@ class OrcPartitionDiscoverySuite extends QueryTest with TestHiveSingleton with B } } +class OrcPartitionDiscoverySuite extends OrcPartitionDiscoveryTest with SharedSQLContext diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala similarity index 68% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala index 1ffaf30311037..e00e057a18cc6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala @@ -15,24 +15,27 @@ * limitations under the License. */ -package org.apache.spark.sql.hive.orc +package org.apache.spark.sql.execution.datasources.orc +import java.io.File import java.nio.charset.StandardCharsets import java.sql.Timestamp import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hive.ql.io.orc.{OrcStruct, SparkOrcNewRecordReader} +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType} +import org.apache.hadoop.mapreduce.lib.input.FileSplit +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl +import org.apache.orc.{OrcConf, OrcFile} import org.apache.orc.OrcConf.COMPRESS -import org.scalatest.BeforeAndAfterAll +import org.apache.orc.mapred.OrcStruct +import org.apache.orc.mapreduce.OrcInputFormat import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, RecordReaderIterator} -import org.apache.spark.sql.hive.HiveUtils -import org.apache.spark.sql.hive.test.TestHive._ -import org.apache.spark.sql.hive.test.TestHive.implicits._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{IntegerType, StructType} import org.apache.spark.util.Utils @@ -57,7 +60,8 @@ case class Contact(name: String, phone: String) case class Person(name: String, age: Int, contacts: Seq[Contact]) -class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { +abstract class OrcQueryTest extends OrcTest { + import testImplicits._ test("Read/write All Types") { val data = (0 to 255).map { i => @@ -73,7 +77,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { test("Read/write binary data") 
{ withOrcFile(BinaryData("test".getBytes(StandardCharsets.UTF_8)) :: Nil) { file => - val bytes = read.orc(file).head().getAs[Array[Byte]](0) + val bytes = spark.read.orc(file).head().getAs[Array[Byte]](0) assert(new String(bytes, StandardCharsets.UTF_8) === "test") } } @@ -91,7 +95,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { withOrcFile(data) { file => checkAnswer( - read.orc(file), + spark.read.orc(file), data.toDF().collect()) } } @@ -172,7 +176,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { withOrcFile(data) { file => checkAnswer( - read.orc(file), + spark.read.orc(file), Row(Seq.fill(5)(null): _*)) } } @@ -183,9 +187,13 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { spark.range(0, 10).write .option(COMPRESS.getAttribute, "ZLIB") .orc(file.getCanonicalPath) - val expectedCompressionKind = - OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression - assert("ZLIB" === expectedCompressionKind.name()) + + val maybeOrcFile = file.listFiles().find(_.getName.endsWith(".zlib.orc")) + assert(maybeOrcFile.isDefined) + + val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) + val conf = OrcFile.readerOptions(new Configuration()) + assert("ZLIB" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) } // `compression` overrides `orc.compress`. @@ -194,9 +202,13 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { .option("compression", "ZLIB") .option(COMPRESS.getAttribute, "SNAPPY") .orc(file.getCanonicalPath) - val expectedCompressionKind = - OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression - assert("ZLIB" === expectedCompressionKind.name()) + + val maybeOrcFile = file.listFiles().find(_.getName.endsWith(".zlib.orc")) + assert(maybeOrcFile.isDefined) + + val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) + val conf = OrcFile.readerOptions(new Configuration()) + assert("ZLIB" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) } } @@ -206,39 +218,39 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { spark.range(0, 10).write .option("compression", "ZLIB") .orc(file.getCanonicalPath) - val expectedCompressionKind = - OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression - assert("ZLIB" === expectedCompressionKind.name()) + + val maybeOrcFile = file.listFiles().find(_.getName.endsWith(".zlib.orc")) + assert(maybeOrcFile.isDefined) + + val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) + val conf = OrcFile.readerOptions(new Configuration()) + assert("ZLIB" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) } withTempPath { file => spark.range(0, 10).write .option("compression", "SNAPPY") .orc(file.getCanonicalPath) - val expectedCompressionKind = - OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression - assert("SNAPPY" === expectedCompressionKind.name()) + + val maybeOrcFile = file.listFiles().find(_.getName.endsWith(".snappy.orc")) + assert(maybeOrcFile.isDefined) + + val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) + val conf = OrcFile.readerOptions(new Configuration()) + assert("SNAPPY" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) } withTempPath { file => spark.range(0, 10).write .option("compression", "NONE") .orc(file.getCanonicalPath) - val expectedCompressionKind = - OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression - assert("NONE" === 
expectedCompressionKind.name()) - } - } - // Following codec is not supported in Hive 1.2.1, ignore it now - ignore("LZO compression options for writing to an ORC file not supported in Hive 1.2.1") { - withTempPath { file => - spark.range(0, 10).write - .option("compression", "LZO") - .orc(file.getCanonicalPath) - val expectedCompressionKind = - OrcFileOperator.getFileReader(file.getCanonicalPath).get.getCompression - assert("LZO" === expectedCompressionKind.name()) + val maybeOrcFile = file.listFiles().find(_.getName.endsWith(".orc")) + assert(maybeOrcFile.isDefined) + + val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) + val conf = OrcFile.readerOptions(new Configuration()) + assert("NONE" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) } } @@ -256,22 +268,28 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { test("appending") { val data = (0 until 10).map(i => (i, i.toString)) - createDataFrame(data).toDF("c1", "c2").createOrReplaceTempView("tmp") + spark.createDataFrame(data).toDF("c1", "c2").createOrReplaceTempView("tmp") withOrcTable(data, "t") { sql("INSERT INTO TABLE t SELECT * FROM tmp") - checkAnswer(table("t"), (data ++ data).map(Row.fromTuple)) + checkAnswer(spark.table("t"), (data ++ data).map(Row.fromTuple)) } - sessionState.catalog.dropTable(TableIdentifier("tmp"), ignoreIfNotExists = true, purge = false) + spark.sessionState.catalog.dropTable( + TableIdentifier("tmp"), + ignoreIfNotExists = true, + purge = false) } test("overwriting") { val data = (0 until 10).map(i => (i, i.toString)) - createDataFrame(data).toDF("c1", "c2").createOrReplaceTempView("tmp") + spark.createDataFrame(data).toDF("c1", "c2").createOrReplaceTempView("tmp") withOrcTable(data, "t") { sql("INSERT OVERWRITE TABLE t SELECT * FROM tmp") - checkAnswer(table("t"), data.map(Row.fromTuple)) + checkAnswer(spark.table("t"), data.map(Row.fromTuple)) } - sessionState.catalog.dropTable(TableIdentifier("tmp"), ignoreIfNotExists = true, purge = false) + spark.sessionState.catalog.dropTable( + TableIdentifier("tmp"), + ignoreIfNotExists = true, + purge = false) } test("self-join") { @@ -334,60 +352,16 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { withTempPath { dir => val path = dir.getCanonicalPath - spark.range(0, 10).select('id as "Acol").write.format("orc").save(path) - spark.read.format("orc").load(path).schema("Acol") + spark.range(0, 10).select('id as "Acol").write.orc(path) + spark.read.orc(path).schema("Acol") intercept[IllegalArgumentException] { - spark.read.format("orc").load(path).schema("acol") + spark.read.orc(path).schema("acol") } - checkAnswer(spark.read.format("orc").load(path).select("acol").sort("acol"), + checkAnswer(spark.read.orc(path).select("acol").sort("acol"), (0 until 10).map(Row(_))) } } - test("SPARK-8501: Avoids discovery schema from empty ORC files") { - withTempPath { dir => - val path = dir.getCanonicalPath - - withTable("empty_orc") { - withTempView("empty", "single") { - spark.sql( - s"""CREATE TABLE empty_orc(key INT, value STRING) - |STORED AS ORC - |LOCATION '${dir.toURI}' - """.stripMargin) - - val emptyDF = Seq.empty[(Int, String)].toDF("key", "value").coalesce(1) - emptyDF.createOrReplaceTempView("empty") - - // This creates 1 empty ORC file with Hive ORC SerDe. We are using this trick because - // Spark SQL ORC data source always avoids write empty ORC files. 
- spark.sql( - s"""INSERT INTO TABLE empty_orc - |SELECT key, value FROM empty - """.stripMargin) - - val errorMessage = intercept[AnalysisException] { - spark.read.orc(path) - }.getMessage - - assert(errorMessage.contains("Unable to infer schema for ORC")) - - val singleRowDF = Seq((0, "foo")).toDF("key", "value").coalesce(1) - singleRowDF.createOrReplaceTempView("single") - - spark.sql( - s"""INSERT INTO TABLE empty_orc - |SELECT key, value FROM single - """.stripMargin) - - val df = spark.read.orc(path) - assert(df.schema === singleRowDF.schema.asNullable) - checkAnswer(df, singleRowDF) - } - } - } - } - test("SPARK-10623 Enable ORC PPD") { withTempPath { dir => withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { @@ -405,7 +379,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } // It needs to repartition data so that we can have several ORC files // in order to skip stripes in ORC. - createDataFrame(data).toDF("a", "b").repartition(10).write.orc(path) + spark.createDataFrame(data).toDF("a", "b").repartition(10).write.orc(path) val df = spark.read.orc(path) def checkPredicate(pred: Column, answer: Seq[Row]): Unit = { @@ -440,77 +414,6 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } - test("Verify the ORC conversion parameter: CONVERT_METASTORE_ORC") { - withTempView("single") { - val singleRowDF = Seq((0, "foo")).toDF("key", "value") - singleRowDF.createOrReplaceTempView("single") - - Seq("true", "false").foreach { orcConversion => - withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> orcConversion) { - withTable("dummy_orc") { - withTempPath { dir => - val path = dir.getCanonicalPath - spark.sql( - s""" - |CREATE TABLE dummy_orc(key INT, value STRING) - |STORED AS ORC - |LOCATION '${dir.toURI}' - """.stripMargin) - - spark.sql( - s""" - |INSERT INTO TABLE dummy_orc - |SELECT key, value FROM single - """.stripMargin) - - val df = spark.sql("SELECT * FROM dummy_orc WHERE key=0") - checkAnswer(df, singleRowDF) - - val queryExecution = df.queryExecution - if (orcConversion == "true") { - queryExecution.analyzed.collectFirst { - case _: LogicalRelation => () - }.getOrElse { - fail(s"Expecting the query plan to convert orc to data sources, " + - s"but got:\n$queryExecution") - } - } else { - queryExecution.analyzed.collectFirst { - case _: HiveTableRelation => () - }.getOrElse { - fail(s"Expecting no conversion from orc to data sources, " + - s"but got:\n$queryExecution") - } - } - } - } - } - } - } - } - - test("converted ORC table supports resolving mixed case field") { - withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { - withTable("dummy_orc") { - withTempPath { dir => - val df = spark.range(5).selectExpr("id", "id as valueField", "id as partitionValue") - df.write - .partitionBy("partitionValue") - .mode("overwrite") - .orc(dir.getAbsolutePath) - - spark.sql(s""" - |create external table dummy_orc (id long, valueField long) - |partitioned by (partitionValue int) - |stored as orc - |location "${dir.toURI}"""".stripMargin) - spark.sql(s"msck repair table dummy_orc") - checkAnswer(spark.sql("select * from dummy_orc"), df) - } - } - } - } - test("SPARK-14962 Produce correct results on array type with isnotnull") { withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { val data = (0 until 10).map(i => Tuple1(Array(i))) @@ -544,7 +447,8 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { withTempPath { file => // It needs to repartition data so that we can have several 
ORC files // in order to skip stripes in ORC. - createDataFrame(data).toDF("a").repartition(10).write.orc(file.getCanonicalPath) + spark.createDataFrame(data).toDF("a").repartition(10) + .write.orc(file.getCanonicalPath) val df = spark.read.orc(file.getCanonicalPath).where("a == 2") val actual = stripSparkFilter(df).count() @@ -563,7 +467,8 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { withTempPath { file => // It needs to repartition data so that we can have several ORC files // in order to skip stripes in ORC. - createDataFrame(data).toDF("a").repartition(10).write.orc(file.getCanonicalPath) + spark.createDataFrame(data).toDF("a").repartition(10) + .write.orc(file.getCanonicalPath) val df = spark.read.orc(file.getCanonicalPath).where(s"a == '$timeString'") val actual = stripSparkFilter(df).count() @@ -596,14 +501,18 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { test("Empty schema does not read data from ORC file") { val data = Seq((1, 1), (2, 2)) withOrcFile(data) { path => - val requestedSchema = StructType(Nil) val conf = new Configuration() - val physicalSchema = OrcFileOperator.readSchema(Seq(path), Some(conf)).get - OrcFileFormat.setRequiredColumns(conf, physicalSchema, requestedSchema) - val maybeOrcReader = OrcFileOperator.getFileReader(path, Some(conf)) - assert(maybeOrcReader.isDefined) - val orcRecordReader = new SparkOrcNewRecordReader( - maybeOrcReader.get, conf, 0, maybeOrcReader.get.getContentLength) + conf.set(OrcConf.INCLUDE_COLUMNS.getAttribute, "") + conf.setBoolean("hive.io.file.read.all.columns", false) + + val orcRecordReader = { + val file = new File(path).listFiles().find(_.getName.endsWith(".snappy.orc")).head + val split = new FileSplit(new Path(file.toURI), 0, file.length, Array.empty[String]) + val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) + val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) + val oif = new OrcInputFormat[OrcStruct] + oif.createRecordReader(split, hadoopAttemptContext) + } val recordsIterator = new RecordReaderIterator[OrcStruct](orcRecordReader) try { @@ -614,27 +523,88 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { } } - test("read from multiple orc input paths") { - val path1 = Utils.createTempDir() - val path2 = Utils.createTempDir() - makeOrcFile((1 to 10).map(Tuple1.apply), path1) - makeOrcFile((1 to 10).map(Tuple1.apply), path2) - assertResult(20)(read.orc(path1.getCanonicalPath, path2.getCanonicalPath).count()) - } + test("read from multiple orc input paths") { + val path1 = Utils.createTempDir() + val path2 = Utils.createTempDir() + makeOrcFile((1 to 10).map(Tuple1.apply), path1) + makeOrcFile((1 to 10).map(Tuple1.apply), path2) + val df = spark.read.orc(path1.getCanonicalPath, path2.getCanonicalPath) + assert(df.count() == 20) + } +} + +class OrcQuerySuite extends OrcQueryTest with SharedSQLContext { + import testImplicits._ + + test("LZO compression options for writing to an ORC file") { + withTempPath { file => + spark.range(0, 10).write + .option("compression", "LZO") + .orc(file.getCanonicalPath) + + val maybeOrcFile = file.listFiles().find(_.getName.endsWith(".lzo.orc")) + assert(maybeOrcFile.isDefined) + + val orcFilePath = new Path(maybeOrcFile.get.getAbsolutePath) + val conf = OrcFile.readerOptions(new Configuration()) + assert("LZO" === OrcFile.createReader(orcFilePath, conf).getCompressionKind.name) + } + } + + test("Schema discovery on empty ORC files") { + // SPARK-8501 is 
fixed. + withTempPath { dir => + val path = dir.getCanonicalPath + + withTable("empty_orc") { + withTempView("empty", "single") { + spark.sql( + s"""CREATE TABLE empty_orc(key INT, value STRING) + |USING ORC + |LOCATION '${dir.toURI}' + """.stripMargin) + + val emptyDF = Seq.empty[(Int, String)].toDF("key", "value").coalesce(1) + emptyDF.createOrReplaceTempView("empty") + + // This creates 1 empty ORC file with ORC SerDe. We are using this trick because + // Spark SQL ORC data source always avoids write empty ORC files. + spark.sql( + s"""INSERT INTO TABLE empty_orc + |SELECT key, value FROM empty + """.stripMargin) + + val df = spark.read.orc(path) + assert(df.schema === emptyDF.schema.asNullable) + checkAnswer(df, emptyDF) + } + } + } + } + + test("SPARK-21791 ORC should support column names with dot") { + withTempDir { dir => + val path = new File(dir, "orc").getCanonicalPath + Seq(Some(1), None).toDF("col.dots").write.orc(path) + assert(spark.read.orc(path).collect().length == 2) + } + } test("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") { - Seq( - ("native", classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat]), - ("hive", classOf[org.apache.spark.sql.hive.orc.OrcFileFormat])).foreach { case (i, format) => - - withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> i) { - withTable("spark_20728") { - sql("CREATE TABLE spark_20728(a INT) USING ORC") - val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst { - case l: LogicalRelation => l.relation.asInstanceOf[HadoopFsRelation].fileFormat.getClass - } - assert(fileFormat == Some(format)) + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "hive") { + val e = intercept[AnalysisException] { + sql("CREATE TABLE spark_20728(a INT) USING ORC") + } + assert(e.message.contains("Hive built-in ORC data source must be used with Hive support")) + } + + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> "native") { + withTable("spark_20728") { + sql("CREATE TABLE spark_20728(a INT) USING ORC") + val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst { + case l: LogicalRelation => l.relation.asInstanceOf[HadoopFsRelation].fileFormat.getClass } + assert(fileFormat == Some(classOf[OrcFileFormat])) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala similarity index 63% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala index 2a086be57f517..6f5f2fd795f74 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcSourceSuite.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hive.orc +package org.apache.spark.sql.execution.datasources.orc import java.io.File import java.util.Locale @@ -23,50 +23,30 @@ import java.util.Locale import org.apache.orc.OrcConf.COMPRESS import org.scalatest.BeforeAndAfterAll -import org.apache.spark.sql.{QueryTest, Row} -import org.apache.spark.sql.execution.datasources.orc.OrcOptions -import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.Row import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources._ -import org.apache.spark.sql.types._ +import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils case class OrcData(intField: Int, stringField: String) -abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll { - import spark._ +abstract class OrcSuite extends OrcTest with BeforeAndAfterAll { + import testImplicits._ var orcTableDir: File = null var orcTableAsDir: File = null - override def beforeAll(): Unit = { + protected override def beforeAll(): Unit = { super.beforeAll() orcTableAsDir = Utils.createTempDir("orctests", "sparksql") - - // Hack: to prepare orc data files using hive external tables orcTableDir = Utils.createTempDir("orctests", "sparksql") - import org.apache.spark.sql.hive.test.TestHive.implicits._ sparkContext .makeRDD(1 to 10) .map(i => OrcData(i, s"part-$i")) .toDF() - .createOrReplaceTempView(s"orc_temp_table") - - sql( - s"""CREATE EXTERNAL TABLE normal_orc( - | intField INT, - | stringField STRING - |) - |STORED AS ORC - |LOCATION '${orcTableAsDir.toURI}' - """.stripMargin) - - sql( - s"""INSERT INTO TABLE normal_orc - |SELECT intField, stringField FROM orc_temp_table - """.stripMargin) + .createOrReplaceTempView("orc_temp_table") } test("create temporary orc table") { @@ -152,56 +132,13 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA } test("SPARK-18433: Improve DataSource option keys to be more case-insensitive") { - val conf = sqlContext.sessionState.conf + val conf = spark.sessionState.conf val option = new OrcOptions(Map(COMPRESS.getAttribute.toUpperCase(Locale.ROOT) -> "NONE"), conf) assert(option.compressionCodec == "NONE") } - test("SPARK-19459/SPARK-18220: read char/varchar column written by Hive") { - val location = Utils.createTempDir() - val uri = location.toURI - try { - hiveClient.runSqlHive("USE default") - hiveClient.runSqlHive( - """ - |CREATE EXTERNAL TABLE hive_orc( - | a STRING, - | b CHAR(10), - | c VARCHAR(10), - | d ARRAY) - |STORED AS orc""".stripMargin) - // Hive throws an exception if I assign the location in the create table statement. - hiveClient.runSqlHive( - s"ALTER TABLE hive_orc SET LOCATION '$uri'") - hiveClient.runSqlHive( - """ - |INSERT INTO TABLE hive_orc - |SELECT 'a', 'b', 'c', ARRAY(CAST('d' AS CHAR(3))) - |FROM (SELECT 1) t""".stripMargin) - - // We create a different table in Spark using the same schema which points to - // the same location. 
- spark.sql( - s""" - |CREATE EXTERNAL TABLE spark_orc( - | a STRING, - | b CHAR(10), - | c VARCHAR(10), - | d ARRAY) - |STORED AS orc - |LOCATION '$uri'""".stripMargin) - val result = Row("a", "b ", "c", Seq("d ")) - checkAnswer(spark.table("hive_orc"), result) - checkAnswer(spark.table("spark_orc"), result) - } finally { - hiveClient.runSqlHive("DROP TABLE IF EXISTS hive_orc") - hiveClient.runSqlHive("DROP TABLE IF EXISTS spark_orc") - Utils.deleteRecursively(location) - } - } - test("SPARK-21839: Add SQL config for ORC compression") { - val conf = sqlContext.sessionState.conf + val conf = spark.sessionState.conf // Test if the default of spark.sql.orc.compression.codec is snappy assert(new OrcOptions(Map.empty[String, String], conf).compressionCodec == "SNAPPY") @@ -225,13 +162,28 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA } } -class OrcSourceSuite extends OrcSuite { - override def beforeAll(): Unit = { +class OrcSourceSuite extends OrcSuite with SharedSQLContext { + + protected override def beforeAll(): Unit = { super.beforeAll() + sql( + s"""CREATE TABLE normal_orc( + | intField INT, + | stringField STRING + |) + |USING ORC + |LOCATION '${orcTableAsDir.toURI}' + """.stripMargin) + + sql( + s"""INSERT INTO TABLE normal_orc + |SELECT intField, stringField FROM orc_temp_table + """.stripMargin) + spark.sql( s"""CREATE TEMPORARY VIEW normal_orc_source - |USING org.apache.spark.sql.hive.orc + |USING ORC |OPTIONS ( | PATH '${new File(orcTableAsDir.getAbsolutePath).toURI}' |) @@ -239,43 +191,10 @@ class OrcSourceSuite extends OrcSuite { spark.sql( s"""CREATE TEMPORARY VIEW normal_orc_as_source - |USING org.apache.spark.sql.hive.orc + |USING ORC |OPTIONS ( | PATH '${new File(orcTableAsDir.getAbsolutePath).toURI}' |) """.stripMargin) } - - test("SPARK-12218 Converting conjunctions into ORC SearchArguments") { - // The `LessThan` should be converted while the `StringContains` shouldn't - val schema = new StructType( - Array( - StructField("a", IntegerType, nullable = true), - StructField("b", StringType, nullable = true))) - assertResult( - """leaf-0 = (LESS_THAN a 10) - |expr = leaf-0 - """.stripMargin.trim - ) { - OrcFilters.createFilter(schema, Array( - LessThan("a", 10), - StringContains("b", "prefix") - )).get.toString - } - - // The `LessThan` should be converted while the whole inner `And` shouldn't - assertResult( - """leaf-0 = (LESS_THAN a 10) - |expr = leaf-0 - """.stripMargin.trim - ) { - OrcFilters.createFilter(schema, Array( - LessThan("a", 10), - Not(And( - GreaterThan("a", 1), - StringContains("b", "prefix") - )) - )).get.toString - } - } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala similarity index 72% rename from sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala index a2f08c5ba72c6..d94cb850ed2a2 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala @@ -15,20 +15,51 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.hive.orc +package org.apache.spark.sql.execution.datasources.orc import java.io.File import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag +import org.scalatest.BeforeAndAfterAll + import org.apache.spark.sql._ -import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf.ORC_IMPLEMENTATION import org.apache.spark.sql.test.SQLTestUtils -private[sql] trait OrcTest extends SQLTestUtils with TestHiveSingleton { +/** + * OrcTest + * -> OrcSuite + * -> OrcSourceSuite + * -> HiveOrcSourceSuite + * -> OrcQueryTests + * -> OrcQuerySuite + * -> HiveOrcQuerySuite + * -> OrcPartitionDiscoveryTest + * -> OrcPartitionDiscoverySuite + * -> HiveOrcPartitionDiscoverySuite + * -> OrcFilterSuite + * -> HiveOrcFilterSuite + */ +abstract class OrcTest extends QueryTest with SQLTestUtils with BeforeAndAfterAll { import testImplicits._ + val orcImp: String = "native" + + private var originalConfORCImplementation = "native" + + protected override def beforeAll(): Unit = { + super.beforeAll() + originalConfORCImplementation = conf.getConf(ORC_IMPLEMENTATION) + conf.setConf(ORC_IMPLEMENTATION, orcImp) + } + + protected override def afterAll(): Unit = { + conf.setConf(ORC_IMPLEMENTATION, originalConfORCImplementation) + super.afterAll() + } + /** * Writes `data` to a Orc file, which is then passed to `f` and will be deleted after `f` * returns. diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcFilterSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcFilterSuite.scala new file mode 100644 index 0000000000000..283037caf4a9b --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcFilterSuite.scala @@ -0,0 +1,387 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.orc + +import java.nio.charset.StandardCharsets +import java.sql.{Date, Timestamp} + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument} + +import org.apache.spark.sql.{Column, DataFrame} +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.planning.PhysicalOperation +import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources.orc.OrcTest +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.types._ + +/** + * A test suite that tests Hive ORC filter API based filter pushdown optimization. 
+ */ +class HiveOrcFilterSuite extends OrcTest with TestHiveSingleton { + + override val orcImp: String = "hive" + + private def checkFilterPredicate( + df: DataFrame, + predicate: Predicate, + checker: (SearchArgument) => Unit): Unit = { + val output = predicate.collect { case a: Attribute => a }.distinct + val query = df + .select(output.map(e => Column(e)): _*) + .where(Column(predicate)) + + var maybeRelation: Option[HadoopFsRelation] = None + val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { + case PhysicalOperation(_, filters, LogicalRelation(orcRelation: HadoopFsRelation, _, _, _)) => + maybeRelation = Some(orcRelation) + filters + }.flatten.reduceLeftOption(_ && _) + assert(maybeAnalyzedPredicate.isDefined, "No filter is analyzed from the given query") + + val (_, selectedFilters, _) = + DataSourceStrategy.selectFilters(maybeRelation.get, maybeAnalyzedPredicate.toSeq) + assert(selectedFilters.nonEmpty, "No filter is pushed down") + + val maybeFilter = OrcFilters.createFilter(query.schema, selectedFilters.toArray) + assert(maybeFilter.isDefined, s"Couldn't generate filter predicate for $selectedFilters") + checker(maybeFilter.get) + } + + private def checkFilterPredicate + (predicate: Predicate, filterOperator: PredicateLeaf.Operator) + (implicit df: DataFrame): Unit = { + def checkComparisonOperator(filter: SearchArgument) = { + val operator = filter.getLeaves.asScala + assert(operator.map(_.getOperator).contains(filterOperator)) + } + checkFilterPredicate(df, predicate, checkComparisonOperator) + } + + private def checkFilterPredicate + (predicate: Predicate, stringExpr: String) + (implicit df: DataFrame): Unit = { + def checkLogicalOperator(filter: SearchArgument) = { + assert(filter.toString == stringExpr) + } + checkFilterPredicate(df, predicate, checkLogicalOperator) + } + + private def checkNoFilterPredicate + (predicate: Predicate) + (implicit df: DataFrame): Unit = { + val output = predicate.collect { case a: Attribute => a }.distinct + val query = df + .select(output.map(e => Column(e)): _*) + .where(Column(predicate)) + + var maybeRelation: Option[HadoopFsRelation] = None + val maybeAnalyzedPredicate = query.queryExecution.optimizedPlan.collect { + case PhysicalOperation(_, filters, LogicalRelation(orcRelation: HadoopFsRelation, _, _, _)) => + maybeRelation = Some(orcRelation) + filters + }.flatten.reduceLeftOption(_ && _) + assert(maybeAnalyzedPredicate.isDefined, "No filter is analyzed from the given query") + + val (_, selectedFilters, _) = + DataSourceStrategy.selectFilters(maybeRelation.get, maybeAnalyzedPredicate.toSeq) + assert(selectedFilters.nonEmpty, "No filter is pushed down") + + val maybeFilter = OrcFilters.createFilter(query.schema, selectedFilters.toArray) + assert(maybeFilter.isEmpty, s"Could generate filter predicate for $selectedFilters") + } + + test("filter pushdown - integer") { + withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => + checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) + 
checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + } + } + + test("filter pushdown - long") { + withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toLong)))) { implicit df => + checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + } + } + + test("filter pushdown - float") { + withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toFloat)))) { implicit df => + checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + } + } + + test("filter pushdown - double") { + withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i.toDouble)))) { implicit df => + checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate('_1 === 1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate('_1 <=> 1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate('_1 < 2, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate('_1 > 3, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 <= 1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 >= 4, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(1) === '_1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(1) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(2) > '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(3) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + 
checkFilterPredicate(Literal(1) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(4) <= '_1, PredicateLeaf.Operator.LESS_THAN) + } + } + + test("filter pushdown - string") { + withOrcDataFrame((1 to 4).map(i => Tuple1(i.toString))) { implicit df => + checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate('_1 === "1", PredicateLeaf.Operator.EQUALS) + checkFilterPredicate('_1 <=> "1", PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate('_1 < "2", PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate('_1 > "3", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 <= "1", PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 >= "4", PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal("1") === '_1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal("1") <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal("2") > '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal("3") < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal("1") >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal("4") <= '_1, PredicateLeaf.Operator.LESS_THAN) + } + } + + test("filter pushdown - boolean") { + withOrcDataFrame((true :: false :: Nil).map(b => Tuple1.apply(Option(b)))) { implicit df => + checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate('_1 === true, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate('_1 <=> true, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate('_1 < true, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate('_1 > false, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 <= false, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 >= false, PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(false) === '_1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(false) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(false) > '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(true) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(true) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(true) <= '_1, PredicateLeaf.Operator.LESS_THAN) + } + } + + test("filter pushdown - decimal") { + withOrcDataFrame((1 to 4).map(i => Tuple1.apply(BigDecimal.valueOf(i)))) { implicit df => + checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate('_1 === BigDecimal.valueOf(1), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate('_1 <=> BigDecimal.valueOf(1), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate('_1 < BigDecimal.valueOf(2), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate('_1 > BigDecimal.valueOf(3), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 <= BigDecimal.valueOf(1), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 >= BigDecimal.valueOf(4), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate( + Literal(BigDecimal.valueOf(1)) === '_1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate( + Literal(BigDecimal.valueOf(1)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate( + Literal(BigDecimal.valueOf(2)) > '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate( + Literal(BigDecimal.valueOf(3)) < '_1, 
PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate( + Literal(BigDecimal.valueOf(1)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate( + Literal(BigDecimal.valueOf(4)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + } + } + + test("filter pushdown - timestamp") { + val timeString = "2015-08-20 14:57:00" + val timestamps = (1 to 4).map { i => + val milliseconds = Timestamp.valueOf(timeString).getTime + i * 3600 + new Timestamp(milliseconds) + } + withOrcDataFrame(timestamps.map(Tuple1(_))) { implicit df => + checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate('_1 === timestamps(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate('_1 <=> timestamps(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate('_1 < timestamps(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate('_1 > timestamps(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 <= timestamps(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 >= timestamps(3), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(timestamps(0)) === '_1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(timestamps(0)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(timestamps(1)) > '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(timestamps(2)) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(timestamps(0)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(timestamps(3)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + } + } + + test("filter pushdown - combinations with logical operators") { + withOrcDataFrame((1 to 4).map(i => Tuple1(Option(i)))) { implicit df => + // Because `ExpressionTree` is not accessible at Hive 1.2.x, this should be checked + // in string form in order to check filter creation including logical operators + // such as `and`, `or` or `not`. So, this function uses `SearchArgument.toString()` + // to produce string expression and then compare it to given string expression below. + // This might have to be changed after Hive version is upgraded. 
+ checkFilterPredicate( + '_1.isNotNull, + """leaf-0 = (IS_NULL _1) + |expr = (not leaf-0)""".stripMargin.trim + ) + checkFilterPredicate( + '_1 =!= 1, + """leaf-0 = (IS_NULL _1) + |leaf-1 = (EQUALS _1 1) + |expr = (and (not leaf-0) (not leaf-1))""".stripMargin.trim + ) + checkFilterPredicate( + !('_1 < 4), + """leaf-0 = (IS_NULL _1) + |leaf-1 = (LESS_THAN _1 4) + |expr = (and (not leaf-0) (not leaf-1))""".stripMargin.trim + ) + checkFilterPredicate( + '_1 < 2 || '_1 > 3, + """leaf-0 = (LESS_THAN _1 2) + |leaf-1 = (LESS_THAN_EQUALS _1 3) + |expr = (or leaf-0 (not leaf-1))""".stripMargin.trim + ) + checkFilterPredicate( + '_1 < 2 && '_1 > 3, + """leaf-0 = (IS_NULL _1) + |leaf-1 = (LESS_THAN _1 2) + |leaf-2 = (LESS_THAN_EQUALS _1 3) + |expr = (and (not leaf-0) leaf-1 (not leaf-2))""".stripMargin.trim + ) + } + } + + test("no filter pushdown - non-supported types") { + implicit class IntToBinary(int: Int) { + def b: Array[Byte] = int.toString.getBytes(StandardCharsets.UTF_8) + } + // ArrayType + withOrcDataFrame((1 to 4).map(i => Tuple1(Array(i)))) { implicit df => + checkNoFilterPredicate('_1.isNull) + } + // BinaryType + withOrcDataFrame((1 to 4).map(i => Tuple1(i.b))) { implicit df => + checkNoFilterPredicate('_1 <=> 1.b) + } + // DateType + val stringDate = "2015-01-01" + withOrcDataFrame(Seq(Tuple1(Date.valueOf(stringDate)))) { implicit df => + checkNoFilterPredicate('_1 === Date.valueOf(stringDate)) + } + // MapType + withOrcDataFrame((1 to 4).map(i => Tuple1(Map(i -> i)))) { implicit df => + checkNoFilterPredicate('_1.isNotNull) + } + } + + test("SPARK-12218 Converting conjunctions into ORC SearchArguments") { + import org.apache.spark.sql.sources._ + // The `LessThan` should be converted while the `StringContains` shouldn't + val schema = new StructType( + Array( + StructField("a", IntegerType, nullable = true), + StructField("b", StringType, nullable = true))) + assertResult( + """leaf-0 = (LESS_THAN a 10) + |expr = leaf-0 + """.stripMargin.trim + ) { + OrcFilters.createFilter(schema, Array( + LessThan("a", 10), + StringContains("b", "prefix") + )).get.toString + } + + // The `LessThan` should be converted while the whole inner `And` shouldn't + assertResult( + """leaf-0 = (LESS_THAN a 10) + |expr = leaf-0 + """.stripMargin.trim + ) { + OrcFilters.createFilter(schema, Array( + LessThan("a", 10), + Not(And( + GreaterThan("a", 1), + StringContains("b", "prefix") + )) + )).get.toString + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcPartitionDiscoverySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcPartitionDiscoverySuite.scala new file mode 100644 index 0000000000000..ab9b492f347c6 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcPartitionDiscoverySuite.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.orc + +import org.apache.spark.sql.execution.datasources.orc.OrcPartitionDiscoveryTest +import org.apache.spark.sql.hive.test.TestHiveSingleton + +class HiveOrcPartitionDiscoverySuite extends OrcPartitionDiscoveryTest with TestHiveSingleton { + override val orcImp: String = "hive" +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala new file mode 100644 index 0000000000000..7244c369bd3f4 --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.orc + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.catalog.HiveTableRelation +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.execution.datasources.orc.OrcQueryTest +import org.apache.spark.sql.hive.HiveUtils +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.SQLConf + +class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { + import testImplicits._ + + override val orcImp: String = "hive" + + test("SPARK-8501: Avoids discovery schema from empty ORC files") { + withTempPath { dir => + val path = dir.getCanonicalPath + + withTable("empty_orc") { + withTempView("empty", "single") { + spark.sql( + s"""CREATE TABLE empty_orc(key INT, value STRING) + |STORED AS ORC + |LOCATION '${dir.toURI}' + """.stripMargin) + + val emptyDF = Seq.empty[(Int, String)].toDF("key", "value").coalesce(1) + emptyDF.createOrReplaceTempView("empty") + + // This creates 1 empty ORC file with Hive ORC SerDe. We are using this trick because + // Spark SQL ORC data source always avoids write empty ORC files. 
+ spark.sql( + s"""INSERT INTO TABLE empty_orc + |SELECT key, value FROM empty + """.stripMargin) + + val errorMessage = intercept[AnalysisException] { + spark.read.orc(path) + }.getMessage + + assert(errorMessage.contains("Unable to infer schema for ORC")) + + val singleRowDF = Seq((0, "foo")).toDF("key", "value").coalesce(1) + singleRowDF.createOrReplaceTempView("single") + + spark.sql( + s"""INSERT INTO TABLE empty_orc + |SELECT key, value FROM single + """.stripMargin) + + val df = spark.read.orc(path) + assert(df.schema === singleRowDF.schema.asNullable) + checkAnswer(df, singleRowDF) + } + } + } + } + + test("Verify the ORC conversion parameter: CONVERT_METASTORE_ORC") { + withTempView("single") { + val singleRowDF = Seq((0, "foo")).toDF("key", "value") + singleRowDF.createOrReplaceTempView("single") + + Seq("true", "false").foreach { orcConversion => + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> orcConversion) { + withTable("dummy_orc") { + withTempPath { dir => + val path = dir.getCanonicalPath + spark.sql( + s""" + |CREATE TABLE dummy_orc(key INT, value STRING) + |STORED AS ORC + |LOCATION '${dir.toURI}' + """.stripMargin) + + spark.sql( + s""" + |INSERT INTO TABLE dummy_orc + |SELECT key, value FROM single + """.stripMargin) + + val df = spark.sql("SELECT * FROM dummy_orc WHERE key=0") + checkAnswer(df, singleRowDF) + + val queryExecution = df.queryExecution + if (orcConversion == "true") { + queryExecution.analyzed.collectFirst { + case _: LogicalRelation => () + }.getOrElse { + fail(s"Expecting the query plan to convert orc to data sources, " + + s"but got:\n$queryExecution") + } + } else { + queryExecution.analyzed.collectFirst { + case _: HiveTableRelation => () + }.getOrElse { + fail(s"Expecting no conversion from orc to data sources, " + + s"but got:\n$queryExecution") + } + } + } + } + } + } + } + } + + test("converted ORC table supports resolving mixed case field") { + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { + withTable("dummy_orc") { + withTempPath { dir => + val df = spark.range(5).selectExpr("id", "id as valueField", "id as partitionValue") + df.write + .partitionBy("partitionValue") + .mode("overwrite") + .orc(dir.getAbsolutePath) + + spark.sql(s""" + |create external table dummy_orc (id long, valueField long) + |partitioned by (partitionValue int) + |stored as orc + |location "${dir.toURI}"""".stripMargin) + spark.sql(s"msck repair table dummy_orc") + checkAnswer(spark.sql("select * from dummy_orc"), df) + } + } + } + } + + test("SPARK-20728 Make ORCFileFormat configurable between sql/hive and sql/core") { + Seq( + ("native", classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat]), + ("hive", classOf[org.apache.spark.sql.hive.orc.OrcFileFormat])).foreach { + case (orcImpl, format) => + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> orcImpl) { + withTable("spark_20728") { + sql("CREATE TABLE spark_20728(a INT) USING ORC") + val fileFormat = sql("SELECT * FROM spark_20728").queryExecution.analyzed.collectFirst { + case l: LogicalRelation => + l.relation.asInstanceOf[HadoopFsRelation].fileFormat.getClass + } + assert(fileFormat == Some(format)) + } + } + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala new file mode 100644 index 0000000000000..17b7d8cfe127e --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -0,0 +1,107 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.orc + +import java.io.File + +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.datasources.orc.OrcSuite +import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.util.Utils + +class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { + + override val orcImp: String = "hive" + + override def beforeAll(): Unit = { + super.beforeAll() + + sql( + s"""CREATE EXTERNAL TABLE normal_orc( + | intField INT, + | stringField STRING + |) + |STORED AS ORC + |LOCATION '${orcTableAsDir.toURI}' + """.stripMargin) + + sql( + s"""INSERT INTO TABLE normal_orc + |SELECT intField, stringField FROM orc_temp_table + """.stripMargin) + + spark.sql( + s"""CREATE TEMPORARY VIEW normal_orc_source + |USING org.apache.spark.sql.hive.orc + |OPTIONS ( + | PATH '${new File(orcTableAsDir.getAbsolutePath).toURI}' + |) + """.stripMargin) + + spark.sql( + s"""CREATE TEMPORARY VIEW normal_orc_as_source + |USING org.apache.spark.sql.hive.orc + |OPTIONS ( + | PATH '${new File(orcTableAsDir.getAbsolutePath).toURI}' + |) + """.stripMargin) + } + + test("SPARK-19459/SPARK-18220: read char/varchar column written by Hive") { + val location = Utils.createTempDir() + val uri = location.toURI + try { + hiveClient.runSqlHive("USE default") + hiveClient.runSqlHive( + """ + |CREATE EXTERNAL TABLE hive_orc( + | a STRING, + | b CHAR(10), + | c VARCHAR(10), + | d ARRAY) + |STORED AS orc""".stripMargin) + // Hive throws an exception if I assign the location in the create table statement. + hiveClient.runSqlHive( + s"ALTER TABLE hive_orc SET LOCATION '$uri'") + hiveClient.runSqlHive( + """ + |INSERT INTO TABLE hive_orc + |SELECT 'a', 'b', 'c', ARRAY(CAST('d' AS CHAR(3))) + |FROM (SELECT 1) t""".stripMargin) + + // We create a different table in Spark using the same schema which points to + // the same location. 
+ spark.sql( + s""" + |CREATE EXTERNAL TABLE spark_orc( + | a STRING, + | b CHAR(10), + | c VARCHAR(10), + | d ARRAY) + |STORED AS orc + |LOCATION '$uri'""".stripMargin) + val result = Row("a", "b ", "c", Seq("d ")) + checkAnswer(spark.table("hive_orc"), result) + checkAnswer(spark.table("spark_orc"), result) + } finally { + hiveClient.runSqlHive("DROP TABLE IF EXISTS hive_orc") + hiveClient.runSqlHive("DROP TABLE IF EXISTS spark_orc") + Utils.deleteRecursively(location) + } + } +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala index ba0a7605da71c..f87162f94c01a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcHadoopFsRelationSuite.scala @@ -30,7 +30,8 @@ import org.apache.spark.sql.types._ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { import testImplicits._ - override val dataSourceName: String = classOf[OrcFileFormat].getCanonicalName + override val dataSourceName: String = + classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName // ORC does not play well with NullType and UDT. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { @@ -116,3 +117,8 @@ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { } } } + +class HiveOrcHadoopFsRelationSuite extends OrcHadoopFsRelationSuite { + override val dataSourceName: String = + classOf[org.apache.spark.sql.hive.orc.OrcFileFormat].getCanonicalName +} From e103adf45ae3e80626554370691987d8297dda0c Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 7 Dec 2017 20:45:11 +0800 Subject: [PATCH 059/356] [SPARK-22703][SQL] make ColumnarRow an immutable view ## What changes were proposed in this pull request? Similar to https://github.com/apache/spark/pull/19842 , we should also make `ColumnarRow` an immutable view, and move forward to make `ColumnVector` public. ## How was this patch tested? Existing tests. The performance concern should be same as https://github.com/apache/spark/pull/19842 . Author: Wenchen Fan Closes #19898 from cloud-fan/row-id. 
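To make the motivation concrete, here is a minimal, self-contained sketch of the difference between handing out one reusable mutable staging row and handing out an immutable per-row view. It does not use Spark's actual `ColumnVector`/`ColumnarRow` classes; `IntColumn`, `RowView`, `MutableRowView` and `ImmutableViewSketch` are made-up names for illustration only.

```
import java.util.ArrayList;
import java.util.List;

// Hypothetical, simplified column: a single int column backed by an array.
final class IntColumn {
  private final int[] values;
  IntColumn(int[] values) { this.values = values; }
  int getInt(int rowId) { return values[rowId]; }
  int numRows() { return values.length; }
}

// Immutable view: the row id is fixed at construction, so the object can be
// handed out safely; every read goes back to the underlying column.
final class RowView {
  private final IntColumn column;
  private final int rowId;
  RowView(IntColumn column, int rowId) { this.column = column; this.rowId = rowId; }
  int getInt() { return column.getInt(rowId); }
}

// Mutable staging row: one instance is reused while iterating, so callers
// must copy the data out before the row id is advanced.
final class MutableRowView {
  private final IntColumn column;
  int rowId;
  MutableRowView(IntColumn column) { this.column = column; }
  int getInt() { return column.getInt(rowId); }
}

public class ImmutableViewSketch {
  public static void main(String[] args) {
    IntColumn col = new IntColumn(new int[] {10, 20, 30});

    // Collecting the reused mutable row without copying: every element ends up
    // observing the last row id that was set.
    List<MutableRowView> staged = new ArrayList<>();
    MutableRowView staging = new MutableRowView(col);
    for (int i = 0; i < col.numRows(); i++) {
      staging.rowId = i;
      staged.add(staging);
    }
    System.out.println(staged.get(0).getInt());  // prints 30, not 10

    // Collecting immutable views is safe: each view remembers its own row id.
    List<RowView> views = new ArrayList<>();
    for (int i = 0; i < col.numRows(); i++) {
      views.add(new RowView(col, i));
    }
    System.out.println(views.get(0).getInt());   // prints 10
  }
}
```

The patch keeps the same split: `ColumnVector.getStruct` now returns a fresh immutable `ColumnarRow`, while the reusable `MutableColumnarRow` stays confined to batch-internal hot paths (`ColumnarBatch.rowIterator`/`getRow` and the `AggregateHashMap` buffer row).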
--- .../vectorized/AggregateHashMap.java | 9 +-- .../vectorized/ArrowColumnVector.java | 1 - .../execution/vectorized/ColumnVector.java | 15 ++--- .../execution/vectorized/ColumnarBatch.java | 22 +++---- .../sql/execution/vectorized/ColumnarRow.java | 66 ++++++++++--------- .../vectorized/MutableColumnarRow.java | 47 +++++++------ .../vectorized/OffHeapColumnVector.java | 2 +- .../vectorized/OnHeapColumnVector.java | 2 +- .../vectorized/WritableColumnVector.java | 5 -- .../aggregate/HashAggregateExec.scala | 9 +-- .../VectorizedHashMapGenerator.scala | 5 +- .../vectorized/ColumnarBatchSuite.scala | 5 -- 12 files changed, 89 insertions(+), 99 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/AggregateHashMap.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/AggregateHashMap.java index 9467435435d1f..24260b05194a7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/AggregateHashMap.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/AggregateHashMap.java @@ -41,7 +41,7 @@ public class AggregateHashMap { private OnHeapColumnVector[] columnVectors; - private ColumnarBatch batch; + private MutableColumnarRow aggBufferRow; private int[] buckets; private int numBuckets; private int numRows = 0; @@ -63,7 +63,7 @@ public AggregateHashMap(StructType schema, int capacity, double loadFactor, int this.maxSteps = maxSteps; numBuckets = (int) (capacity / loadFactor); columnVectors = OnHeapColumnVector.allocateColumns(capacity, schema); - batch = new ColumnarBatch(schema, columnVectors, capacity); + aggBufferRow = new MutableColumnarRow(columnVectors); buckets = new int[numBuckets]; Arrays.fill(buckets, -1); } @@ -72,14 +72,15 @@ public AggregateHashMap(StructType schema) { this(schema, DEFAULT_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_MAX_STEPS); } - public ColumnarRow findOrInsert(long key) { + public MutableColumnarRow findOrInsert(long key) { int idx = find(key); if (idx != -1 && buckets[idx] == -1) { columnVectors[0].putLong(numRows, key); columnVectors[1].putLong(numRows, 0); buckets[idx] = numRows++; } - return batch.getRow(buckets[idx]); + aggBufferRow.rowId = buckets[idx]; + return aggBufferRow; } @VisibleForTesting diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java index 0071bd66760be..1f1347ccd315e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java @@ -323,7 +323,6 @@ public ArrowColumnVector(ValueVector vector) { for (int i = 0; i < childColumns.length; ++i) { childColumns[i] = new ArrowColumnVector(mapVector.getVectorById(i)); } - resultStruct = new ColumnarRow(childColumns); } else { throw new UnsupportedOperationException(); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java index cca14911fbb28..e6b87519239dd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java @@ -157,18 +157,16 @@ public abstract class ColumnVector implements AutoCloseable { /** * Returns a utility object to get structs. 
*/ - public ColumnarRow getStruct(int rowId) { - resultStruct.rowId = rowId; - return resultStruct; + public final ColumnarRow getStruct(int rowId) { + return new ColumnarRow(this, rowId); } /** * Returns a utility object to get structs. * provided to keep API compatibility with InternalRow for code generation */ - public ColumnarRow getStruct(int rowId, int size) { - resultStruct.rowId = rowId; - return resultStruct; + public final ColumnarRow getStruct(int rowId, int size) { + return getStruct(rowId); } /** @@ -216,11 +214,6 @@ public MapData getMap(int ordinal) { */ protected DataType type; - /** - * Reusable Struct holder for getStruct(). - */ - protected ColumnarRow resultStruct; - /** * The Dictionary for this column. * diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java index 2f5fb360b226f..a9d09aa679726 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java @@ -18,6 +18,7 @@ import java.util.*; +import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.types.StructType; /** @@ -40,10 +41,10 @@ public final class ColumnarBatch { private final StructType schema; private final int capacity; private int numRows; - final ColumnVector[] columns; + private final ColumnVector[] columns; - // Staging row returned from getRow. - final ColumnarRow row; + // Staging row returned from `getRow`. + private final MutableColumnarRow row; /** * Called to close all the columns in this batch. It is not valid to access the data after @@ -58,10 +59,10 @@ public void close() { /** * Returns an iterator over the rows in this batch. This skips rows that are filtered out. */ - public Iterator rowIterator() { + public Iterator rowIterator() { final int maxRows = numRows; - final ColumnarRow row = new ColumnarRow(columns); - return new Iterator() { + final MutableColumnarRow row = new MutableColumnarRow(columns); + return new Iterator() { int rowId = 0; @Override @@ -70,7 +71,7 @@ public boolean hasNext() { } @Override - public ColumnarRow next() { + public InternalRow next() { if (rowId >= maxRows) { throw new NoSuchElementException(); } @@ -133,9 +134,8 @@ public void setNumRows(int numRows) { /** * Returns the row in this batch at `rowId`. Returned row is reused across calls. */ - public ColumnarRow getRow(int rowId) { - assert(rowId >= 0); - assert(rowId < numRows); + public InternalRow getRow(int rowId) { + assert(rowId >= 0 && rowId < numRows); row.rowId = rowId; return row; } @@ -144,6 +144,6 @@ public ColumnarBatch(StructType schema, ColumnVector[] columns, int capacity) { this.schema = schema; this.columns = columns; this.capacity = capacity; - this.row = new ColumnarRow(columns); + this.row = new MutableColumnarRow(columns); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java index cabb7479525d9..95c0d09873d67 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java @@ -28,30 +28,32 @@ * to be reused, callers should copy the data out if it needs to be stored. 
*/ public final class ColumnarRow extends InternalRow { - protected int rowId; - private final ColumnVector[] columns; - - // Ctor used if this is a struct. - ColumnarRow(ColumnVector[] columns) { - this.columns = columns; + // The data for this row. E.g. the value of 3rd int field is `data.getChildColumn(3).getInt(rowId)`. + private final ColumnVector data; + private final int rowId; + private final int numFields; + + ColumnarRow(ColumnVector data, int rowId) { + assert (data.dataType() instanceof StructType); + this.data = data; + this.rowId = rowId; + this.numFields = ((StructType) data.dataType()).size(); } - public ColumnVector[] columns() { return columns; } - @Override - public int numFields() { return columns.length; } + public int numFields() { return numFields; } /** * Revisit this. This is expensive. This is currently only used in test paths. */ @Override public InternalRow copy() { - GenericInternalRow row = new GenericInternalRow(columns.length); + GenericInternalRow row = new GenericInternalRow(numFields); for (int i = 0; i < numFields(); i++) { if (isNullAt(i)) { row.setNullAt(i); } else { - DataType dt = columns[i].dataType(); + DataType dt = data.getChildColumn(i).dataType(); if (dt instanceof BooleanType) { row.setBoolean(i, getBoolean(i)); } else if (dt instanceof ByteType) { @@ -91,65 +93,65 @@ public boolean anyNull() { } @Override - public boolean isNullAt(int ordinal) { return columns[ordinal].isNullAt(rowId); } + public boolean isNullAt(int ordinal) { return data.getChildColumn(ordinal).isNullAt(rowId); } @Override - public boolean getBoolean(int ordinal) { return columns[ordinal].getBoolean(rowId); } + public boolean getBoolean(int ordinal) { return data.getChildColumn(ordinal).getBoolean(rowId); } @Override - public byte getByte(int ordinal) { return columns[ordinal].getByte(rowId); } + public byte getByte(int ordinal) { return data.getChildColumn(ordinal).getByte(rowId); } @Override - public short getShort(int ordinal) { return columns[ordinal].getShort(rowId); } + public short getShort(int ordinal) { return data.getChildColumn(ordinal).getShort(rowId); } @Override - public int getInt(int ordinal) { return columns[ordinal].getInt(rowId); } + public int getInt(int ordinal) { return data.getChildColumn(ordinal).getInt(rowId); } @Override - public long getLong(int ordinal) { return columns[ordinal].getLong(rowId); } + public long getLong(int ordinal) { return data.getChildColumn(ordinal).getLong(rowId); } @Override - public float getFloat(int ordinal) { return columns[ordinal].getFloat(rowId); } + public float getFloat(int ordinal) { return data.getChildColumn(ordinal).getFloat(rowId); } @Override - public double getDouble(int ordinal) { return columns[ordinal].getDouble(rowId); } + public double getDouble(int ordinal) { return data.getChildColumn(ordinal).getDouble(rowId); } @Override public Decimal getDecimal(int ordinal, int precision, int scale) { - if (columns[ordinal].isNullAt(rowId)) return null; - return columns[ordinal].getDecimal(rowId, precision, scale); + if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; + return data.getChildColumn(ordinal).getDecimal(rowId, precision, scale); } @Override public UTF8String getUTF8String(int ordinal) { - if (columns[ordinal].isNullAt(rowId)) return null; - return columns[ordinal].getUTF8String(rowId); + if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; + return data.getChildColumn(ordinal).getUTF8String(rowId); } @Override public byte[] getBinary(int ordinal) { - if 
(columns[ordinal].isNullAt(rowId)) return null; - return columns[ordinal].getBinary(rowId); + if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; + return data.getChildColumn(ordinal).getBinary(rowId); } @Override public CalendarInterval getInterval(int ordinal) { - if (columns[ordinal].isNullAt(rowId)) return null; - final int months = columns[ordinal].getChildColumn(0).getInt(rowId); - final long microseconds = columns[ordinal].getChildColumn(1).getLong(rowId); + if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; + final int months = data.getChildColumn(ordinal).getChildColumn(0).getInt(rowId); + final long microseconds = data.getChildColumn(ordinal).getChildColumn(1).getLong(rowId); return new CalendarInterval(months, microseconds); } @Override public ColumnarRow getStruct(int ordinal, int numFields) { - if (columns[ordinal].isNullAt(rowId)) return null; - return columns[ordinal].getStruct(rowId); + if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; + return data.getChildColumn(ordinal).getStruct(rowId); } @Override public ColumnarArray getArray(int ordinal) { - if (columns[ordinal].isNullAt(rowId)) return null; - return columns[ordinal].getArray(rowId); + if (data.getChildColumn(ordinal).isNullAt(rowId)) return null; + return data.getChildColumn(ordinal).getArray(rowId); } @Override diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java index f272cc163611b..06602c147dfe9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java @@ -28,17 +28,24 @@ /** * A mutable version of {@link ColumnarRow}, which is used in the vectorized hash map for hash - * aggregate. + * aggregate, and {@link ColumnarBatch} to save object creation. * * Note that this class intentionally has a lot of duplicated code with {@link ColumnarRow}, to * avoid java polymorphism overhead by keeping {@link ColumnarRow} and this class final classes. 
*/ public final class MutableColumnarRow extends InternalRow { public int rowId; - private final WritableColumnVector[] columns; + private final ColumnVector[] columns; + private final WritableColumnVector[] writableColumns; - public MutableColumnarRow(WritableColumnVector[] columns) { + public MutableColumnarRow(ColumnVector[] columns) { this.columns = columns; + this.writableColumns = null; + } + + public MutableColumnarRow(WritableColumnVector[] writableColumns) { + this.columns = writableColumns; + this.writableColumns = writableColumns; } @Override @@ -225,54 +232,54 @@ public void update(int ordinal, Object value) { @Override public void setNullAt(int ordinal) { - columns[ordinal].putNull(rowId); + writableColumns[ordinal].putNull(rowId); } @Override public void setBoolean(int ordinal, boolean value) { - columns[ordinal].putNotNull(rowId); - columns[ordinal].putBoolean(rowId, value); + writableColumns[ordinal].putNotNull(rowId); + writableColumns[ordinal].putBoolean(rowId, value); } @Override public void setByte(int ordinal, byte value) { - columns[ordinal].putNotNull(rowId); - columns[ordinal].putByte(rowId, value); + writableColumns[ordinal].putNotNull(rowId); + writableColumns[ordinal].putByte(rowId, value); } @Override public void setShort(int ordinal, short value) { - columns[ordinal].putNotNull(rowId); - columns[ordinal].putShort(rowId, value); + writableColumns[ordinal].putNotNull(rowId); + writableColumns[ordinal].putShort(rowId, value); } @Override public void setInt(int ordinal, int value) { - columns[ordinal].putNotNull(rowId); - columns[ordinal].putInt(rowId, value); + writableColumns[ordinal].putNotNull(rowId); + writableColumns[ordinal].putInt(rowId, value); } @Override public void setLong(int ordinal, long value) { - columns[ordinal].putNotNull(rowId); - columns[ordinal].putLong(rowId, value); + writableColumns[ordinal].putNotNull(rowId); + writableColumns[ordinal].putLong(rowId, value); } @Override public void setFloat(int ordinal, float value) { - columns[ordinal].putNotNull(rowId); - columns[ordinal].putFloat(rowId, value); + writableColumns[ordinal].putNotNull(rowId); + writableColumns[ordinal].putFloat(rowId, value); } @Override public void setDouble(int ordinal, double value) { - columns[ordinal].putNotNull(rowId); - columns[ordinal].putDouble(rowId, value); + writableColumns[ordinal].putNotNull(rowId); + writableColumns[ordinal].putDouble(rowId, value); } @Override public void setDecimal(int ordinal, Decimal value, int precision) { - columns[ordinal].putNotNull(rowId); - columns[ordinal].putDecimal(rowId, value, precision); + writableColumns[ordinal].putNotNull(rowId); + writableColumns[ordinal].putDecimal(rowId, value, precision); } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 806d0291a6c49..5f1b9885334b7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -547,7 +547,7 @@ protected void reserveInternal(int newCapacity) { } else if (type instanceof LongType || type instanceof DoubleType || DecimalType.is64BitDecimalType(type) || type instanceof TimestampType) { this.data = Platform.reallocateMemory(data, oldCapacity * 8, newCapacity * 8); - } else if (resultStruct != null) { + } else if (childColumns != null) { // Nothing to store. 
} else { throw new RuntimeException("Unhandled " + type); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index 6e7f74ce12f16..f12772ede575d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -558,7 +558,7 @@ protected void reserveInternal(int newCapacity) { if (doubleData != null) System.arraycopy(doubleData, 0, newData, 0, capacity); doubleData = newData; } - } else if (resultStruct != null) { + } else if (childColumns != null) { // Nothing to store. } else { throw new RuntimeException("Unhandled " + type); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 0bea4cc97142d..7c053b579442c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -74,7 +74,6 @@ public void close() { dictionaryIds = null; } dictionary = null; - resultStruct = null; } public void reserve(int requiredCapacity) { @@ -673,23 +672,19 @@ protected WritableColumnVector(int capacity, DataType type) { } this.childColumns = new WritableColumnVector[1]; this.childColumns[0] = reserveNewColumn(childCapacity, childType); - this.resultStruct = null; } else if (type instanceof StructType) { StructType st = (StructType)type; this.childColumns = new WritableColumnVector[st.fields().length]; for (int i = 0; i < childColumns.length; ++i) { this.childColumns[i] = reserveNewColumn(capacity, st.fields()[i].dataType()); } - this.resultStruct = new ColumnarRow(this.childColumns); } else if (type instanceof CalendarIntervalType) { // Two columns. Months as int. Microseconds as Long. 
this.childColumns = new WritableColumnVector[2]; this.childColumns[0] = reserveNewColumn(capacity, DataTypes.IntegerType); this.childColumns[1] = reserveNewColumn(capacity, DataTypes.LongType); - this.resultStruct = new ColumnarRow(this.childColumns); } else { this.childColumns = null; - this.resultStruct = null; } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 26d8cd7278353..9cadd13999e72 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -595,9 +595,7 @@ case class HashAggregateExec( ctx.addMutableState(fastHashMapClassName, fastHashMapTerm, s"$fastHashMapTerm = new $fastHashMapClassName();") - ctx.addMutableState( - s"java.util.Iterator<${classOf[ColumnarRow].getName}>", - iterTermForFastHashMap) + ctx.addMutableState(s"java.util.Iterator", iterTermForFastHashMap) } else { val generatedMap = new RowBasedHashMapGenerator(ctx, aggregateExpressions, fastHashMapClassName, groupingKeySchema, bufferSchema).generate() @@ -674,7 +672,7 @@ case class HashAggregateExec( """.stripMargin } - // Iterate over the aggregate rows and convert them from ColumnarRow to UnsafeRow + // Iterate over the aggregate rows and convert them from InternalRow to UnsafeRow def outputFromVectorizedMap: String = { val row = ctx.freshName("fastHashMapRow") ctx.currentVars = null @@ -687,10 +685,9 @@ case class HashAggregateExec( bufferSchema.toAttributes.zipWithIndex.map { case (attr, i) => BoundReference(groupingKeySchema.length + i, attr.dataType, attr.nullable) }) - val columnarRowCls = classOf[ColumnarRow].getName s""" |while ($iterTermForFastHashMap.hasNext()) { - | $columnarRowCls $row = ($columnarRowCls) $iterTermForFastHashMap.next(); + | InternalRow $row = (InternalRow) $iterTermForFastHashMap.next(); | ${generateKeyRow.code} | ${generateBufferRow.code} | $outputFunc(${generateKeyRow.value}, ${generateBufferRow.value}); diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala index 44ba539ebf7c2..f04cd48072f17 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala @@ -17,9 +17,10 @@ package org.apache.spark.sql.execution.aggregate +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext -import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, ColumnarRow, MutableColumnarRow, OnHeapColumnVector} +import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, MutableColumnarRow, OnHeapColumnVector} import org.apache.spark.sql.types._ /** @@ -231,7 +232,7 @@ class VectorizedHashMapGenerator( protected def generateRowIterator(): String = { s""" - |public java.util.Iterator<${classOf[ColumnarRow].getName}> rowIterator() { + |public java.util.Iterator<${classOf[InternalRow].getName}> rowIterator() { | batch.setNumRows(numRows); | return batch.rowIterator(); |} diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 0ae4f2d117609..c9c6bee513b53 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -751,11 +751,6 @@ class ColumnarBatchSuite extends SparkFunSuite { c2.putDouble(1, 5.67) val s = column.getStruct(0) - assert(s.columns()(0).getInt(0) == 123) - assert(s.columns()(0).getInt(1) == 456) - assert(s.columns()(1).getDouble(0) == 3.45) - assert(s.columns()(1).getDouble(1) == 5.67) - assert(s.getInt(0) == 123) assert(s.getDouble(1) == 3.45) From ea2fbf41973e62b28b9d5fa772512ddba5c00d96 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 7 Dec 2017 20:55:35 +0800 Subject: [PATCH 060/356] [SPARK-22705][SQL] Case, Coalesce, and In use less global variables ## What changes were proposed in this pull request? This PR accomplishes the following two items. 1. Reduce # of global variables from two to one for generated code of `Case` and `Coalesce` and remove global variables for generated code of `In`. 2. Make lifetime of global variable local within an operation Item 1. reduces # of constant pool entries in a Java class. Item 2. ensures that an variable is not passed to arguments in a method split by `CodegenContext.splitExpressions()`, which is addressed by #19865. ## How was this patch tested? Added new tests into `PredicateSuite`, `NullExpressionsSuite`, and `ConditionalExpressionSuite`. Author: Kazuaki Ishizaki Closes #19901 from kiszk/SPARK-22705. --- .../expressions/conditionalExpressions.scala | 70 +++++++++++-------- .../expressions/nullExpressions.scala | 19 +++-- .../sql/catalyst/expressions/predicates.scala | 31 +++++--- .../ConditionalExpressionSuite.scala | 7 ++ .../expressions/NullExpressionsSuite.scala | 7 ++ .../catalyst/expressions/PredicateSuite.scala | 6 ++ 6 files changed, 91 insertions(+), 49 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala index ae5f7140847db..53c3b226895ec 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala @@ -180,13 +180,18 @@ case class CaseWhen( } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - // This variable represents whether the first successful condition is met or not. - // It is initialized to `false` and it is set to `true` when the first condition which - // evaluates to `true` is met and therefore is not needed to go on anymore on the computation - // of the following conditions. - val conditionMet = ctx.freshName("caseWhenConditionMet") - ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.isNull) - ctx.addMutableState(ctx.javaType(dataType), ev.value) + // This variable holds the state of the result: + // -1 means the condition is not met yet and the result is unknown. + val NOT_MATCHED = -1 + // 0 means the condition is met and result is not null. + val HAS_NONNULL = 0 + // 1 means the condition is met and result is null. 
+ val HAS_NULL = 1 + // It is initialized to `NOT_MATCHED`, and if it's set to `HAS_NULL` or `HAS_NONNULL`, + // We won't go on anymore on the computation. + val resultState = ctx.freshName("caseWhenResultState") + val tmpResult = ctx.freshName("caseWhenTmpResult") + ctx.addMutableState(ctx.javaType(dataType), tmpResult) // these blocks are meant to be inside a // do { @@ -200,9 +205,8 @@ case class CaseWhen( |${cond.code} |if (!${cond.isNull} && ${cond.value}) { | ${res.code} - | ${ev.isNull} = ${res.isNull}; - | ${ev.value} = ${res.value}; - | $conditionMet = true; + | $resultState = (byte)(${res.isNull} ? $HAS_NULL : $HAS_NONNULL); + | $tmpResult = ${res.value}; | continue; |} """.stripMargin @@ -212,59 +216,63 @@ case class CaseWhen( val res = elseExpr.genCode(ctx) s""" |${res.code} - |${ev.isNull} = ${res.isNull}; - |${ev.value} = ${res.value}; + |$resultState = (byte)(${res.isNull} ? $HAS_NULL : $HAS_NONNULL); + |$tmpResult = ${res.value}; """.stripMargin } val allConditions = cases ++ elseCode // This generates code like: - // conditionMet = caseWhen_1(i); - // if(conditionMet) { + // caseWhenResultState = caseWhen_1(i); + // if(caseWhenResultState != -1) { // continue; // } - // conditionMet = caseWhen_2(i); - // if(conditionMet) { + // caseWhenResultState = caseWhen_2(i); + // if(caseWhenResultState != -1) { // continue; // } // ... // and the declared methods are: - // private boolean caseWhen_1234() { - // boolean conditionMet = false; + // private byte caseWhen_1234() { + // byte caseWhenResultState = -1; // do { // // here the evaluation of the conditions // } while (false); - // return conditionMet; + // return caseWhenResultState; // } val codes = ctx.splitExpressionsWithCurrentInputs( expressions = allConditions, funcName = "caseWhen", - returnType = ctx.JAVA_BOOLEAN, + returnType = ctx.JAVA_BYTE, makeSplitFunction = func => s""" - |${ctx.JAVA_BOOLEAN} $conditionMet = false; + |${ctx.JAVA_BYTE} $resultState = $NOT_MATCHED; |do { | $func |} while (false); - |return $conditionMet; + |return $resultState; """.stripMargin, foldFunctions = _.map { funcCall => s""" - |$conditionMet = $funcCall; - |if ($conditionMet) { + |$resultState = $funcCall; + |if ($resultState != $NOT_MATCHED) { | continue; |} """.stripMargin }.mkString) - ev.copy(code = s""" - ${ev.isNull} = true; - ${ev.value} = ${ctx.defaultValue(dataType)}; - ${ctx.JAVA_BOOLEAN} $conditionMet = false; - do { - $codes - } while (false);""") + ev.copy(code = + s""" + |${ctx.JAVA_BYTE} $resultState = $NOT_MATCHED; + |$tmpResult = ${ctx.defaultValue(dataType)}; + |do { + | $codes + |} while (false); + |// TRUE if any condition is met and the result is null, or no any condition is met. 
+ |final boolean ${ev.isNull} = ($resultState != $HAS_NONNULL); + |final ${ctx.javaType(dataType)} ${ev.value} = $tmpResult; + """.stripMargin) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala index 26c9a41efc9f9..294cdcb2e9546 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala @@ -72,8 +72,8 @@ case class Coalesce(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.isNull) - ctx.addMutableState(ctx.javaType(dataType), ev.value) + val tmpIsNull = ctx.freshName("coalesceTmpIsNull") + ctx.addMutableState(ctx.JAVA_BOOLEAN, tmpIsNull) // all the evals are meant to be in a do { ... } while (false); loop val evals = children.map { e => @@ -81,26 +81,30 @@ case class Coalesce(children: Seq[Expression]) extends Expression { s""" |${eval.code} |if (!${eval.isNull}) { - | ${ev.isNull} = false; + | $tmpIsNull = false; | ${ev.value} = ${eval.value}; | continue; |} """.stripMargin } + val resultType = ctx.javaType(dataType) val codes = ctx.splitExpressionsWithCurrentInputs( expressions = evals, funcName = "coalesce", + returnType = resultType, makeSplitFunction = func => s""" + |$resultType ${ev.value} = ${ctx.defaultValue(dataType)}; |do { | $func |} while (false); + |return ${ev.value}; """.stripMargin, foldFunctions = _.map { funcCall => s""" - |$funcCall; - |if (!${ev.isNull}) { + |${ev.value} = $funcCall; + |if (!$tmpIsNull) { | continue; |} """.stripMargin @@ -109,11 +113,12 @@ case class Coalesce(children: Seq[Expression]) extends Expression { ev.copy(code = s""" - |${ev.isNull} = true; - |${ev.value} = ${ctx.defaultValue(dataType)}; + |$tmpIsNull = true; + |$resultType ${ev.value} = ${ctx.defaultValue(dataType)}; |do { | $codes |} while (false); + |final boolean ${ev.isNull} = $tmpIsNull; """.stripMargin) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index a42dd7ecf57de..8eb41addaf689 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -237,8 +237,14 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { val javaDataType = ctx.javaType(value.dataType) val valueGen = value.genCode(ctx) val listGen = list.map(_.genCode(ctx)) - ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.value) - ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.isNull) + // inTmpResult has 3 possible values: + // -1 means no matches found and there is at least one value in the list evaluated to null + val HAS_NULL = -1 + // 0 means no matches found and all values in the list are not null + val NOT_MATCHED = 0 + // 1 means one value in the list is matched + val MATCHED = 1 + val tmpResult = ctx.freshName("inTmpResult") val valueArg = ctx.freshName("valueArg") // All the blocks are meant to be inside a do { ... } while (false); loop. // The evaluation of variables can be stopped when we find a matching value. 
@@ -246,10 +252,9 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { s""" |${x.code} |if (${x.isNull}) { - | ${ev.isNull} = true; + | $tmpResult = $HAS_NULL; // ${ev.isNull} = true; |} else if (${ctx.genEqual(value.dataType, valueArg, x.value)}) { - | ${ev.isNull} = false; - | ${ev.value} = true; + | $tmpResult = $MATCHED; // ${ev.isNull} = false; ${ev.value} = true; | continue; |} """.stripMargin) @@ -257,17 +262,19 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { val codes = ctx.splitExpressionsWithCurrentInputs( expressions = listCode, funcName = "valueIn", - extraArguments = (javaDataType, valueArg) :: Nil, + extraArguments = (javaDataType, valueArg) :: (ctx.JAVA_BYTE, tmpResult) :: Nil, + returnType = ctx.JAVA_BYTE, makeSplitFunction = body => s""" |do { | $body |} while (false); + |return $tmpResult; """.stripMargin, foldFunctions = _.map { funcCall => s""" - |$funcCall; - |if (${ev.value}) { + |$tmpResult = $funcCall; + |if ($tmpResult == $MATCHED) { | continue; |} """.stripMargin @@ -276,14 +283,16 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { ev.copy(code = s""" |${valueGen.code} - |${ev.value} = false; - |${ev.isNull} = ${valueGen.isNull}; - |if (!${ev.isNull}) { + |byte $tmpResult = $HAS_NULL; + |if (!${valueGen.isNull}) { + | $tmpResult = 0; | $javaDataType $valueArg = ${valueGen.value}; | do { | $codes | } while (false); |} + |final boolean ${ev.isNull} = ($tmpResult == $HAS_NULL); + |final boolean ${ev.value} = ($tmpResult == $MATCHED); """.stripMargin) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala index 3e11c3d2d4fe3..60d84aae1fa3f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.types._ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { @@ -145,4 +146,10 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper IndexedSeq((Literal(12) === Literal(1), Literal(42)), (Literal(12) === Literal(42), Literal(1)))) } + + test("SPARK-22705: case when should use less global variables") { + val ctx = new CodegenContext() + CaseWhen(Seq((Literal.create(false, BooleanType), Literal(1))), Literal(-1)).genCode(ctx) + assert(ctx.mutableStates.size == 1) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala index 40ef7770da33f..a23cd95632770 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer +import 
org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project} import org.apache.spark.sql.types._ @@ -155,6 +156,12 @@ class NullExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Coalesce(inputs), "x_1") } + test("SPARK-22705: Coalesce should use less global variables") { + val ctx = new CodegenContext() + Coalesce(Seq(Literal("a"), Literal("b"))).genCode(ctx) + assert(ctx.mutableStates.size == 1) + } + test("AtLeastNNonNulls should not throw 64kb exception") { val inputs = (1 to 4000).map(x => Literal(s"x_$x")) checkEvaluation(AtLeastNNonNulls(1, inputs), true) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 95a0dfa057563..15cb0bea08f17 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -246,6 +246,12 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(In(Literal(1.0D), sets), true) } + test("SPARK-22705: In should use less global variables") { + val ctx = new CodegenContext() + In(Literal(1.0D), Seq(Literal(1.0D), Literal(2.0D))).genCode(ctx) + assert(ctx.mutableStates.isEmpty) + } + test("INSET") { val hS = HashSet[Any]() + 1 + 2 val nS = HashSet[Any]() + 1 + 2 + null From 2be448260d8115969e3d89b913c733943f151915 Mon Sep 17 00:00:00 2001 From: Sunitha Kambhampati Date: Thu, 7 Dec 2017 20:59:47 +0800 Subject: [PATCH 061/356] [SPARK-22452][SQL] Add getInt, getLong, getBoolean to DataSourceV2Options - Implemented methods getInt, getLong, getBoolean for DataSourceV2Options - Added new unit tests to exercise these methods Author: Sunitha Kambhampati Closes #19902 from skambha/spark22452. --- .../sql/sources/v2/DataSourceV2Options.java | 31 +++++++++++++++++++ .../sources/v2/DataSourceV2OptionsSuite.scala | 31 +++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java index 9a89c8193dd6e..b2c908dc73a61 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java @@ -49,4 +49,35 @@ public DataSourceV2Options(Map originalMap) { public Optional get(String key) { return Optional.ofNullable(keyLowerCasedMap.get(toLowerCase(key))); } + + /** + * Returns the boolean value to which the specified key is mapped, + * or defaultValue if there is no mapping for the key. The key match is case-insensitive + */ + public boolean getBoolean(String key, boolean defaultValue) { + String lcaseKey = toLowerCase(key); + return keyLowerCasedMap.containsKey(lcaseKey) ? + Boolean.parseBoolean(keyLowerCasedMap.get(lcaseKey)) : defaultValue; + } + + /** + * Returns the integer value to which the specified key is mapped, + * or defaultValue if there is no mapping for the key. The key match is case-insensitive + */ + public int getInt(String key, int defaultValue) { + String lcaseKey = toLowerCase(key); + return keyLowerCasedMap.containsKey(lcaseKey) ? 
+ Integer.parseInt(keyLowerCasedMap.get(lcaseKey)) : defaultValue; + } + + /** + * Returns the long value to which the specified key is mapped, + * or defaultValue if there is no mapping for the key. The key match is case-insensitive + */ + public long getLong(String key, long defaultValue) { + String lcaseKey = toLowerCase(key); + return keyLowerCasedMap.containsKey(lcaseKey) ? + Long.parseLong(keyLowerCasedMap.get(lcaseKey)) : defaultValue; + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala index 933f4075bcc8a..752d3c193cc74 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala @@ -37,4 +37,35 @@ class DataSourceV2OptionsSuite extends SparkFunSuite { val options = new DataSourceV2Options(Map("foo" -> "bAr").asJava) assert(options.get("foo").get == "bAr") } + + test("getInt") { + val options = new DataSourceV2Options(Map("numFOo" -> "1", "foo" -> "bar").asJava) + assert(options.getInt("numFOO", 10) == 1) + assert(options.getInt("numFOO2", 10) == 10) + + intercept[NumberFormatException]{ + options.getInt("foo", 1) + } + } + + test("getBoolean") { + val options = new DataSourceV2Options( + Map("isFoo" -> "true", "isFOO2" -> "false", "foo" -> "bar").asJava) + assert(options.getBoolean("isFoo", false)) + assert(!options.getBoolean("isFoo2", true)) + assert(options.getBoolean("isBar", true)) + assert(!options.getBoolean("isBar", false)) + assert(!options.getBoolean("FOO", true)) + } + + test("getLong") { + val options = new DataSourceV2Options(Map("numFoo" -> "9223372036854775807", + "foo" -> "bar").asJava) + assert(options.getLong("numFOO", 0L) == 9223372036854775807L) + assert(options.getLong("numFoo2", -1L) == -1L) + + intercept[NumberFormatException]{ + options.getLong("foo", 0L) + } + } } From beb717f64802de03836a0fed8cce4d813be1769a Mon Sep 17 00:00:00 2001 From: Brad Kaiser Date: Thu, 7 Dec 2017 21:04:09 +0800 Subject: [PATCH 062/356] [SPARK-22618][CORE] Catch exception in removeRDD to stop jobs from dying ## What changes were proposed in this pull request? I propose that BlockManagerMasterEndpoint.removeRdd() should catch and log any IOExceptions it receives. As it is now, the exception can bubble up to the main thread and kill user applications when called from RDD.unpersist(). I think this change is a better experience for the end user. I chose to catch the exception in BlockManagerMasterEndpoint.removeRdd() instead of RDD.unpersist() because this way the RDD.unpersist() blocking option will still work correctly. Otherwise, blocking will get short circuited by the first error. ## How was this patch tested? This patch was tested with a job that shows the job killing behavior mentioned above. rxin, it looks like you originally wrote this method, I would appreciate it if you took a look. Thanks. This contribution is my original work and is licensed under the project's open source license. Author: Brad Kaiser Closes #19836 from brad-kaiser/catch-unpersist-exception. 
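The core of the fix, distilled into a self-contained sketch (plain Scala futures, not the Spark code itself; `askSlave` and the object name are stand-ins): every per-block-manager ask recovers from `IOException` with a count of 0 removed blocks, so `Future.sequence` still completes and a blocking `unpersist()` keeps its semantics instead of dying on the first unreachable executor.
```scala
import java.io.IOException
import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._

object RemoveRddRecoverSketch {
  // Stand-in for slaveEndpoint.ask[Int](RemoveRdd(rddId)); slave 2 is unreachable.
  def askSlave(id: Int): Future[Int] =
    if (id == 2) Future.failed(new IOException(s"slave $id unreachable"))
    else Future.successful(10)

  def main(args: Array[String]): Unit = {
    val futures = (1 to 3).map { id =>
      askSlave(id).recover {
        case e: IOException =>
          println(s"Error trying to remove RDD on slave $id: ${e.getMessage}")
          0 // zero blocks were removed
      }
    }
    val removed = Await.result(Future.sequence(futures), 10.seconds)
    println(s"Blocks removed per slave: $removed")
  }
}
```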
--- .../storage/BlockManagerMasterEndpoint.scala | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala index 56d0266b8edad..89a6a71a589a1 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMasterEndpoint.scala @@ -17,6 +17,7 @@ package org.apache.spark.storage +import java.io.IOException import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ @@ -159,11 +160,16 @@ class BlockManagerMasterEndpoint( // Ask the slaves to remove the RDD, and put the result in a sequence of Futures. // The dispatcher is used as an implicit argument into the Future sequence construction. val removeMsg = RemoveRdd(rddId) - Future.sequence( - blockManagerInfo.values.map { bm => - bm.slaveEndpoint.ask[Int](removeMsg) - }.toSeq - ) + + val futures = blockManagerInfo.values.map { bm => + bm.slaveEndpoint.ask[Int](removeMsg).recover { + case e: IOException => + logWarning(s"Error trying to remove RDD $rddId", e) + 0 // zero blocks were removed + } + }.toSeq + + Future.sequence(futures) } private def removeShuffle(shuffleId: Int): Future[Seq[Boolean]] = { From dd59a4be3641d0029ba6b1759837e1d1b9f8a840 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 7 Dec 2017 21:08:15 +0800 Subject: [PATCH 063/356] [SPARK-22712][SQL] Use `buildReaderWithPartitionValues` in native OrcFileFormat ## What changes were proposed in this pull request? To support vectorization in native OrcFileFormat later, we need to use `buildReaderWithPartitionValues` instead of `buildReader` like ParquetFileFormat. This PR replaces `buildReader` with `buildReaderWithPartitionValues`. ## How was this patch tested? Pass the Jenkins with the existing test cases. Author: Dongjoon Hyun Closes #19907 from dongjoon-hyun/SPARK-ORC-BUILD-READER. 
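For context, a minimal sketch (the object name is illustrative; it assumes the catalyst internals the diff relies on: `JoinedRow`, `UnsafeProjection.create` over attribute sequences, and `StructType.toAttributes`) of what `buildReaderWithPartitionValues` does when the partition schema is non-empty: the constant partition-values row is glued onto every deserialized data row and the pair is flattened into one `UnsafeRow`.
```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{JoinedRow, UnsafeProjection}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

object PartitionValuesJoinSketch {
  def main(args: Array[String]): Unit = {
    val requiredSchema = new StructType().add("intField", IntegerType)
    val partitionSchema = new StructType().add("p", StringType)

    // Project over requiredSchema ++ partitionSchema, as the reader now does.
    val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes
    val unsafeProjection = UnsafeProjection.create(fullSchema, fullSchema)

    val joinedRow = new JoinedRow()
    val dataRow = InternalRow(1)                                  // deserialized ORC row
    val partitionValues = InternalRow(UTF8String.fromString("a")) // constant per file
    val result = unsafeProjection(joinedRow(dataRow, partitionValues))
    println(s"${result.getInt(0)}, ${result.getUTF8String(1)}")
  }
}
```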
--- .../execution/datasources/orc/OrcFileFormat.scala | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index 75c42213db3c8..f7471cd7debce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -36,6 +36,7 @@ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ @@ -124,7 +125,7 @@ class OrcFileFormat true } - override def buildReader( + override def buildReaderWithPartitionValues( sparkSession: SparkSession, dataSchema: StructType, partitionSchema: StructType, @@ -167,9 +168,17 @@ class OrcFileFormat val iter = new RecordReaderIterator[OrcStruct](orcRecordReader) Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) - val unsafeProjection = UnsafeProjection.create(requiredSchema) + val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes + val unsafeProjection = GenerateUnsafeProjection.generate(fullSchema, fullSchema) val deserializer = new OrcDeserializer(dataSchema, requiredSchema, requestedColIds) - iter.map(value => unsafeProjection(deserializer.deserialize(value))) + + if (partitionSchema.length == 0) { + iter.map(value => unsafeProjection(deserializer.deserialize(value))) + } else { + val joinedRow = new JoinedRow() + iter.map(value => + unsafeProjection(joinedRow(deserializer.deserialize(value), file.partitionValues))) + } } } } From fc294463007c184715d604c718c0e913c83969f3 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Thu, 7 Dec 2017 21:18:27 +0800 Subject: [PATCH 064/356] [SPARK-22699][SQL] GenerateSafeProjection should not use global variables for struct ## What changes were proposed in this pull request? GenerateSafeProjection is defining a mutable state for each struct, which is not needed. This is bad for the well known issues related to constant pool limits. The PR replace the global variable with a local one. ## How was this patch tested? added UT Author: Marco Gaido Closes #19914 from mgaido91/SPARK-22699. 
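As a quick way to observe the effect, a sketch along the lines of the new unit test (same internal codegen entry point; the object name is made up for illustration): however many struct columns the projection handles, the generated class should keep only the reference to `this`, the references array and the mutable row as fields.
```scala
import org.apache.spark.sql.catalyst.expressions.BoundReference
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection
import org.apache.spark.sql.types._

object SafeProjectionFieldCountSketch {
  def main(args: Array[String]): Unit = {
    // A 100-column struct; before this change each struct conversion added its own
    // global Object[] to the generated class.
    val wideStruct = (1 to 100).foldLeft(new StructType()) {
      (st, i) => st.add(s"c$i", IntegerType)
    }
    val proj = GenerateSafeProjection.generate(
      Seq(BoundReference(0, wideStruct, nullable = true)))
    // Expected to stay at 3: `this`, the references object and the mutableRow.
    println(proj.getClass.getDeclaredFields.length)
  }
}
```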
--- .../codegen/GenerateSafeProjection.scala | 18 ++++++++---------- .../codegen/GeneratedProjectionSuite.scala | 11 +++++++++++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index 44e7148e5d98f..3dcbb518ba42a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -49,8 +49,6 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] val tmpInput = ctx.freshName("tmpInput") val output = ctx.freshName("safeRow") val values = ctx.freshName("values") - // These expressions could be split into multiple functions - ctx.addMutableState("Object[]", values, s"$values = null;") val rowClass = classOf[GenericInternalRow].getName @@ -66,15 +64,15 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] val allFields = ctx.splitExpressions( expressions = fieldWriters, funcName = "writeFields", - arguments = Seq("InternalRow" -> tmpInput) + arguments = Seq("InternalRow" -> tmpInput, "Object[]" -> values) ) - val code = s""" - final InternalRow $tmpInput = $input; - $values = new Object[${schema.length}]; - $allFields - final InternalRow $output = new $rowClass($values); - $values = null; - """ + val code = + s""" + |final InternalRow $tmpInput = $input; + |final Object[] $values = new Object[${schema.length}]; + |$allFields + |final InternalRow $output = new $rowClass($values); + """.stripMargin ExprCode(code, "false", output) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala index 0cd0d8859145f..6031bdf19e957 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala @@ -208,4 +208,15 @@ class GeneratedProjectionSuite extends SparkFunSuite { unsafeProj.apply(InternalRow(InternalRow(UTF8String.fromString("b")))) assert(row.getStruct(0, 1).getString(0).toString == "a") } + + test("SPARK-22699: GenerateSafeProjection should not use global variables for struct") { + val safeProj = GenerateSafeProjection.generate( + Seq(BoundReference(0, new StructType().add("i", IntegerType), true))) + val globalVariables = safeProj.getClass.getDeclaredFields + // We need always 3 variables: + // - one is a reference to this + // - one is the references object + // - one is the mutableRow + assert(globalVariables.length == 3) + } } From b79071910e61a851931b8b96d4acb66e0576caa7 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Thu, 7 Dec 2017 21:24:36 +0800 Subject: [PATCH 065/356] [SPARK-22696][SQL] objects functions should not use unneeded global variables ## What changes were proposed in this pull request? Some objects functions are using global variables which are not needed. This can generate some unneeded entries in the constant pool. The PR replaces the unneeded global variables with local variables. ## How was this patch tested? 
added UTs Author: Marco Gaido Author: Marco Gaido Closes #19908 from mgaido91/SPARK-22696. --- .../expressions/objects/objects.scala | 61 +++++++++++-------- .../expressions/CodeGenerationSuite.scala | 16 ++++- 2 files changed, 49 insertions(+), 28 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 730b2ff96da6c..349afece84d5c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -1106,27 +1106,31 @@ case class CreateExternalRow(children: Seq[Expression], schema: StructType) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val rowClass = classOf[GenericRowWithSchema].getName val values = ctx.freshName("values") - ctx.addMutableState("Object[]", values) val childrenCodes = children.zipWithIndex.map { case (e, i) => val eval = e.genCode(ctx) - eval.code + s""" - if (${eval.isNull}) { - $values[$i] = null; - } else { - $values[$i] = ${eval.value}; - } - """ + s""" + |${eval.code} + |if (${eval.isNull}) { + | $values[$i] = null; + |} else { + | $values[$i] = ${eval.value}; + |} + """.stripMargin } - val childrenCode = ctx.splitExpressionsWithCurrentInputs(childrenCodes) - val schemaField = ctx.addReferenceObj("schema", schema) + val childrenCode = ctx.splitExpressionsWithCurrentInputs( + expressions = childrenCodes, + funcName = "createExternalRow", + extraArguments = "Object[]" -> values :: Nil) + val schemaField = ctx.addReferenceMinorObj(schema) - val code = s""" - $values = new Object[${children.size}]; - $childrenCode - final ${classOf[Row].getName} ${ev.value} = new $rowClass($values, $schemaField); - """ + val code = + s""" + |Object[] $values = new Object[${children.size}]; + |$childrenCode + |final ${classOf[Row].getName} ${ev.value} = new $rowClass($values, $schemaField); + """.stripMargin ev.copy(code = code, isNull = "false") } } @@ -1244,25 +1248,28 @@ case class InitializeJavaBean(beanInstance: Expression, setters: Map[String, Exp val javaBeanInstance = ctx.freshName("javaBean") val beanInstanceJavaType = ctx.javaType(beanInstance.dataType) - ctx.addMutableState(beanInstanceJavaType, javaBeanInstance) val initialize = setters.map { case (setterMethod, fieldValue) => val fieldGen = fieldValue.genCode(ctx) s""" - ${fieldGen.code} - ${javaBeanInstance}.$setterMethod(${fieldGen.value}); - """ + |${fieldGen.code} + |$javaBeanInstance.$setterMethod(${fieldGen.value}); + """.stripMargin } - val initializeCode = ctx.splitExpressionsWithCurrentInputs(initialize.toSeq) + val initializeCode = ctx.splitExpressionsWithCurrentInputs( + expressions = initialize.toSeq, + funcName = "initializeJavaBean", + extraArguments = beanInstanceJavaType -> javaBeanInstance :: Nil) - val code = s""" - ${instanceGen.code} - ${javaBeanInstance} = ${instanceGen.value}; - if (!${instanceGen.isNull}) { - $initializeCode - } - """ + val code = + s""" + |${instanceGen.code} + |$beanInstanceJavaType $javaBeanInstance = ${instanceGen.value}; + |if (!${instanceGen.isNull}) { + | $initializeCode + |} + """.stripMargin ev.copy(code = code, isNull = instanceGen.isNull, value = instanceGen.value) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index a4198f826cedb..40bf29bb3b573 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen._ -import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, CreateExternalRow, GetExternalRowField, ValidateExternalType} +import org.apache.spark.sql.catalyst.expressions.objects._ import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -380,4 +380,18 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { s"Incorrect Evaluation: expressions: $exprAnd, actual: $actualAnd, expected: $expectedAnd") } } + + test("SPARK-22696: CreateExternalRow should not use global variables") { + val ctx = new CodegenContext + val schema = new StructType().add("a", IntegerType).add("b", StringType) + CreateExternalRow(Seq(Literal(1), Literal("x")), schema).genCode(ctx) + assert(ctx.mutableStates.isEmpty) + } + + test("SPARK-22696: InitializeJavaBean should not use global variables") { + val ctx = new CodegenContext + InitializeJavaBean(Literal.fromObject(new java.util.LinkedList[Int]), + Map("add" -> Literal(1))).genCode(ctx) + assert(ctx.mutableStates.isEmpty) + } } From f41c0a93fd3913ad93e55ddbfd875229872ecc97 Mon Sep 17 00:00:00 2001 From: kellyzly Date: Thu, 7 Dec 2017 10:04:04 -0600 Subject: [PATCH 066/356] [SPARK-22660][BUILD] Use position() and limit() to fix ambiguity issue in scala-2.12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …a-2.12 and JDK9 ## What changes were proposed in this pull request? Some compile error after upgrading to scala-2.12 ```javascript spark_source/core/src/main/scala/org/apache/spark/executor/Executor.scala:455: ambiguous reference to overloaded definition, method limit in class ByteBuffer of type (x$1: Int)java.nio.ByteBuffer method limit in class Buffer of type ()Int match expected type ? val resultSize = serializedDirectResult.limit error ``` The limit method was moved from ByteBuffer to the superclass Buffer and it can no longer be called without (). The same reason for position method. ```javascript /home/zly/prj/oss/jdk9_HOS_SOURCE/spark_source/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala:427: ambiguous reference to overloaded definition, [error] both method putAll in class Properties of type (x$1: java.util.Map[_, _])Unit [error] and method putAll in class Hashtable of type (x$1: java.util.Map[_ <: Object, _ <: Object])Unit [error] match argument types (java.util.Map[String,String]) [error] props.putAll(outputSerdeProps.toMap.asJava) [error] ^ ``` This is because the key type is Object instead of String which is unsafe. ## How was this patch tested? running tests Please review http://spark.apache.org/contributing.html before opening a pull request. Author: kellyzly Closes #19854 from kellyzly/SPARK-22660. 
--- .../org/apache/spark/broadcast/TorrentBroadcast.scala | 3 ++- .../main/scala/org/apache/spark/executor/Executor.scala | 2 +- .../scheduler/cluster/CoarseGrainedSchedulerBackend.scala | 4 ++-- .../org/apache/spark/util/ByteBufferInputStream.scala | 2 +- .../org/apache/spark/util/io/ChunkedByteBuffer.scala | 4 ++-- .../org/apache/spark/serializer/KryoSerializerSuite.scala | 2 +- .../scala/org/apache/spark/storage/DiskStoreSuite.scala | 2 +- .../org/apache/spark/sql/kafka010/KafkaTestUtils.scala | 4 +++- .../sql/execution/columnar/NullableColumnAccessor.scala | 2 +- .../columnar/compression/compressionSchemes.scala | 2 +- .../sql/hive/execution/ScriptTransformationExec.scala | 8 ++++++-- .../apache/spark/streaming/dstream/RawInputDStream.scala | 2 +- 12 files changed, 22 insertions(+), 15 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala index 67e993c7f02e2..7aecd3c9668ea 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala @@ -99,7 +99,8 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) private def calcChecksum(block: ByteBuffer): Int = { val adler = new Adler32() if (block.hasArray) { - adler.update(block.array, block.arrayOffset + block.position, block.limit - block.position) + adler.update(block.array, block.arrayOffset + block.position(), block.limit() + - block.position()) } else { val bytes = new Array[Byte](block.remaining()) block.duplicate.get(bytes) diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index e3e555eaa0277..af0a0ab656564 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -452,7 +452,7 @@ private[spark] class Executor( // TODO: do not serialize value twice val directResult = new DirectTaskResult(valueBytes, accumUpdates) val serializedDirectResult = ser.serialize(directResult) - val resultSize = serializedDirectResult.limit + val resultSize = serializedDirectResult.limit() // directSend = sending directly back to the driver val serializedResult: ByteBuffer = { diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index e6982f333d521..4d75063fbf1c5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -287,13 +287,13 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp private def launchTasks(tasks: Seq[Seq[TaskDescription]]) { for (task <- tasks.flatten) { val serializedTask = TaskDescription.encode(task) - if (serializedTask.limit >= maxRpcMessageSize) { + if (serializedTask.limit() >= maxRpcMessageSize) { scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr => try { var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " + "spark.rpc.message.maxSize (%d bytes). Consider increasing " + "spark.rpc.message.maxSize or using broadcast variables for large values." 
- msg = msg.format(task.taskId, task.index, serializedTask.limit, maxRpcMessageSize) + msg = msg.format(task.taskId, task.index, serializedTask.limit(), maxRpcMessageSize) taskSetMgr.abort(msg) } catch { case e: Exception => logError("Exception in error callback", e) diff --git a/core/src/main/scala/org/apache/spark/util/ByteBufferInputStream.scala b/core/src/main/scala/org/apache/spark/util/ByteBufferInputStream.scala index a938cb07724c7..a5ee0ff16b5df 100644 --- a/core/src/main/scala/org/apache/spark/util/ByteBufferInputStream.scala +++ b/core/src/main/scala/org/apache/spark/util/ByteBufferInputStream.scala @@ -54,7 +54,7 @@ class ByteBufferInputStream(private var buffer: ByteBuffer) override def skip(bytes: Long): Long = { if (buffer != null) { val amountToSkip = math.min(bytes, buffer.remaining).toInt - buffer.position(buffer.position + amountToSkip) + buffer.position(buffer.position() + amountToSkip) if (buffer.remaining() == 0) { cleanUp() } diff --git a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala index c28570fb24560..7367af7888bd8 100644 --- a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala +++ b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBuffer.scala @@ -65,7 +65,7 @@ private[spark] class ChunkedByteBuffer(var chunks: Array[ByteBuffer]) { for (bytes <- getChunks()) { while (bytes.remaining() > 0) { val ioSize = Math.min(bytes.remaining(), bufferWriteChunkSize) - bytes.limit(bytes.position + ioSize) + bytes.limit(bytes.position() + ioSize) channel.write(bytes) } } @@ -206,7 +206,7 @@ private[spark] class ChunkedByteBufferInputStream( override def skip(bytes: Long): Long = { if (currentChunk != null) { val amountToSkip = math.min(bytes, currentChunk.remaining).toInt - currentChunk.position(currentChunk.position + amountToSkip) + currentChunk.position(currentChunk.position() + amountToSkip) if (currentChunk.remaining() == 0) { if (chunks.hasNext) { currentChunk = chunks.next() diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala index eaec098b8d785..fc78655bf52ec 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerSuite.scala @@ -199,7 +199,7 @@ class KryoSerializerSuite extends SparkFunSuite with SharedSparkContext { def check[T: ClassTag](t: T) { assert(ser.deserialize[T](ser.serialize(t)) === t) // Check that very long ranges don't get written one element at a time - assert(ser.serialize(t).limit < 100) + assert(ser.serialize(t).limit() < 100) } check(1 to 1000000) check(1 to 1000000 by 2) diff --git a/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala index 7258fdf5efc0d..efdd02fff7871 100644 --- a/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala @@ -118,7 +118,7 @@ class DiskStoreSuite extends SparkFunSuite { val chunks = chunkedByteBuffer.chunks assert(chunks.size === 2) for (chunk <- chunks) { - assert(chunk.limit === 10 * 1024) + assert(chunk.limit() === 10 * 1024) } val e = intercept[IllegalArgumentException]{ diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala 
b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala index 2df8352f48660..08db4d827e400 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala @@ -296,7 +296,9 @@ class KafkaTestUtils(withBrokerProps: Map[String, Object] = Map.empty) extends L props.put("replica.socket.timeout.ms", "1500") props.put("delete.topic.enable", "true") props.put("offsets.topic.num.partitions", "1") - props.putAll(withBrokerProps.asJava) + // Can not use properties.putAll(propsMap.asJava) in scala-2.12 + // See https://github.com/scala/bug/issues/10418 + withBrokerProps.foreach { case (k, v) => props.put(k, v) } props } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessor.scala index 2f09757aa341c..341ade1a5c613 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/NullableColumnAccessor.scala @@ -35,7 +35,7 @@ private[columnar] trait NullableColumnAccessor extends ColumnAccessor { nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 pos = 0 - underlyingBuffer.position(underlyingBuffer.position + 4 + nullCount * 4) + underlyingBuffer.position(underlyingBuffer.position() + 4 + nullCount * 4) super.initialize() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala index bf00ad997c76e..79dcf3a6105ce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/compression/compressionSchemes.scala @@ -112,7 +112,7 @@ private[columnar] case object PassThrough extends CompressionScheme { var nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else capacity var pos = 0 var seenNulls = 0 - var bufferPos = buffer.position + var bufferPos = buffer.position() while (pos < capacity) { if (pos != nextNullIndex) { val len = nextNullIndex - pos diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala index d786a610f1535..3328400b214fb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformationExec.scala @@ -412,7 +412,9 @@ case class HiveScriptIOSchema ( propsMap = propsMap + (serdeConstants.LIST_COLUMN_TYPES -> columnTypesNames) val properties = new Properties() - properties.putAll(propsMap.asJava) + // Can not use properties.putAll(propsMap.asJava) in scala-2.12 + // See https://github.com/scala/bug/issues/10418 + propsMap.foreach { case (k, v) => properties.put(k, v) } serde.initialize(null, properties) serde @@ -424,7 +426,9 @@ case class HiveScriptIOSchema ( recordReaderClass.map { klass => val instance = Utils.classForName(klass).newInstance().asInstanceOf[RecordReader] val props = new Properties() - props.putAll(outputSerdeProps.toMap.asJava) + // 
Can not use props.putAll(outputSerdeProps.toMap.asJava) in scala-2.12 + // See https://github.com/scala/bug/issues/10418 + outputSerdeProps.toMap.foreach { case (k, v) => props.put(k, v) } instance.initialize(inputStream, conf, props) instance } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala index b2ec33e82ddaa..b22bbb79a5cc9 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/RawInputDStream.scala @@ -98,7 +98,7 @@ class RawNetworkReceiver(host: String, port: Int, storageLevel: StorageLevel) /** Read a buffer fully from a given Channel */ private def readFully(channel: ReadableByteChannel, dest: ByteBuffer) { - while (dest.position < dest.limit) { + while (dest.position() < dest.limit()) { if (channel.read(dest) == -1) { throw new EOFException("End of channel") } From 18b75d465b7563de926c5690094086a72a75c09f Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Thu, 7 Dec 2017 10:24:49 -0800 Subject: [PATCH 067/356] [SPARK-22719][SQL] Refactor ConstantPropagation ## What changes were proposed in this pull request? The current time complexity of ConstantPropagation is O(n^2), which can be slow when the query is complex. Refactor the implementation with O( n ) time complexity, and some pruning to avoid traversing the whole `Condition` ## How was this patch tested? Unit test. Also simple benchmark test in ConstantPropagationSuite ``` val condition = (1 to 500).map{_ => Rand(0) === Rand(0)}.reduce(And) val query = testRelation .select(columnA) .where(condition) val start = System.currentTimeMillis() (1 to 40).foreach { _ => Optimize.execute(query.analyze) } val end = System.currentTimeMillis() println(end - start) ``` Run time before changes: 18989ms (474ms per loop) Run time after changes: 1275 ms (32ms per loop) Author: Wang Gengliang Closes #19912 from gengliangwang/ConstantPropagation. --- .../sql/catalyst/optimizer/expressions.scala | 106 ++++++++++++------ 1 file changed, 73 insertions(+), 33 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 785e815b41185..6305b6c84bae3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -64,49 +64,89 @@ object ConstantFolding extends Rule[LogicalPlan] { * }}} * * Approach used: - * - Start from AND operator as the root - * - Get all the children conjunctive predicates which are EqualTo / EqualNullSafe such that they - * don't have a `NOT` or `OR` operator in them * - Populate a mapping of attribute => constant value by looking at all the equals predicates * - Using this mapping, replace occurrence of the attributes with the corresponding constant values * in the AND node. 
*/ object ConstantPropagation extends Rule[LogicalPlan] with PredicateHelper { - private def containsNonConjunctionPredicates(expression: Expression): Boolean = expression.find { - case _: Not | _: Or => true - case _ => false - }.isDefined - def apply(plan: LogicalPlan): LogicalPlan = plan transform { - case f: Filter => f transformExpressionsUp { - case and: And => - val conjunctivePredicates = - splitConjunctivePredicates(and) - .filter(expr => expr.isInstanceOf[EqualTo] || expr.isInstanceOf[EqualNullSafe]) - .filterNot(expr => containsNonConjunctionPredicates(expr)) - - val equalityPredicates = conjunctivePredicates.collect { - case e @ EqualTo(left: AttributeReference, right: Literal) => ((left, right), e) - case e @ EqualTo(left: Literal, right: AttributeReference) => ((right, left), e) - case e @ EqualNullSafe(left: AttributeReference, right: Literal) => ((left, right), e) - case e @ EqualNullSafe(left: Literal, right: AttributeReference) => ((right, left), e) - } + case f: Filter => + val (newCondition, _) = traverse(f.condition, replaceChildren = true) + if (newCondition.isDefined) { + f.copy(condition = newCondition.get) + } else { + f + } + } - val constantsMap = AttributeMap(equalityPredicates.map(_._1)) - val predicates = equalityPredicates.map(_._2).toSet + type EqualityPredicates = Seq[((AttributeReference, Literal), BinaryComparison)] - def replaceConstants(expression: Expression) = expression transform { - case a: AttributeReference => - constantsMap.get(a) match { - case Some(literal) => literal - case None => a - } + /** + * Traverse a condition as a tree and replace attributes with constant values. + * - On matching [[And]], recursively traverse each children and get propagated mappings. + * If the current node is not child of another [[And]], replace all occurrences of the + * attributes with the corresponding constant values. + * - If a child of [[And]] is [[EqualTo]] or [[EqualNullSafe]], propagate the mapping + * of attribute => constant. + * - On matching [[Or]] or [[Not]], recursively traverse each children, propagate empty mapping. + * - Otherwise, stop traversal and propagate empty mapping. + * @param condition condition to be traversed + * @param replaceChildren whether to replace attributes with constant values in children + * @return A tuple including: + * 1. Option[Expression]: optional changed condition after traversal + * 2. 
EqualityPredicates: propagated mapping of attribute => constant + */ + private def traverse(condition: Expression, replaceChildren: Boolean) + : (Option[Expression], EqualityPredicates) = + condition match { + case e @ EqualTo(left: AttributeReference, right: Literal) => (None, Seq(((left, right), e))) + case e @ EqualTo(left: Literal, right: AttributeReference) => (None, Seq(((right, left), e))) + case e @ EqualNullSafe(left: AttributeReference, right: Literal) => + (None, Seq(((left, right), e))) + case e @ EqualNullSafe(left: Literal, right: AttributeReference) => + (None, Seq(((right, left), e))) + case a: And => + val (newLeft, equalityPredicatesLeft) = traverse(a.left, replaceChildren = false) + val (newRight, equalityPredicatesRight) = traverse(a.right, replaceChildren = false) + val equalityPredicates = equalityPredicatesLeft ++ equalityPredicatesRight + val newSelf = if (equalityPredicates.nonEmpty && replaceChildren) { + Some(And(replaceConstants(newLeft.getOrElse(a.left), equalityPredicates), + replaceConstants(newRight.getOrElse(a.right), equalityPredicates))) + } else { + if (newLeft.isDefined || newRight.isDefined) { + Some(And(newLeft.getOrElse(a.left), newRight.getOrElse(a.right))) + } else { + None + } } - - and transform { - case e @ EqualTo(_, _) if !predicates.contains(e) => replaceConstants(e) - case e @ EqualNullSafe(_, _) if !predicates.contains(e) => replaceConstants(e) + (newSelf, equalityPredicates) + case o: Or => + // Ignore the EqualityPredicates from children since they are only propagated through And. + val (newLeft, _) = traverse(o.left, replaceChildren = true) + val (newRight, _) = traverse(o.right, replaceChildren = true) + val newSelf = if (newLeft.isDefined || newRight.isDefined) { + Some(Or(left = newLeft.getOrElse(o.left), right = newRight.getOrElse((o.right)))) + } else { + None } + (newSelf, Seq.empty) + case n: Not => + // Ignore the EqualityPredicates from children since they are only propagated through And. + val (newChild, _) = traverse(n.child, replaceChildren = true) + (newChild.map(Not), Seq.empty) + case _ => (None, Seq.empty) + } + + private def replaceConstants(condition: Expression, equalityPredicates: EqualityPredicates) + : Expression = { + val constantsMap = AttributeMap(equalityPredicates.map(_._1)) + val predicates = equalityPredicates.map(_._2).toSet + def replaceConstants0(expression: Expression) = expression transform { + case a: AttributeReference => constantsMap.getOrElse(a, a) + } + condition transform { + case e @ EqualTo(_, _) if !predicates.contains(e) => replaceConstants0(e) + case e @ EqualNullSafe(_, _) if !predicates.contains(e) => replaceConstants0(e) } } } From aa1764ba1addbe7ad79344d5640bf6426267a38c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 7 Dec 2017 15:45:23 -0800 Subject: [PATCH 068/356] [SPARK-22279][SQL] Turn on spark.sql.hive.convertMetastoreOrc by default ## What changes were proposed in this pull request? Like Parquet, this PR aims to turn on `spark.sql.hive.convertMetastoreOrc` by default. ## How was this patch tested? Pass all the existing test cases. Author: Dongjoon Hyun Closes #19499 from dongjoon-hyun/SPARK-22279. 
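For readers affected by the new default, a brief illustrative sketch of how it can be observed and overridden from a Hive-enabled session is shown below; the table name is hypothetical and the override is an assumption based on the config key this patch changes, not part of the diff itself.
```scala
import org.apache.spark.sql.SparkSession

// Assumes a Hive-enabled session; "sales_orc" is a hypothetical Hive ORC table.
val spark = SparkSession.builder()
  .appName("convert-metastore-orc-demo")
  .enableHiveSupport()
  .getOrCreate()

// With this change, Hive ORC tables go through the built-in ORC reader by default.
spark.table("sales_orc").show(5)

// Workloads that still rely on the Hive SerDe path can opt back out explicitly.
spark.sql("SET spark.sql.hive.convertMetastoreOrc=false")
```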
--- .../src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index f5e6720f6a510..c489690af8cd1 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -109,7 +109,7 @@ private[spark] object HiveUtils extends Logging { .doc("When set to true, the built-in ORC reader and writer are used to process " + "ORC tables created by using the HiveQL syntax, instead of Hive serde.") .booleanConf - .createWithDefault(false) + .createWithDefault(true) val HIVE_METASTORE_SHARED_PREFIXES = buildConf("spark.sql.hive.metastore.sharedPrefixes") .doc("A comma separated list of class prefixes that should be loaded using the classloader " + From 0ba8f4b21167fe98a9f06036c4fc4ccd8e3287b4 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 8 Dec 2017 09:52:16 +0800 Subject: [PATCH 069/356] [SPARK-21787][SQL] Support for pushing down filters for DateType in native OrcFileFormat ## What changes were proposed in this pull request? This PR support for pushing down filters for DateType in ORC ## How was this patch tested? Pass the Jenkins with newly add and updated test cases. Author: Dongjoon Hyun Closes #18995 from dongjoon-hyun/SPARK-21787. --- .../datasources/orc/OrcFilters.scala | 3 +- .../datasources/orc/OrcFilterSuite.scala | 29 +++++++++++++++---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala index cec256cc1b498..4f44ae4fa1d71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilters.scala @@ -82,8 +82,7 @@ private[orc] object OrcFilters { * Both CharType and VarcharType are cleaned at AstBuilder. 
*/ private def isSearchableType(dataType: DataType) = dataType match { - // TODO: SPARK-21787 Support for pushing down filters for DateType in ORC - case BinaryType | DateType => false + case BinaryType => false case _: AtomicType => true case _ => false } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala index a5f6b68ee862e..8680b86517b19 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcFilterSuite.scala @@ -316,6 +316,30 @@ class OrcFilterSuite extends OrcTest with SharedSQLContext { } } + test("filter pushdown - date") { + val dates = Seq("2017-08-18", "2017-08-19", "2017-08-20", "2017-08-21").map { day => + Date.valueOf(day) + } + withOrcDataFrame(dates.map(Tuple1(_))) { implicit df => + checkFilterPredicate('_1.isNull, PredicateLeaf.Operator.IS_NULL) + + checkFilterPredicate('_1 === dates(0), PredicateLeaf.Operator.EQUALS) + checkFilterPredicate('_1 <=> dates(0), PredicateLeaf.Operator.NULL_SAFE_EQUALS) + + checkFilterPredicate('_1 < dates(1), PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate('_1 > dates(2), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 <= dates(0), PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate('_1 >= dates(3), PredicateLeaf.Operator.LESS_THAN) + + checkFilterPredicate(Literal(dates(0)) === '_1, PredicateLeaf.Operator.EQUALS) + checkFilterPredicate(Literal(dates(0)) <=> '_1, PredicateLeaf.Operator.NULL_SAFE_EQUALS) + checkFilterPredicate(Literal(dates(1)) > '_1, PredicateLeaf.Operator.LESS_THAN) + checkFilterPredicate(Literal(dates(2)) < '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(dates(0)) >= '_1, PredicateLeaf.Operator.LESS_THAN_EQUALS) + checkFilterPredicate(Literal(dates(3)) <= '_1, PredicateLeaf.Operator.LESS_THAN) + } + } + test("no filter pushdown - non-supported types") { implicit class IntToBinary(int: Int) { def b: Array[Byte] = int.toString.getBytes(StandardCharsets.UTF_8) @@ -328,11 +352,6 @@ class OrcFilterSuite extends OrcTest with SharedSQLContext { withOrcDataFrame((1 to 4).map(i => Tuple1(i.b))) { implicit df => checkNoFilterPredicate('_1 <=> 1.b) } - // DateType - val stringDate = "2015-01-01" - withOrcDataFrame(Seq(Tuple1(Date.valueOf(stringDate)))) { implicit df => - checkNoFilterPredicate('_1 === Date.valueOf(stringDate)) - } // MapType withOrcDataFrame((1 to 4).map(i => Tuple1(Map(i -> i)))) { implicit df => checkNoFilterPredicate('_1.isNotNull) From b11869bc3b10b1ee50b1752cfcf4e736ef110cb1 Mon Sep 17 00:00:00 2001 From: Tathagata Das Date: Thu, 7 Dec 2017 22:02:51 -0800 Subject: [PATCH 070/356] [SPARK-22187][SS][REVERT] Revert change in state row format for mapGroupsWithState ## What changes were proposed in this pull request? #19416 changed the format in which rows were encoded in the state store. However, this can break existing streaming queries with the old format in unpredictable ways (potentially crashing the JVM). Hence I am reverting this for now. This will be re-applied in the future after we start saving more metadata in checkpoints to signify which version of state row format the existing streaming query is running. Then we can decode old and new formats accordingly. ## How was this patch tested? Existing tests. Author: Tathagata Das Closes #19924 from tdas/SPARK-22187-1. 
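For background on what these state rows hold, a minimal and purely illustrative `mapGroupsWithState` query is sketched below; the per-key `Int` count is the user state that `FlatMapGroupsWithStateExec` serializes into the state store, which is the row format this revert restores. The socket source, host, and port are assumptions and not part of the patch.
```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}

val spark = SparkSession.builder().appName("state-row-demo").getOrCreate()
import spark.implicits._

// A running per-word count; the Int kept in GroupState is what gets encoded
// into a state store row (plus an optional timeout timestamp).
val words = spark.readStream
  .format("socket").option("host", "localhost").option("port", 9999)
  .load().as[String]

val counts = words
  .groupByKey(identity)
  .mapGroupsWithState(GroupStateTimeout.NoTimeout) {
    (word: String, values: Iterator[String], state: GroupState[Int]) =>
      val newCount = state.getOption.getOrElse(0) + values.size
      state.update(newCount)
      (word, newCount)
  }

counts.writeStream
  .outputMode(OutputMode.Update())
  .format("console")
  .start()
  .awaitTermination()
```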
--- .../FlatMapGroupsWithStateExec.scala | 135 +++++++++++++--- .../FlatMapGroupsWithState_StateManager.scala | 153 ------------------ .../FlatMapGroupsWithStateSuite.scala | 130 +++++++-------- 3 files changed, 171 insertions(+), 247 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithState_StateManager.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala index 29f38fab3f896..80769d728b8f1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FlatMapGroupsWithStateExec.scala @@ -23,8 +23,10 @@ import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Attribut import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Distribution} import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.streaming.GroupStateImpl.NO_TIMESTAMP import org.apache.spark.sql.execution.streaming.state._ import org.apache.spark.sql.streaming.{GroupStateTimeout, OutputMode} +import org.apache.spark.sql.types.IntegerType import org.apache.spark.util.CompletionIterator /** @@ -60,8 +62,27 @@ case class FlatMapGroupsWithStateExec( import GroupStateImpl._ private val isTimeoutEnabled = timeoutConf != NoTimeout - val stateManager = new FlatMapGroupsWithState_StateManager(stateEncoder, isTimeoutEnabled) - val watermarkPresent = child.output.exists { + private val timestampTimeoutAttribute = + AttributeReference("timeoutTimestamp", dataType = IntegerType, nullable = false)() + private val stateAttributes: Seq[Attribute] = { + val encSchemaAttribs = stateEncoder.schema.toAttributes + if (isTimeoutEnabled) encSchemaAttribs :+ timestampTimeoutAttribute else encSchemaAttribs + } + // Get the serializer for the state, taking into account whether we need to save timestamps + private val stateSerializer = { + val encoderSerializer = stateEncoder.namedExpressions + if (isTimeoutEnabled) { + encoderSerializer :+ Literal(GroupStateImpl.NO_TIMESTAMP) + } else { + encoderSerializer + } + } + // Get the deserializer for the state. Note that this must be done in the driver, as + // resolving and binding of deserializer expressions to the encoded type can be safely done + // only in the driver. + private val stateDeserializer = stateEncoder.resolveAndBind().deserializer + + private val watermarkPresent = child.output.exists { case a: Attribute if a.metadata.contains(EventTimeWatermark.delayKey) => true case _ => false } @@ -92,11 +113,11 @@ case class FlatMapGroupsWithStateExec( child.execute().mapPartitionsWithStateStore[InternalRow]( getStateInfo, groupingAttributes.toStructType, - stateManager.stateSchema, + stateAttributes.toStructType, indexOrdinal = None, sqlContext.sessionState, Some(sqlContext.streams.stateStoreCoordinator)) { case (store, iter) => - val processor = new InputProcessor(store) + val updater = new StateStoreUpdater(store) // If timeout is based on event time, then filter late data based on watermark val filteredIter = watermarkPredicateForData match { @@ -111,7 +132,7 @@ case class FlatMapGroupsWithStateExec( // all the data has been processed. 
This is to ensure that the timeout information of all // the keys with data is updated before they are processed for timeouts. val outputIterator = - processor.processNewData(filteredIter) ++ processor.processTimedOutState() + updater.updateStateForKeysWithData(filteredIter) ++ updater.updateStateForTimedOutKeys() // Return an iterator of all the rows generated by all the keys, such that when fully // consumed, all the state updates will be committed by the state store @@ -126,7 +147,7 @@ case class FlatMapGroupsWithStateExec( } /** Helper class to update the state store */ - class InputProcessor(store: StateStore) { + class StateStoreUpdater(store: StateStore) { // Converters for translating input keys, values, output data between rows and Java objects private val getKeyObj = @@ -135,6 +156,14 @@ case class FlatMapGroupsWithStateExec( ObjectOperator.deserializeRowToObject(valueDeserializer, dataAttributes) private val getOutputRow = ObjectOperator.wrapObjectToRow(outputObjAttr.dataType) + // Converters for translating state between rows and Java objects + private val getStateObjFromRow = ObjectOperator.deserializeRowToObject( + stateDeserializer, stateAttributes) + private val getStateRowFromObj = ObjectOperator.serializeObjectToRow(stateSerializer) + + // Index of the additional metadata fields in the state row + private val timeoutTimestampIndex = stateAttributes.indexOf(timestampTimeoutAttribute) + // Metrics private val numUpdatedStateRows = longMetric("numUpdatedStateRows") private val numOutputRows = longMetric("numOutputRows") @@ -143,19 +172,20 @@ case class FlatMapGroupsWithStateExec( * For every group, get the key, values and corresponding state and call the function, * and return an iterator of rows */ - def processNewData(dataIter: Iterator[InternalRow]): Iterator[InternalRow] = { + def updateStateForKeysWithData(dataIter: Iterator[InternalRow]): Iterator[InternalRow] = { val groupedIter = GroupedIterator(dataIter, groupingAttributes, child.output) groupedIter.flatMap { case (keyRow, valueRowIter) => val keyUnsafeRow = keyRow.asInstanceOf[UnsafeRow] callFunctionAndUpdateState( - stateManager.getState(store, keyUnsafeRow), + keyUnsafeRow, valueRowIter, + store.get(keyUnsafeRow), hasTimedOut = false) } } /** Find the groups that have timeout set and are timing out right now, and call the function */ - def processTimedOutState(): Iterator[InternalRow] = { + def updateStateForTimedOutKeys(): Iterator[InternalRow] = { if (isTimeoutEnabled) { val timeoutThreshold = timeoutConf match { case ProcessingTimeTimeout => batchTimestampMs.get @@ -164,11 +194,12 @@ case class FlatMapGroupsWithStateExec( throw new IllegalStateException( s"Cannot filter timed out keys for $timeoutConf") } - val timingOutKeys = stateManager.getAllState(store).filter { state => - state.timeoutTimestamp != NO_TIMESTAMP && state.timeoutTimestamp < timeoutThreshold + val timingOutKeys = store.getRange(None, None).filter { rowPair => + val timeoutTimestamp = getTimeoutTimestamp(rowPair.value) + timeoutTimestamp != NO_TIMESTAMP && timeoutTimestamp < timeoutThreshold } - timingOutKeys.flatMap { stateData => - callFunctionAndUpdateState(stateData, Iterator.empty, hasTimedOut = true) + timingOutKeys.flatMap { rowPair => + callFunctionAndUpdateState(rowPair.key, Iterator.empty, rowPair.value, hasTimedOut = true) } } else Iterator.empty } @@ -178,19 +209,22 @@ case class FlatMapGroupsWithStateExec( * iterator. 
Note that the store updating is lazy, that is, the store will be updated only * after the returned iterator is fully consumed. * - * @param stateData All the data related to the state to be updated + * @param keyRow Row representing the key, cannot be null * @param valueRowIter Iterator of values as rows, cannot be null, but can be empty + * @param prevStateRow Row representing the previous state, can be null * @param hasTimedOut Whether this function is being called for a key timeout */ private def callFunctionAndUpdateState( - stateData: FlatMapGroupsWithState_StateData, + keyRow: UnsafeRow, valueRowIter: Iterator[InternalRow], + prevStateRow: UnsafeRow, hasTimedOut: Boolean): Iterator[InternalRow] = { - val keyObj = getKeyObj(stateData.keyRow) // convert key to objects + val keyObj = getKeyObj(keyRow) // convert key to objects val valueObjIter = valueRowIter.map(getValueObj.apply) // convert value rows to objects - val groupState = GroupStateImpl.createForStreaming( - Option(stateData.stateObj), + val stateObj = getStateObj(prevStateRow) + val keyedState = GroupStateImpl.createForStreaming( + Option(stateObj), batchTimestampMs.getOrElse(NO_TIMESTAMP), eventTimeWatermark.getOrElse(NO_TIMESTAMP), timeoutConf, @@ -198,24 +232,50 @@ case class FlatMapGroupsWithStateExec( watermarkPresent) // Call function, get the returned objects and convert them to rows - val mappedIterator = func(keyObj, valueObjIter, groupState).map { obj => + val mappedIterator = func(keyObj, valueObjIter, keyedState).map { obj => numOutputRows += 1 getOutputRow(obj) } // When the iterator is consumed, then write changes to state def onIteratorCompletion: Unit = { - if (groupState.hasRemoved && groupState.getTimeoutTimestamp == NO_TIMESTAMP) { - stateManager.removeState(store, stateData.keyRow) + + val currentTimeoutTimestamp = keyedState.getTimeoutTimestamp + // If the state has not yet been set but timeout has been set, then + // we have to generate a row to save the timeout. However, attempting serialize + // null using case class encoder throws - + // java.lang.NullPointerException: Null value appeared in non-nullable field: + // If the schema is inferred from a Scala tuple / case class, or a Java bean, please + // try to use scala.Option[_] or other nullable types. 
+ if (!keyedState.exists && currentTimeoutTimestamp != NO_TIMESTAMP) { + throw new IllegalStateException( + "Cannot set timeout when state is not defined, that is, state has not been" + + "initialized or has been removed") + } + + if (keyedState.hasRemoved) { + store.remove(keyRow) numUpdatedStateRows += 1 + } else { - val currentTimeoutTimestamp = groupState.getTimeoutTimestamp - val hasTimeoutChanged = currentTimeoutTimestamp != stateData.timeoutTimestamp - val shouldWriteState = groupState.hasUpdated || groupState.hasRemoved || hasTimeoutChanged + val previousTimeoutTimestamp = getTimeoutTimestamp(prevStateRow) + val stateRowToWrite = if (keyedState.hasUpdated) { + getStateRow(keyedState.get) + } else { + prevStateRow + } + + val hasTimeoutChanged = currentTimeoutTimestamp != previousTimeoutTimestamp + val shouldWriteState = keyedState.hasUpdated || hasTimeoutChanged if (shouldWriteState) { - val updatedStateObj = if (groupState.exists) groupState.get else null - stateManager.putState(store, stateData.keyRow, updatedStateObj, currentTimeoutTimestamp) + if (stateRowToWrite == null) { + // This should never happen because checks in GroupStateImpl should avoid cases + // where empty state would need to be written + throw new IllegalStateException("Attempting to write empty state") + } + setTimeoutTimestamp(stateRowToWrite, currentTimeoutTimestamp) + store.put(keyRow, stateRowToWrite) numUpdatedStateRows += 1 } } @@ -224,5 +284,28 @@ case class FlatMapGroupsWithStateExec( // Return an iterator of rows such that fully consumed, the updated state value will be saved CompletionIterator[InternalRow, Iterator[InternalRow]](mappedIterator, onIteratorCompletion) } + + /** Returns the state as Java object if defined */ + def getStateObj(stateRow: UnsafeRow): Any = { + if (stateRow != null) getStateObjFromRow(stateRow) else null + } + + /** Returns the row for an updated state */ + def getStateRow(obj: Any): UnsafeRow = { + assert(obj != null) + getStateRowFromObj(obj) + } + + /** Returns the timeout timestamp of a state row is set */ + def getTimeoutTimestamp(stateRow: UnsafeRow): Long = { + if (isTimeoutEnabled && stateRow != null) { + stateRow.getLong(timeoutTimestampIndex) + } else NO_TIMESTAMP + } + + /** Set the timestamp in a state row */ + def setTimeoutTimestamp(stateRow: UnsafeRow, timeoutTimestamps: Long): Unit = { + if (isTimeoutEnabled) stateRow.setLong(timeoutTimestampIndex, timeoutTimestamps) + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithState_StateManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithState_StateManager.scala deleted file mode 100644 index e49546830286b..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/FlatMapGroupsWithState_StateManager.scala +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.streaming.state - -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference, CaseWhen, CreateNamedStruct, GetStructField, IsNull, Literal, UnsafeRow} -import org.apache.spark.sql.execution.ObjectOperator -import org.apache.spark.sql.execution.streaming.GroupStateImpl -import org.apache.spark.sql.execution.streaming.GroupStateImpl.NO_TIMESTAMP -import org.apache.spark.sql.types.{IntegerType, LongType, StructType} - - -/** - * Class to serialize/write/read/deserialize state for - * [[org.apache.spark.sql.execution.streaming.FlatMapGroupsWithStateExec]]. - */ -class FlatMapGroupsWithState_StateManager( - stateEncoder: ExpressionEncoder[Any], - shouldStoreTimestamp: Boolean) extends Serializable { - - /** Schema of the state rows saved in the state store */ - val stateSchema = { - val schema = new StructType().add("groupState", stateEncoder.schema, nullable = true) - if (shouldStoreTimestamp) schema.add("timeoutTimestamp", LongType) else schema - } - - /** Get deserialized state and corresponding timeout timestamp for a key */ - def getState(store: StateStore, keyRow: UnsafeRow): FlatMapGroupsWithState_StateData = { - val stateRow = store.get(keyRow) - stateDataForGets.withNew( - keyRow, stateRow, getStateObj(stateRow), getTimestamp(stateRow)) - } - - /** Put state and timeout timestamp for a key */ - def putState(store: StateStore, keyRow: UnsafeRow, state: Any, timestamp: Long): Unit = { - val stateRow = getStateRow(state) - setTimestamp(stateRow, timestamp) - store.put(keyRow, stateRow) - } - - /** Removed all information related to a key */ - def removeState(store: StateStore, keyRow: UnsafeRow): Unit = { - store.remove(keyRow) - } - - /** Get all the keys and corresponding state rows in the state store */ - def getAllState(store: StateStore): Iterator[FlatMapGroupsWithState_StateData] = { - val stateDataForGetAllState = FlatMapGroupsWithState_StateData() - store.getRange(None, None).map { pair => - stateDataForGetAllState.withNew( - pair.key, pair.value, getStateObjFromRow(pair.value), getTimestamp(pair.value)) - } - } - - // Ordinals of the information stored in the state row - private lazy val nestedStateOrdinal = 0 - private lazy val timeoutTimestampOrdinal = 1 - - // Get the serializer for the state, taking into account whether we need to save timestamps - private val stateSerializer = { - val nestedStateExpr = CreateNamedStruct( - stateEncoder.namedExpressions.flatMap(e => Seq(Literal(e.name), e))) - if (shouldStoreTimestamp) { - Seq(nestedStateExpr, Literal(GroupStateImpl.NO_TIMESTAMP)) - } else { - Seq(nestedStateExpr) - } - } - - // Get the deserializer for the state. Note that this must be done in the driver, as - // resolving and binding of deserializer expressions to the encoded type can be safely done - // only in the driver. 
- private val stateDeserializer = { - val boundRefToNestedState = BoundReference(nestedStateOrdinal, stateEncoder.schema, true) - val deser = stateEncoder.resolveAndBind().deserializer.transformUp { - case BoundReference(ordinal, _, _) => GetStructField(boundRefToNestedState, ordinal) - } - CaseWhen(Seq(IsNull(boundRefToNestedState) -> Literal(null)), elseValue = deser) - } - - // Converters for translating state between rows and Java objects - private lazy val getStateObjFromRow = ObjectOperator.deserializeRowToObject( - stateDeserializer, stateSchema.toAttributes) - private lazy val getStateRowFromObj = ObjectOperator.serializeObjectToRow(stateSerializer) - - // Reusable instance for returning state information - private lazy val stateDataForGets = FlatMapGroupsWithState_StateData() - - /** Returns the state as Java object if defined */ - private def getStateObj(stateRow: UnsafeRow): Any = { - if (stateRow == null) null - else getStateObjFromRow(stateRow) - } - - /** Returns the row for an updated state */ - private def getStateRow(obj: Any): UnsafeRow = { - val row = getStateRowFromObj(obj) - if (obj == null) { - row.setNullAt(nestedStateOrdinal) - } - row - } - - /** Returns the timeout timestamp of a state row is set */ - private def getTimestamp(stateRow: UnsafeRow): Long = { - if (shouldStoreTimestamp && stateRow != null) { - stateRow.getLong(timeoutTimestampOrdinal) - } else NO_TIMESTAMP - } - - /** Set the timestamp in a state row */ - private def setTimestamp(stateRow: UnsafeRow, timeoutTimestamps: Long): Unit = { - if (shouldStoreTimestamp) stateRow.setLong(timeoutTimestampOrdinal, timeoutTimestamps) - } -} - -/** - * Class to capture deserialized state and timestamp return by the state manager. - * This is intended for reuse. - */ -case class FlatMapGroupsWithState_StateData( - var keyRow: UnsafeRow = null, - var stateRow: UnsafeRow = null, - var stateObj: Any = null, - var timeoutTimestamp: Long = -1) { - def withNew( - newKeyRow: UnsafeRow, - newStateRow: UnsafeRow, - newStateObj: Any, - newTimeout: Long): this.type = { - keyRow = newKeyRow - stateRow = newStateRow - stateObj = newStateObj - timeoutTimestamp = newTimeout - this - } -} - diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala index b906393a379ae..de2b51678cea6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FlatMapGroupsWithStateSuite.scala @@ -360,13 +360,13 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest } } - // Values used for testing InputProcessor + // Values used for testing StateStoreUpdater val currentBatchTimestamp = 1000 val currentBatchWatermark = 1000 val beforeTimeoutThreshold = 999 val afterTimeoutThreshold = 1001 - // Tests for InputProcessor.processNewData() when timeout = NoTimeout + // Tests for StateStoreUpdater.updateStateForKeysWithData() when timeout = NoTimeout for (priorState <- Seq(None, Some(0))) { val priorStateStr = if (priorState.nonEmpty) "prior state set" else "no prior state" val testName = s"NoTimeout - $priorStateStr - " @@ -397,7 +397,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest expectedState = None) // should be removed } - // Tests for InputProcessor.processTimedOutState() when timeout != NoTimeout + // Tests for StateStoreUpdater.updateStateForKeysWithData() when timeout != 
NoTimeout for (priorState <- Seq(None, Some(0))) { for (priorTimeoutTimestamp <- Seq(NO_TIMESTAMP, 1000)) { var testName = "" @@ -444,18 +444,6 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest expectedState = None) // state should be removed } - // Tests with ProcessingTimeTimeout - if (priorState == None) { - testStateUpdateWithData( - s"ProcessingTimeTimeout - $testName - timeout updated without initializing state", - stateUpdates = state => { state.setTimeoutDuration(5000) }, - timeoutConf = ProcessingTimeTimeout, - priorState = None, - priorTimeoutTimestamp = priorTimeoutTimestamp, - expectedState = None, - expectedTimeoutTimestamp = currentBatchTimestamp + 5000) - } - testStateUpdateWithData( s"ProcessingTimeTimeout - $testName - state and timeout duration updated", stateUpdates = @@ -466,36 +454,10 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest expectedState = Some(5), // state should change expectedTimeoutTimestamp = currentBatchTimestamp + 5000) // timestamp should change - testStateUpdateWithData( - s"ProcessingTimeTimeout - $testName - timeout updated after state removed", - stateUpdates = state => { state.remove(); state.setTimeoutDuration(5000) }, - timeoutConf = ProcessingTimeTimeout, - priorState = priorState, - priorTimeoutTimestamp = priorTimeoutTimestamp, - expectedState = None, - expectedTimeoutTimestamp = currentBatchTimestamp + 5000) - - // Tests with EventTimeTimeout - - if (priorState == None) { - testStateUpdateWithData( - s"EventTimeTimeout - $testName - setting timeout without init state not allowed", - stateUpdates = state => { - state.setTimeoutTimestamp(10000) - }, - timeoutConf = EventTimeTimeout, - priorState = None, - priorTimeoutTimestamp = priorTimeoutTimestamp, - expectedState = None, - expectedTimeoutTimestamp = 10000) - } - testStateUpdateWithData( s"EventTimeTimeout - $testName - state and timeout timestamp updated", stateUpdates = - (state: GroupState[Int]) => { - state.update(5); state.setTimeoutTimestamp(5000) - }, + (state: GroupState[Int]) => { state.update(5); state.setTimeoutTimestamp(5000) }, timeoutConf = EventTimeTimeout, priorState = priorState, priorTimeoutTimestamp = priorTimeoutTimestamp, @@ -514,23 +476,50 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest timeoutConf = EventTimeTimeout, priorState = priorState, priorTimeoutTimestamp = priorTimeoutTimestamp, - expectedState = Some(5), // state should change - expectedTimeoutTimestamp = NO_TIMESTAMP) // timestamp should not update - - testStateUpdateWithData( - s"EventTimeTimeout - $testName - setting timeout with state removal not allowed", - stateUpdates = state => { - state.remove(); state.setTimeoutTimestamp(10000) - }, - timeoutConf = EventTimeTimeout, - priorState = priorState, - priorTimeoutTimestamp = priorTimeoutTimestamp, - expectedState = None, - expectedTimeoutTimestamp = 10000) + expectedState = Some(5), // state should change + expectedTimeoutTimestamp = NO_TIMESTAMP) // timestamp should not update } } - // Tests for InputProcessor.processTimedOutState() + // Currently disallowed cases for StateStoreUpdater.updateStateForKeysWithData(), + // Try to remove these cases in the future + for (priorTimeoutTimestamp <- Seq(NO_TIMESTAMP, 1000)) { + val testName = + if (priorTimeoutTimestamp != NO_TIMESTAMP) "prior timeout set" else "no prior timeout" + testStateUpdateWithData( + s"ProcessingTimeTimeout - $testName - setting timeout without init state not allowed", + stateUpdates = state => { state.setTimeoutDuration(5000) }, + 
timeoutConf = ProcessingTimeTimeout, + priorState = None, + priorTimeoutTimestamp = priorTimeoutTimestamp, + expectedException = classOf[IllegalStateException]) + + testStateUpdateWithData( + s"ProcessingTimeTimeout - $testName - setting timeout with state removal not allowed", + stateUpdates = state => { state.remove(); state.setTimeoutDuration(5000) }, + timeoutConf = ProcessingTimeTimeout, + priorState = Some(5), + priorTimeoutTimestamp = priorTimeoutTimestamp, + expectedException = classOf[IllegalStateException]) + + testStateUpdateWithData( + s"EventTimeTimeout - $testName - setting timeout without init state not allowed", + stateUpdates = state => { state.setTimeoutTimestamp(10000) }, + timeoutConf = EventTimeTimeout, + priorState = None, + priorTimeoutTimestamp = priorTimeoutTimestamp, + expectedException = classOf[IllegalStateException]) + + testStateUpdateWithData( + s"EventTimeTimeout - $testName - setting timeout with state removal not allowed", + stateUpdates = state => { state.remove(); state.setTimeoutTimestamp(10000) }, + timeoutConf = EventTimeTimeout, + priorState = Some(5), + priorTimeoutTimestamp = priorTimeoutTimestamp, + expectedException = classOf[IllegalStateException]) + } + + // Tests for StateStoreUpdater.updateStateForTimedOutKeys() val preTimeoutState = Some(5) for (timeoutConf <- Seq(ProcessingTimeTimeout, EventTimeTimeout)) { testStateUpdateWithTimeout( @@ -1034,7 +1023,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest if (priorState.isEmpty && priorTimeoutTimestamp != NO_TIMESTAMP) { return // there can be no prior timestamp, when there is no prior state } - test(s"InputProcessor - process new data - $testName") { + test(s"StateStoreUpdater - updates with data - $testName") { val mapGroupsFunc = (key: Int, values: Iterator[Int], state: GroupState[Int]) => { assert(state.hasTimedOut === false, "hasTimedOut not false") assert(values.nonEmpty, "Some value is expected") @@ -1056,7 +1045,7 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest expectedState: Option[Int], expectedTimeoutTimestamp: Long = NO_TIMESTAMP): Unit = { - test(s"InputProcessor - process timed out state - $testName") { + test(s"StateStoreUpdater - updates for timeout - $testName") { val mapGroupsFunc = (key: Int, values: Iterator[Int], state: GroupState[Int]) => { assert(state.hasTimedOut === true, "hasTimedOut not true") assert(values.isEmpty, "values not empty") @@ -1083,20 +1072,21 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest val store = newStateStore() val mapGroupsSparkPlan = newFlatMapGroupsWithStateExec( mapGroupsFunc, timeoutConf, currentBatchTimestamp) - val inputProcessor = new mapGroupsSparkPlan.InputProcessor(store) - val stateManager = mapGroupsSparkPlan.stateManager + val updater = new mapGroupsSparkPlan.StateStoreUpdater(store) val key = intToRow(0) // Prepare store with prior state configs - if (priorState.nonEmpty || priorTimeoutTimestamp != NO_TIMESTAMP) { - stateManager.putState(store, key, priorState.orNull, priorTimeoutTimestamp) + if (priorState.nonEmpty) { + val row = updater.getStateRow(priorState.get) + updater.setTimeoutTimestamp(row, priorTimeoutTimestamp) + store.put(key.copy(), row.copy()) } // Call updating function to update state store def callFunction() = { val returnedIter = if (testTimeoutUpdates) { - inputProcessor.processTimedOutState() + updater.updateStateForTimedOutKeys() } else { - inputProcessor.processNewData(Iterator(key)) + updater.updateStateForKeysWithData(Iterator(key)) } returnedIter.size // 
consume the iterator to force state updates } @@ -1107,11 +1097,15 @@ class FlatMapGroupsWithStateSuite extends StateStoreMetricsTest } else { // Call function to update and verify updated state in store callFunction() - val updatedState = stateManager.getState(store, key) - assert(Option(updatedState.stateObj).map(_.toString.toInt) === expectedState, + val updatedStateRow = store.get(key) + assert( + Option(updater.getStateObj(updatedStateRow)).map(_.toString.toInt) === expectedState, "final state not as expected") - assert(updatedState.timeoutTimestamp === expectedTimeoutTimestamp, - "final timeout timestamp not as expected") + if (updatedStateRow != null) { + assert( + updater.getTimeoutTimestamp(updatedStateRow) === expectedTimeoutTimestamp, + "final timeout timestamp not as expected") + } } } From f88a67bf08a66bd0291c11776a0c7c3928738ed9 Mon Sep 17 00:00:00 2001 From: Sunitha Kambhampati Date: Fri, 8 Dec 2017 14:48:19 +0800 Subject: [PATCH 071/356] [SPARK-22452][SQL] Add getDouble to DataSourceV2Options - Implemented getDouble method in DataSourceV2Options - Add unit test Author: Sunitha Kambhampati Closes #19921 from skambha/ds2. --- .../spark/sql/sources/v2/DataSourceV2Options.java | 9 +++++++++ .../sql/sources/v2/DataSourceV2OptionsSuite.scala | 11 +++++++++++ 2 files changed, 20 insertions(+) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java index b2c908dc73a61..e98c04517c3db 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java @@ -80,4 +80,13 @@ public long getLong(String key, long defaultValue) { Long.parseLong(keyLowerCasedMap.get(lcaseKey)) : defaultValue; } + /** + * Returns the double value to which the specified key is mapped, + * or defaultValue if there is no mapping for the key. The key match is case-insensitive + */ + public double getDouble(String key, double defaultValue) { + String lcaseKey = toLowerCase(key); + return keyLowerCasedMap.containsKey(lcaseKey) ? + Double.parseDouble(keyLowerCasedMap.get(lcaseKey)) : defaultValue; + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala index 752d3c193cc74..90d92864b26fa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2OptionsSuite.scala @@ -68,4 +68,15 @@ class DataSourceV2OptionsSuite extends SparkFunSuite { options.getLong("foo", 0L) } } + + test("getDouble") { + val options = new DataSourceV2Options(Map("numFoo" -> "922337.1", + "foo" -> "bar").asJava) + assert(options.getDouble("numFOO", 0d) == 922337.1d) + assert(options.getDouble("numFoo2", -1.02d) == -1.02d) + + intercept[NumberFormatException]{ + options.getDouble("foo", 0.1d) + } + } } From f28b1a4c41d41e069b882bac06a68b2c45eb156c Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Fri, 8 Dec 2017 12:19:45 +0100 Subject: [PATCH 072/356] [SPARK-22721] BytesToBytesMap peak memory not updated. ## What changes were proposed in this pull request? Follow-up to earlier commit. The peak memory of BytesToBytesMap is not updated in more places - spill() and destructiveIterator(). ## How was this patch tested? Manually. 
Author: Juliusz Sompolski Closes #19923 from juliuszsompolski/SPARK-22721cd. --- .../main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index 7fdcf22c45f73..5f0045507aaab 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -347,6 +347,8 @@ public long spill(long numBytes) throws IOException { return 0L; } + updatePeakMemoryUsed(); + // TODO: use existing ShuffleWriteMetrics ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics(); @@ -424,6 +426,7 @@ public MapIterator iterator() { * `lookup()`, the behavior of the returned iterator is undefined. */ public MapIterator destructiveIterator() { + updatePeakMemoryUsed(); return new MapIterator(numValues, loc, true); } From 26e66453decf40ed6d590498aadbbf442bb90622 Mon Sep 17 00:00:00 2001 From: Li Jin Date: Fri, 8 Dec 2017 20:44:21 +0900 Subject: [PATCH 073/356] [SPARK-22655][PYSPARK] Throw exception rather than exit silently in PythonRunner when Spark session is stopped ## What changes were proposed in this pull request? During Spark shutdown, if there are some active tasks, sometimes they will complete with incorrect results. The issue is in PythonRunner where it is returning partial result instead of throwing exception during Spark shutdown. This patch makes it so that these tasks fail instead of complete with partial results. ## How was this patch tested? Existing tests. Author: Li Jin Closes #19852 from icexelloss/python-runner-shutdown. --- .../main/scala/org/apache/spark/api/python/PythonRunner.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index f524de68fbce0..93d508c28ebba 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -317,10 +317,6 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( logDebug("Exception thrown after task interruption", e) throw new TaskKilledException(context.getKillReason().getOrElse("unknown reason")) - case e: Exception if env.isStopped => - logDebug("Exception thrown after context is stopped", e) - null.asInstanceOf[OUT] // exit silently - case e: Exception if writerThread.exception.isDefined => logError("Python worker exited unexpectedly (crashed)", e) logError("This may have been caused by a prior exception:", writerThread.exception.get) From e4639fa68f479ccd7c365d3cf2bc2e93e24f1a00 Mon Sep 17 00:00:00 2001 From: Sandor Murakozi Date: Fri, 8 Dec 2017 14:17:50 -0800 Subject: [PATCH 074/356] =?UTF-8?q?[SPARK-21672][CORE]=20Remove=20SHS-spec?= =?UTF-8?q?ific=20application=20/=20attempt=20data=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …structures ## What changes were proposed in this pull request? In general, the SHS pages now use the public API types to represent applications. Some internal code paths still used its own view of what applications and attempts look like (`ApplicationHistoryInfo` and `ApplicationAttemptInfo`), declared in ApplicationHistoryProvider.scala. 
This pull request removes these classes and updates the rest of the code to use `status.api.v1.ApplicationInfo` and `status.api.v1.ApplicationAttemptInfo` instead. Furthermore `status.api.v1.ApplicationInfo` and `status.api.v1.ApplicationAttemptInfo` were changed to case class to - facilitate copying instances - equality checking in test code - nicer toString() To simplify the code a bit `v1.` prefixes were also removed from occurrences of v1.ApplicationInfo and v1.ApplicationAttemptInfo as there is no more ambiguity between classes in history and status.api.v1. ## How was this patch tested? By running existing automated tests. Author: Sandor Murakozi Closes #19920 from smurakozi/SPARK-21672. --- .../history/ApplicationHistoryProvider.scala | 30 +++------------- .../deploy/history/FsHistoryProvider.scala | 32 ++++++----------- .../spark/deploy/history/HistoryPage.scala | 8 ++++- .../spark/deploy/history/HistoryServer.scala | 8 ++--- .../spark/status/AppStatusListener.scala | 8 ++--- .../api/v1/ApplicationListResource.scala | 32 ----------------- .../org/apache/spark/status/api/v1/api.scala | 34 +++++++++---------- .../history/FsHistoryProviderSuite.scala | 13 ++++--- .../deploy/history/HistoryServerSuite.scala | 9 +++-- 9 files changed, 63 insertions(+), 111 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala index 38f0d6f2afa5e..f1c06205bf04c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala @@ -23,31 +23,9 @@ import java.util.zip.ZipOutputStream import scala.xml.Node import org.apache.spark.SparkException +import org.apache.spark.status.api.v1.ApplicationInfo import org.apache.spark.ui.SparkUI -private[spark] case class ApplicationAttemptInfo( - attemptId: Option[String], - startTime: Long, - endTime: Long, - lastUpdated: Long, - sparkUser: String, - completed: Boolean = false, - appSparkVersion: String) - -private[spark] case class ApplicationHistoryInfo( - id: String, - name: String, - attempts: List[ApplicationAttemptInfo]) { - - /** - * Has this application completed? - * @return true if the most recent attempt has completed - */ - def completed: Boolean = { - attempts.nonEmpty && attempts.head.completed - } -} - /** * A loaded UI for a Spark application. * @@ -119,7 +97,7 @@ private[history] abstract class ApplicationHistoryProvider { * * @return List of all know applications. */ - def getListing(): Iterator[ApplicationHistoryInfo] + def getListing(): Iterator[ApplicationInfo] /** * Returns the Spark UI for a specific application. @@ -152,9 +130,9 @@ private[history] abstract class ApplicationHistoryProvider { def writeEventLogs(appId: String, attemptId: Option[String], zipStream: ZipOutputStream): Unit /** - * @return the [[ApplicationHistoryInfo]] for the appId if it exists. + * @return the [[ApplicationInfo]] for the appId if it exists. 
*/ - def getApplicationInfo(appId: String): Option[ApplicationHistoryInfo] + def getApplicationInfo(appId: String): Option[ApplicationInfo] /** * @return html text to display when the application list is empty diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 6a83c106f6d84..a8e1becc56ab7 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -43,7 +43,7 @@ import org.apache.spark.scheduler._ import org.apache.spark.scheduler.ReplayListenerBus._ import org.apache.spark.status._ import org.apache.spark.status.KVUtils._ -import org.apache.spark.status.api.v1 +import org.apache.spark.status.api.v1.{ApplicationAttemptInfo, ApplicationInfo} import org.apache.spark.ui.SparkUI import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} import org.apache.spark.util.kvstore._ @@ -252,19 +252,19 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) } } - override def getListing(): Iterator[ApplicationHistoryInfo] = { + override def getListing(): Iterator[ApplicationInfo] = { // Return the listing in end time descending order. listing.view(classOf[ApplicationInfoWrapper]) .index("endTime") .reverse() .iterator() .asScala - .map(_.toAppHistoryInfo()) + .map(_.toApplicationInfo()) } - override def getApplicationInfo(appId: String): Option[ApplicationHistoryInfo] = { + override def getApplicationInfo(appId: String): Option[ApplicationInfo] = { try { - Some(load(appId).toAppHistoryInfo()) + Some(load(appId).toApplicationInfo()) } catch { case e: NoSuchElementException => None @@ -795,24 +795,16 @@ private[history] case class LogInfo( fileSize: Long) private[history] class AttemptInfoWrapper( - val info: v1.ApplicationAttemptInfo, + val info: ApplicationAttemptInfo, val logPath: String, val fileSize: Long, val adminAcls: Option[String], val viewAcls: Option[String], val adminAclsGroups: Option[String], - val viewAclsGroups: Option[String]) { - - def toAppAttemptInfo(): ApplicationAttemptInfo = { - ApplicationAttemptInfo(info.attemptId, info.startTime.getTime(), - info.endTime.getTime(), info.lastUpdated.getTime(), info.sparkUser, - info.completed, info.appSparkVersion) - } - -} + val viewAclsGroups: Option[String]) private[history] class ApplicationInfoWrapper( - val info: v1.ApplicationInfo, + val info: ApplicationInfo, val attempts: List[AttemptInfoWrapper]) { @JsonIgnore @KVIndexParam @@ -824,9 +816,7 @@ private[history] class ApplicationInfoWrapper( @JsonIgnore @KVIndexParam("oldestAttempt") def oldestAttempt(): Long = attempts.map(_.info.lastUpdated.getTime()).min - def toAppHistoryInfo(): ApplicationHistoryInfo = { - ApplicationHistoryInfo(info.id, info.name, attempts.map(_.toAppAttemptInfo())) - } + def toApplicationInfo(): ApplicationInfo = info.copy(attempts = attempts.map(_.info)) } @@ -883,7 +873,7 @@ private[history] class AppListingListener(log: FileStatus, clock: Clock) extends var memoryPerExecutorMB: Option[Int] = None def toView(): ApplicationInfoWrapper = { - val apiInfo = new v1.ApplicationInfo(id, name, coresGranted, maxCores, coresPerExecutor, + val apiInfo = ApplicationInfo(id, name, coresGranted, maxCores, coresPerExecutor, memoryPerExecutorMB, Nil) new ApplicationInfoWrapper(apiInfo, List(attempt.toView())) } @@ -906,7 +896,7 @@ private[history] class AppListingListener(log: FileStatus, clock: Clock) extends var 
viewAclsGroups: Option[String] = None def toView(): AttemptInfoWrapper = { - val apiInfo = new v1.ApplicationAttemptInfo( + val apiInfo = ApplicationAttemptInfo( attemptId, startTime, endTime, diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index 6399dccc1676a..d3dd58996a0bb 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -21,6 +21,7 @@ import javax.servlet.http.HttpServletRequest import scala.xml.Node +import org.apache.spark.status.api.v1.ApplicationInfo import org.apache.spark.ui.{UIUtils, WebUIPage} private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") { @@ -30,7 +31,8 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") val requestedIncomplete = Option(UIUtils.stripXSS(request.getParameter("showIncomplete"))).getOrElse("false").toBoolean - val allAppsSize = parent.getApplicationList().count(_.completed != requestedIncomplete) + val allAppsSize = parent.getApplicationList() + .count(isApplicationCompleted(_) != requestedIncomplete) val eventLogsUnderProcessCount = parent.getEventLogsUnderProcess() val lastUpdatedTime = parent.getLastUpdatedTime() val providerConfig = parent.getProviderConfig() @@ -88,4 +90,8 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") private def makePageLink(showIncomplete: Boolean): String = { UIUtils.prependBaseUri("/?" + "showIncomplete=" + showIncomplete) } + + private def isApplicationCompleted(appInfo: ApplicationInfo): Boolean = { + appInfo.attempts.nonEmpty && appInfo.attempts.head.completed + } } diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index b822a48e98e91..75484f5c9f30f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -30,7 +30,7 @@ import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ -import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationInfo, ApplicationsListResource, UIRoot} +import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationInfo, UIRoot} import org.apache.spark.ui.{SparkUI, UIUtils, WebUI} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.{ShutdownHookManager, SystemClock, Utils} @@ -175,7 +175,7 @@ class HistoryServer( * * @return List of all known applications. 
*/ - def getApplicationList(): Iterator[ApplicationHistoryInfo] = { + def getApplicationList(): Iterator[ApplicationInfo] = { provider.getListing() } @@ -188,11 +188,11 @@ class HistoryServer( } def getApplicationInfoList: Iterator[ApplicationInfo] = { - getApplicationList().map(ApplicationsListResource.appHistoryInfoToPublicAppInfo) + getApplicationList() } def getApplicationInfo(appId: String): Option[ApplicationInfo] = { - provider.getApplicationInfo(appId).map(ApplicationsListResource.appHistoryInfoToPublicAppInfo) + provider.getApplicationInfo(appId) } override def writeEventLogs( diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 9c23d9d8c923a..6da44cbc44c4d 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -77,7 +77,7 @@ private[spark] class AppStatusListener( override def onApplicationStart(event: SparkListenerApplicationStart): Unit = { assert(event.appId.isDefined, "Application without IDs are not supported.") - val attempt = new v1.ApplicationAttemptInfo( + val attempt = v1.ApplicationAttemptInfo( event.appAttemptId, new Date(event.time), new Date(-1), @@ -87,7 +87,7 @@ private[spark] class AppStatusListener( false, sparkVersion) - appInfo = new v1.ApplicationInfo( + appInfo = v1.ApplicationInfo( event.appId.get, event.appName, None, @@ -122,7 +122,7 @@ private[spark] class AppStatusListener( override def onApplicationEnd(event: SparkListenerApplicationEnd): Unit = { val old = appInfo.attempts.head - val attempt = new v1.ApplicationAttemptInfo( + val attempt = v1.ApplicationAttemptInfo( old.attemptId, old.startTime, new Date(event.time), @@ -132,7 +132,7 @@ private[spark] class AppStatusListener( true, old.appSparkVersion) - appInfo = new v1.ApplicationInfo( + appInfo = v1.ApplicationInfo( appInfo.id, appInfo.name, None, diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala index 91660a524ca93..69054f2b771f1 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/ApplicationListResource.scala @@ -20,8 +20,6 @@ import java.util.{Date, List => JList} import javax.ws.rs.{DefaultValue, GET, Produces, QueryParam} import javax.ws.rs.core.MediaType -import org.apache.spark.deploy.history.ApplicationHistoryInfo - @Produces(Array(MediaType.APPLICATION_JSON)) private[v1] class ApplicationListResource extends ApiRequestContext { @@ -67,33 +65,3 @@ private[v1] class ApplicationListResource extends ApiRequestContext { startTimeOk && endTimeOk } } - -private[spark] object ApplicationsListResource { - def appHistoryInfoToPublicAppInfo(app: ApplicationHistoryInfo): ApplicationInfo = { - new ApplicationInfo( - id = app.id, - name = app.name, - coresGranted = None, - maxCores = None, - coresPerExecutor = None, - memoryPerExecutorMB = None, - attempts = app.attempts.map { internalAttemptInfo => - new ApplicationAttemptInfo( - attemptId = internalAttemptInfo.attemptId, - startTime = new Date(internalAttemptInfo.startTime), - endTime = new Date(internalAttemptInfo.endTime), - duration = - if (internalAttemptInfo.endTime > 0) { - internalAttemptInfo.endTime - internalAttemptInfo.startTime - } else { - 0 - }, - lastUpdated = new Date(internalAttemptInfo.lastUpdated), - 
sparkUser = internalAttemptInfo.sparkUser, - completed = internalAttemptInfo.completed, - appSparkVersion = internalAttemptInfo.appSparkVersion - ) - } - ) - } -} diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala index 14280099f6422..45eaf935fb083 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala @@ -24,27 +24,27 @@ import com.fasterxml.jackson.databind.annotation.JsonDeserialize import org.apache.spark.JobExecutionStatus -class ApplicationInfo private[spark]( - val id: String, - val name: String, - val coresGranted: Option[Int], - val maxCores: Option[Int], - val coresPerExecutor: Option[Int], - val memoryPerExecutorMB: Option[Int], - val attempts: Seq[ApplicationAttemptInfo]) +case class ApplicationInfo private[spark]( + id: String, + name: String, + coresGranted: Option[Int], + maxCores: Option[Int], + coresPerExecutor: Option[Int], + memoryPerExecutorMB: Option[Int], + attempts: Seq[ApplicationAttemptInfo]) @JsonIgnoreProperties( value = Array("startTimeEpoch", "endTimeEpoch", "lastUpdatedEpoch"), allowGetters = true) -class ApplicationAttemptInfo private[spark]( - val attemptId: Option[String], - val startTime: Date, - val endTime: Date, - val lastUpdated: Date, - val duration: Long, - val sparkUser: String, - val completed: Boolean = false, - val appSparkVersion: String) { +case class ApplicationAttemptInfo private[spark]( + attemptId: Option[String], + startTime: Date, + endTime: Date, + lastUpdated: Date, + duration: Long, + sparkUser: String, + completed: Boolean = false, + appSparkVersion: String) { def getStartTimeEpoch: Long = startTime.getTime diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index 86c8cdf43258c..84ee01c7f5aaf 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.deploy.history import java.io._ import java.nio.charset.StandardCharsets +import java.util.Date import java.util.concurrent.TimeUnit import java.util.zip.{ZipInputStream, ZipOutputStream} @@ -42,6 +43,7 @@ import org.apache.spark.io._ import org.apache.spark.scheduler._ import org.apache.spark.security.GroupMappingServiceProvider import org.apache.spark.status.AppStatusStore +import org.apache.spark.status.api.v1.{ApplicationAttemptInfo, ApplicationInfo} import org.apache.spark.util.{Clock, JsonProtocol, ManualClock, Utils} class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { @@ -114,9 +116,12 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc end: Long, lastMod: Long, user: String, - completed: Boolean): ApplicationHistoryInfo = { - ApplicationHistoryInfo(id, name, - List(ApplicationAttemptInfo(None, start, end, lastMod, user, completed, ""))) + completed: Boolean): ApplicationInfo = { + + val duration = if (end > 0) end - start else 0 + new ApplicationInfo(id, name, None, None, None, None, + List(ApplicationAttemptInfo(None, new Date(start), + new Date(end), new Date(lastMod), duration, user, completed, ""))) } // For completed files, lastUpdated would be lastModified time. 
@@ -667,7 +672,7 @@ class FsHistoryProviderSuite extends SparkFunSuite with BeforeAndAfter with Matc * } */ private def updateAndCheck(provider: FsHistoryProvider) - (checkFn: Seq[ApplicationHistoryInfo] => Unit): Unit = { + (checkFn: Seq[ApplicationInfo] => Unit): Unit = { provider.checkForLogs() provider.cleanLogs() checkFn(provider.getListing().toSeq) diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index d22a19e8af74a..4e4395d0fb959 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -45,6 +45,7 @@ import org.scalatest.selenium.WebBrowser import org.apache.spark._ import org.apache.spark.deploy.history.config._ +import org.apache.spark.status.api.v1.ApplicationInfo import org.apache.spark.status.api.v1.JobData import org.apache.spark.ui.SparkUI import org.apache.spark.util.{ResetSystemProperties, Utils} @@ -505,6 +506,10 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers getAppUI.store.jobsList(List(JobExecutionStatus.RUNNING).asJava) } + def isApplicationCompleted(appInfo: ApplicationInfo): Boolean = { + appInfo.attempts.nonEmpty && appInfo.attempts.head.completed + } + activeJobs() should have size 0 completedJobs() should have size 1 getNumJobs("") should be (1) @@ -537,7 +542,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers assert(4 === getNumJobsRestful(), s"two jobs back-to-back not updated, server=$server\n") } val jobcount = getNumJobs("/jobs") - assert(!provider.getListing().next.completed) + assert(!isApplicationCompleted(provider.getListing().next)) listApplications(false) should contain(appId) @@ -545,7 +550,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers resetSparkContext() // check the app is now found as completed eventually(stdTimeout, stdInterval) { - assert(provider.getListing().next.completed, + assert(isApplicationCompleted(provider.getListing().next), s"application never completed, server=$server\n") } From acf7ef3154e094875fa89f30a78ab111b267db91 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Sat, 9 Dec 2017 11:53:15 +0900 Subject: [PATCH 075/356] [SPARK-12297][SQL] Adjust timezone for int96 data from impala ## What changes were proposed in this pull request? Int96 data written by impala vs data written by hive & spark is stored slightly differently -- they use a different offset for the timezone. This adds an option "spark.sql.parquet.int96TimestampConversion" (false by default) to adjust timestamps if and only if the writer is impala (or more precisely, if the parquet file's "createdBy" metadata does not start with "parquet-mr"). This matches the existing behavior in hive from HIVE-9482. ## How was this patch tested? Unit test added, existing tests run via jenkins. Author: Imran Rashid Author: Henry Robinson Closes #19769 from squito/SPARK-12297_skip_conversion. 
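For illustration, a minimal usage sketch of the new option (the config key is the one added by this patch; the file path and the ambient `spark` session, e.g. from spark-shell, are assumed for the example):

```scala
// Sketch only: opt into the INT96 adjustment before reading a table that mixes
// Impala-written and Spark/Hive-written parquet files. The flag is off by default,
// so behavior is unchanged unless it is set explicitly.
spark.conf.set("spark.sql.parquet.int96TimestampConversion", "true")
val df = spark.read.parquet("/warehouse/mixed_impala_and_spark_table")  // illustrative path
df.select("ts").show()
```

Files whose "createdBy" metadata starts with "parquet-mr" (Hive/Spark writers) are left untouched even with the flag enabled; the adjustment is decided per file and applied only to files written by other engines such as Impala.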
--- .../sql/catalyst/util/DateTimeUtils.scala | 1 + .../apache/spark/sql/internal/SQLConf.scala | 9 ++ .../parquet/VectorizedColumnReader.java | 59 +++++++-- .../VectorizedParquetRecordReader.java | 18 ++- .../parquet/ParquetFileFormat.scala | 30 ++++- .../parquet/ParquetReadSupport.scala | 15 ++- .../parquet/ParquetRecordMaterializer.scala | 7 +- .../parquet/ParquetRowConverter.scala | 11 +- .../resources/test-data/impala_timestamp.parq | Bin 0 -> 374 bytes .../ParquetInteroperabilitySuite.scala | 114 ++++++++++++++++++ 10 files changed, 237 insertions(+), 27 deletions(-) create mode 100644 sql/core/src/test/resources/test-data/impala_timestamp.parq diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 746c3e8950f7b..b1ed25645b36c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -61,6 +61,7 @@ object DateTimeUtils { final val YearZero = -17999 final val toYearZero = to2001 + 7304850 final val TimeZoneGMT = TimeZone.getTimeZone("GMT") + final val TimeZoneUTC = TimeZone.getTimeZone("UTC") final val MonthOf31Days = Set(1, 3, 5, 7, 8, 10, 12) val TIMEZONE_OPTION = "timeZone" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index ce9cc562b220f..1121444cc938a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -291,6 +291,13 @@ object SQLConf { .booleanConf .createWithDefault(true) + val PARQUET_INT96_TIMESTAMP_CONVERSION = buildConf("spark.sql.parquet.int96TimestampConversion") + .doc("This controls whether timestamp adjustments should be applied to INT96 data when " + + "converting to timestamps, for data written by Impala. 
This is necessary because Impala " + + "stores INT96 data with a different timezone offset than Hive & Spark.") + .booleanConf + .createWithDefault(false) + object ParquetOutputTimestampType extends Enumeration { val INT96, TIMESTAMP_MICROS, TIMESTAMP_MILLIS = Value } @@ -1206,6 +1213,8 @@ class SQLConf extends Serializable with Logging { def isParquetINT96AsTimestamp: Boolean = getConf(PARQUET_INT96_AS_TIMESTAMP) + def isParquetINT96TimestampConversion: Boolean = getConf(PARQUET_INT96_TIMESTAMP_CONVERSION) + def isParquetINT64AsTimestampMillis: Boolean = getConf(PARQUET_INT64_AS_TIMESTAMP_MILLIS) def parquetOutputTimestampType: ParquetOutputTimestampType.Value = { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 71ca8b1b96a98..b7646969bcf3d 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.datasources.parquet; import java.io.IOException; +import java.util.TimeZone; import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.ColumnDescriptor; @@ -93,13 +94,18 @@ public class VectorizedColumnReader { private final PageReader pageReader; private final ColumnDescriptor descriptor; private final OriginalType originalType; + // The timezone conversion to apply to int96 timestamps. Null if no conversion. + private final TimeZone convertTz; + private final static TimeZone UTC = DateTimeUtils.TimeZoneUTC(); public VectorizedColumnReader( ColumnDescriptor descriptor, OriginalType originalType, - PageReader pageReader) throws IOException { + PageReader pageReader, + TimeZone convertTz) throws IOException { this.descriptor = descriptor; this.pageReader = pageReader; + this.convertTz = convertTz; this.originalType = originalType; this.maxDefLevel = descriptor.getMaxDefinitionLevel(); @@ -222,6 +228,10 @@ void readBatch(int total, WritableColumnVector column) throws IOException { } } + private boolean shouldConvertTimestamps() { + return convertTz != null && !convertTz.equals(UTC); + } + /** * Reads `num` values into column, decoding the values from `dictionaryIds` and `dictionary`. 
*/ @@ -294,11 +304,21 @@ private void decodeDictionaryIds( break; case INT96: if (column.dataType() == DataTypes.TimestampType) { - for (int i = rowId; i < rowId + num; ++i) { - // TODO: Convert dictionary of Binaries to dictionary of Longs - if (!column.isNullAt(i)) { - Binary v = dictionary.decodeToBinary(dictionaryIds.getDictId(i)); - column.putLong(i, ParquetRowConverter.binaryToSQLTimestamp(v)); + if (!shouldConvertTimestamps()) { + for (int i = rowId; i < rowId + num; ++i) { + if (!column.isNullAt(i)) { + Binary v = dictionary.decodeToBinary(dictionaryIds.getDictId(i)); + column.putLong(i, ParquetRowConverter.binaryToSQLTimestamp(v)); + } + } + } else { + for (int i = rowId; i < rowId + num; ++i) { + if (!column.isNullAt(i)) { + Binary v = dictionary.decodeToBinary(dictionaryIds.getDictId(i)); + long rawTime = ParquetRowConverter.binaryToSQLTimestamp(v); + long adjTime = DateTimeUtils.convertTz(rawTime, convertTz, UTC); + column.putLong(i, adjTime); + } } } } else { @@ -428,13 +448,26 @@ private void readBinaryBatch(int rowId, int num, WritableColumnVector column) { if (column.dataType() == DataTypes.StringType || column.dataType() == DataTypes.BinaryType) { defColumn.readBinarys(num, column, rowId, maxDefLevel, data); } else if (column.dataType() == DataTypes.TimestampType) { - for (int i = 0; i < num; i++) { - if (defColumn.readInteger() == maxDefLevel) { - column.putLong(rowId + i, - // Read 12 bytes for INT96 - ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12))); - } else { - column.putNull(rowId + i); + if (!shouldConvertTimestamps()) { + for (int i = 0; i < num; i++) { + if (defColumn.readInteger() == maxDefLevel) { + // Read 12 bytes for INT96 + long rawTime = ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12)); + column.putLong(rowId + i, rawTime); + } else { + column.putNull(rowId + i); + } + } + } else { + for (int i = 0; i < num; i++) { + if (defColumn.readInteger() == maxDefLevel) { + // Read 12 bytes for INT96 + long rawTime = ParquetRowConverter.binaryToSQLTimestamp(data.readBinary(12)); + long adjTime = DateTimeUtils.convertTz(rawTime, convertTz, UTC); + column.putLong(rowId + i, adjTime); + } else { + column.putNull(rowId + i); + } } } } else { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java index 669d71e60779d..14f2a58d638f0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.Arrays; import java.util.List; +import java.util.TimeZone; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.TaskAttemptContext; @@ -77,6 +78,12 @@ public class VectorizedParquetRecordReader extends SpecificParquetRecordReaderBa */ private boolean[] missingColumns; + /** + * The timezone that timestamp INT96 values should be converted to. Null if no conversion. Here to workaround + * incompatibilities between different engines when writing timestamp values. + */ + private TimeZone convertTz = null; + /** * columnBatch object that is used for batch decoding. This is created on first use and triggers * batched decoding. 
It is not valid to interleave calls to the batched interface with the row @@ -105,10 +112,15 @@ public class VectorizedParquetRecordReader extends SpecificParquetRecordReaderBa */ private final MemoryMode MEMORY_MODE; - public VectorizedParquetRecordReader(boolean useOffHeap) { + public VectorizedParquetRecordReader(TimeZone convertTz, boolean useOffHeap) { + this.convertTz = convertTz; MEMORY_MODE = useOffHeap ? MemoryMode.OFF_HEAP : MemoryMode.ON_HEAP; } + public VectorizedParquetRecordReader(boolean useOffHeap) { + this(null, useOffHeap); + } + /** * Implementation of RecordReader API. */ @@ -291,8 +303,8 @@ private void checkEndOfRowGroup() throws IOException { columnReaders = new VectorizedColumnReader[columns.size()]; for (int i = 0; i < columns.size(); ++i) { if (missingColumns[i]) continue; - columnReaders[i] = new VectorizedColumnReader( - columns.get(i), types.get(i).getOriginalType(), pages.getPageReader(columns.get(i))); + columnReaders[i] = new VectorizedColumnReader(columns.get(i), types.get(i).getOriginalType(), + pages.getPageReader(columns.get(i)), convertTz); } totalCountLoadedSoFar += pages.getRowCount(); } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 2b1064955a777..45bedf70f975c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -45,6 +45,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.parser.LegacyTypeStringParser +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.vectorized.{OffHeapColumnVector, OnHeapColumnVector} import org.apache.spark.sql.internal.SQLConf @@ -307,6 +308,9 @@ class ParquetFileFormat hadoopConf.set( ParquetWriteSupport.SPARK_ROW_SCHEMA, requiredSchema.json) + hadoopConf.set( + SQLConf.SESSION_LOCAL_TIMEZONE.key, + sparkSession.sessionState.conf.sessionLocalTimeZone) ParquetWriteSupport.setSchema(requiredSchema, hadoopConf) @@ -345,6 +349,8 @@ class ParquetFileFormat resultSchema.forall(_.dataType.isInstanceOf[AtomicType]) val enableRecordFilter: Boolean = sparkSession.sessionState.conf.parquetRecordFilterEnabled + val timestampConversion: Boolean = + sparkSession.sessionState.conf.isParquetINT96TimestampConversion // Whole stage codegen (PhysicalRDD) is able to deal with batches directly val returningBatch = supportBatch(sparkSession, resultSchema) @@ -363,6 +369,22 @@ class ParquetFileFormat fileSplit.getLocations, null) + val sharedConf = broadcastedHadoopConf.value.value + // PARQUET_INT96_TIMESTAMP_CONVERSION says to apply timezone conversions to int96 timestamps' + // *only* if the file was created by something other than "parquet-mr", so check the actual + // writer here for this file. We have to do this per-file, as each file in the table may + // have different writers. 
+ def isCreatedByParquetMr(): Boolean = { + val footer = ParquetFileReader.readFooter(sharedConf, fileSplit.getPath, SKIP_ROW_GROUPS) + footer.getFileMetaData().getCreatedBy().startsWith("parquet-mr") + } + val convertTz = + if (timestampConversion && !isCreatedByParquetMr()) { + Some(DateTimeUtils.getTimeZone(sharedConf.get(SQLConf.SESSION_LOCAL_TIMEZONE.key))) + } else { + None + } + val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(broadcastedHadoopConf.value.value, attemptId) @@ -374,8 +396,8 @@ class ParquetFileFormat } val taskContext = Option(TaskContext.get()) val parquetReader = if (enableVectorizedReader) { - val vectorizedReader = - new VectorizedParquetRecordReader(enableOffHeapColumnVector && taskContext.isDefined) + val vectorizedReader = new VectorizedParquetRecordReader( + convertTz.orNull, enableOffHeapColumnVector && taskContext.isDefined) vectorizedReader.initialize(split, hadoopAttemptContext) logDebug(s"Appending $partitionSchema ${file.partitionValues}") vectorizedReader.initBatch(partitionSchema, file.partitionValues) @@ -388,9 +410,9 @@ class ParquetFileFormat // ParquetRecordReader returns UnsafeRow val reader = if (pushed.isDefined && enableRecordFilter) { val parquetFilter = FilterCompat.get(pushed.get, null) - new ParquetRecordReader[UnsafeRow](new ParquetReadSupport, parquetFilter) + new ParquetRecordReader[UnsafeRow](new ParquetReadSupport(convertTz), parquetFilter) } else { - new ParquetRecordReader[UnsafeRow](new ParquetReadSupport) + new ParquetRecordReader[UnsafeRow](new ParquetReadSupport(convertTz)) } reader.initialize(split, hadoopAttemptContext) reader diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala index 2854cb1bc0c25..40ce5d5e0564e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.parquet -import java.util.{Map => JMap} +import java.util.{Map => JMap, TimeZone} import scala.collection.JavaConverters._ @@ -48,9 +48,17 @@ import org.apache.spark.sql.types._ * Due to this reason, we no longer rely on [[ReadContext]] to pass requested schema from [[init()]] * to [[prepareForRead()]], but use a private `var` for simplicity. */ -private[parquet] class ParquetReadSupport extends ReadSupport[UnsafeRow] with Logging { +private[parquet] class ParquetReadSupport(val convertTz: Option[TimeZone]) + extends ReadSupport[UnsafeRow] with Logging { private var catalystRequestedSchema: StructType = _ + def this() { + // We need a zero-arg constructor for SpecificParquetRecordReaderBase. But that is only + // used in the vectorized reader, where we get the convertTz value directly, and the value here + // is ignored. + this(None) + } + /** * Called on executor side before [[prepareForRead()]] and instantiating actual Parquet record * readers. Responsible for figuring out Parquet requested schema used for column pruning. 
@@ -95,7 +103,8 @@ private[parquet] class ParquetReadSupport extends ReadSupport[UnsafeRow] with Lo new ParquetRecordMaterializer( parquetRequestedSchema, ParquetReadSupport.expandUDT(catalystRequestedSchema), - new ParquetToSparkSchemaConverter(conf)) + new ParquetToSparkSchemaConverter(conf), + convertTz) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala index 793755e9aaeb5..b2459dd0e8bba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRecordMaterializer.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.datasources.parquet +import java.util.TimeZone + import org.apache.parquet.io.api.{GroupConverter, RecordMaterializer} import org.apache.parquet.schema.MessageType @@ -33,11 +35,12 @@ import org.apache.spark.sql.types.StructType private[parquet] class ParquetRecordMaterializer( parquetSchema: MessageType, catalystSchema: StructType, - schemaConverter: ParquetToSparkSchemaConverter) + schemaConverter: ParquetToSparkSchemaConverter, + convertTz: Option[TimeZone]) extends RecordMaterializer[UnsafeRow] { private val rootConverter = - new ParquetRowConverter(schemaConverter, parquetSchema, catalystSchema, NoopUpdater) + new ParquetRowConverter(schemaConverter, parquetSchema, catalystSchema, convertTz, NoopUpdater) override def getCurrentRecord: UnsafeRow = rootConverter.currentRecord diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala index 10f6c3b4f15e3..1199725941842 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRowConverter.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.math.{BigDecimal, BigInteger} import java.nio.ByteOrder +import java.util.TimeZone import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer @@ -117,12 +118,14 @@ private[parquet] class ParquetPrimitiveConverter(val updater: ParentContainerUpd * @param parquetType Parquet schema of Parquet records * @param catalystType Spark SQL schema that corresponds to the Parquet record type. User-defined * types should have been expanded. + * @param convertTz the optional time zone to convert to for int96 data * @param updater An updater which propagates converted field values to the parent container */ private[parquet] class ParquetRowConverter( schemaConverter: ParquetToSparkSchemaConverter, parquetType: GroupType, catalystType: StructType, + convertTz: Option[TimeZone], updater: ParentContainerUpdater) extends ParquetGroupConverter(updater) with Logging { @@ -151,6 +154,8 @@ private[parquet] class ParquetRowConverter( |${catalystType.prettyJson} """.stripMargin) + private val UTC = DateTimeUtils.TimeZoneUTC + /** * Updater used together with field converters within a [[ParquetRowConverter]]. It propagates * converted filed values to the `ordinal`-th cell in `currentRow`. 
@@ -279,7 +284,9 @@ private[parquet] class ParquetRowConverter( val buf = value.toByteBuffer.order(ByteOrder.LITTLE_ENDIAN) val timeOfDayNanos = buf.getLong val julianDay = buf.getInt - updater.setLong(DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos)) + val rawTime = DateTimeUtils.fromJulianDay(julianDay, timeOfDayNanos) + val adjTime = convertTz.map(DateTimeUtils.convertTz(rawTime, _, UTC)).getOrElse(rawTime) + updater.setLong(adjTime) } } @@ -309,7 +316,7 @@ private[parquet] class ParquetRowConverter( case t: StructType => new ParquetRowConverter( - schemaConverter, parquetType.asGroupType(), t, new ParentContainerUpdater { + schemaConverter, parquetType.asGroupType(), t, convertTz, new ParentContainerUpdater { override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy()) }) diff --git a/sql/core/src/test/resources/test-data/impala_timestamp.parq b/sql/core/src/test/resources/test-data/impala_timestamp.parq new file mode 100644 index 0000000000000000000000000000000000000000..21e5318db98c9d0e328bd7c00999a2d1d4bce86d GIT binary patch literal 374 zcmWG=3^EjD5%m!D@eyScWno}Y>0vlDDB$vr3=ARJK(^Zj-M^d+ z4EJ(W8AKUGMMNcZKw5wpsDMj_iGhKEjgg62g+Xlvql_qs2UdB$a07 zq$pUJ8z!ctrJ1FerJ9;0nVP4h8XFs!TBf8X8>N`1r5anL8X23JC8cOe%E$n{avF#O HfFTY5EiXyU literal 0 HcmV?d00001 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index 9dc56292c3720..e3edafa9c84e1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -19,7 +19,15 @@ package org.apache.spark.sql.execution.datasources.parquet import java.io.File +import org.apache.commons.io.FileUtils +import org.apache.hadoop.fs.{FileSystem, Path, PathFilter} +import org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER +import org.apache.parquet.hadoop.ParquetFileReader +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName + import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedSQLContext { @@ -87,4 +95,110 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS Row(Seq(2, 3)))) } } + + test("parquet timestamp conversion") { + // Make a table with one parquet file written by impala, and one parquet file written by spark. 
+ // We should only adjust the timestamps in the impala file, and only if the conf is set + val impalaFile = "test-data/impala_timestamp.parq" + + // here are the timestamps in the impala file, as they were saved by impala + val impalaFileData = + Seq( + "2001-01-01 01:01:01", + "2002-02-02 02:02:02", + "2003-03-03 03:03:03" + ).map(java.sql.Timestamp.valueOf) + val impalaPath = Thread.currentThread().getContextClassLoader.getResource(impalaFile) + .toURI.getPath + withTempPath { tableDir => + val ts = Seq( + "2004-04-04 04:04:04", + "2005-05-05 05:05:05", + "2006-06-06 06:06:06" + ).map { s => java.sql.Timestamp.valueOf(s) } + import testImplicits._ + // match the column names of the file from impala + val df = spark.createDataset(ts).toDF().repartition(1).withColumnRenamed("value", "ts") + df.write.parquet(tableDir.getAbsolutePath) + FileUtils.copyFile(new File(impalaPath), new File(tableDir, "part-00001.parq")) + + Seq(false, true).foreach { int96TimestampConversion => + Seq(false, true).foreach { vectorized => + withSQLConf( + (SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key, + SQLConf.ParquetOutputTimestampType.INT96.toString), + (SQLConf.PARQUET_INT96_TIMESTAMP_CONVERSION.key, int96TimestampConversion.toString()), + (SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, vectorized.toString()) + ) { + val readBack = spark.read.parquet(tableDir.getAbsolutePath).collect() + assert(readBack.size === 6) + // if we apply the conversion, we'll get the "right" values, as saved by impala in the + // original file. Otherwise, they're off by the local timezone offset, set to + // America/Los_Angeles in tests + val impalaExpectations = if (int96TimestampConversion) { + impalaFileData + } else { + impalaFileData.map { ts => + DateTimeUtils.toJavaTimestamp(DateTimeUtils.convertTz( + DateTimeUtils.fromJavaTimestamp(ts), + DateTimeUtils.TimeZoneUTC, + DateTimeUtils.getTimeZone(conf.sessionLocalTimeZone))) + } + } + val fullExpectations = (ts ++ impalaExpectations).map(_.toString).sorted.toArray + val actual = readBack.map(_.getTimestamp(0).toString).sorted + withClue( + s"int96TimestampConversion = $int96TimestampConversion; vectorized = $vectorized") { + assert(fullExpectations === actual) + + // Now test that the behavior is still correct even with a filter which could get + // pushed down into parquet. We don't need extra handling for pushed down + // predicates because (a) in ParquetFilters, we ignore TimestampType and (b) parquet + // does not read statistics from int96 fields, as they are unsigned. See + // scalastyle:off line.size.limit + // https://github.com/apache/parquet-mr/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L419 + // https://github.com/apache/parquet-mr/blob/2fd62ee4d524c270764e9b91dca72e5cf1a005b7/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java#L348 + // scalastyle:on line.size.limit + // + // Just to be defensive in case anything ever changes in parquet, this test checks + // the assumption on column stats, and also the end-to-end behavior. + + val hadoopConf = sparkContext.hadoopConfiguration + val fs = FileSystem.get(hadoopConf) + val parts = fs.listStatus(new Path(tableDir.getAbsolutePath), new PathFilter { + override def accept(path: Path): Boolean = !path.getName.startsWith("_") + }) + // grab the meta data from the parquet file. 
The next section of asserts just make + // sure the test is configured correctly. + assert(parts.size == 2) + parts.foreach { part => + val oneFooter = + ParquetFileReader.readFooter(hadoopConf, part.getPath, NO_FILTER) + assert(oneFooter.getFileMetaData.getSchema.getColumns.size === 1) + assert(oneFooter.getFileMetaData.getSchema.getColumns.get(0).getType() === + PrimitiveTypeName.INT96) + val oneBlockMeta = oneFooter.getBlocks().get(0) + val oneBlockColumnMeta = oneBlockMeta.getColumns().get(0) + val columnStats = oneBlockColumnMeta.getStatistics + // This is the important assert. Column stats are written, but they are ignored + // when the data is read back as mentioned above, b/c int96 is unsigned. This + // assert makes sure this holds even if we change parquet versions (if eg. there + // were ever statistics even on unsigned columns). + assert(columnStats.isEmpty) + } + + // These queries should return the entire dataset with the conversion applied, + // but if the predicates were applied to the raw values in parquet, they would + // incorrectly filter data out. + val query = spark.read.parquet(tableDir.getAbsolutePath) + .where("ts > '2001-01-01 01:00:00'") + val countWithFilter = query.count() + val exp = if (int96TimestampConversion) 6 else 5 + assert(countWithFilter === exp, query) + } + } + } + } + } + } } From 251b2c03b4b2846c9577390f87db3db511be7721 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 9 Dec 2017 20:20:28 +0900 Subject: [PATCH 076/356] [SPARK-22672][SQL][TEST][FOLLOWUP] Fix to use `spark.conf` ## What changes were proposed in this pull request? During https://github.com/apache/spark/pull/19882, `conf` is mistakenly used to switch ORC implementation between `native` and `hive`. To affect `OrcTest` correctly, `spark.conf` should be used. ## How was this patch tested? Pass the tests. Author: Dongjoon Hyun Closes #19931 from dongjoon-hyun/SPARK-22672-2. --- .../spark/sql/execution/datasources/orc/OrcTest.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala index d94cb850ed2a2..38b34a03e3e4c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcTest.scala @@ -51,12 +51,12 @@ abstract class OrcTest extends QueryTest with SQLTestUtils with BeforeAndAfterAl protected override def beforeAll(): Unit = { super.beforeAll() - originalConfORCImplementation = conf.getConf(ORC_IMPLEMENTATION) - conf.setConf(ORC_IMPLEMENTATION, orcImp) + originalConfORCImplementation = spark.conf.get(ORC_IMPLEMENTATION) + spark.conf.set(ORC_IMPLEMENTATION.key, orcImp) } protected override def afterAll(): Unit = { - conf.setConf(ORC_IMPLEMENTATION, originalConfORCImplementation) + spark.conf.set(ORC_IMPLEMENTATION.key, originalConfORCImplementation) super.afterAll() } From ab1b6ee73157fa5401b189924263795bfad919c9 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sat, 9 Dec 2017 09:28:46 -0600 Subject: [PATCH 077/356] [BUILD] update release scripts ## What changes were proposed in this pull request? Change to dist.apache.org instead of home directory sha512 should have .sha512 extension. From ASF release signing doc: "The checksum SHOULD be generated using SHA-512. A .sha file SHOULD contain a SHA-1 checksum, for historical reasons." 
NOTE: I *think* should require some changes to work with Jenkins' release build ## How was this patch tested? manually Author: Felix Cheung Closes #19754 from felixcheung/releasescript. --- dev/create-release/release-build.sh | 122 ++++++++++++---------------- dev/create-release/release-tag.sh | 8 +- 2 files changed, 60 insertions(+), 70 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 5b43f9bab7505..c71137468054f 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -23,23 +23,19 @@ usage: release-build.sh Creates build deliverables from a Spark commit. Top level targets are - package: Create binary packages and copy them to home.apache - docs: Build docs and copy them to home.apache + package: Create binary packages and commit them to dist.apache.org/repos/dist/dev/spark/ + docs: Build docs and commit them to dist.apache.org/repos/dist/dev/spark/ publish-snapshot: Publish snapshot release to Apache snapshots publish-release: Publish a release to Apache release repo All other inputs are environment variables GIT_REF - Release tag or commit to build from -SPARK_VERSION - Version of Spark being built (e.g. 2.1.2) SPARK_PACKAGE_VERSION - Release identifier in top level package directory (e.g. 2.1.2-rc1) -REMOTE_PARENT_DIR - Parent in which to create doc or release builds. -REMOTE_PARENT_MAX_LENGTH - If set, parent directory will be cleaned to only - have this number of subdirectories (by deleting old ones). WARNING: This deletes data. +SPARK_VERSION - (optional) Version of Spark being built (e.g. 2.1.2) ASF_USERNAME - Username of ASF committer account ASF_PASSWORD - Password of ASF committer account -ASF_RSA_KEY - RSA private key file for ASF committer account GPG_KEY - GPG key used to sign release artifacts GPG_PASSPHRASE - Passphrase for GPG key @@ -57,7 +53,20 @@ if [[ $@ == *"help"* ]]; then exit_with_usage fi -for env in ASF_USERNAME ASF_RSA_KEY GPG_PASSPHRASE GPG_KEY; do +if [[ -z "$ASF_PASSWORD" ]]; then + echo 'The environment variable ASF_PASSWORD is not set. Enter the password.' + echo + stty -echo && printf "ASF password: " && read ASF_PASSWORD && printf '\n' && stty echo +fi + +if [[ -z "$GPG_PASSPHRASE" ]]; then + echo 'The environment variable GPG_PASSPHRASE is not set. Enter the passphrase to' + echo 'unlock the GPG signing key that will be used to sign the release!' 
+ echo + stty -echo && printf "GPG passphrase: " && read GPG_PASSPHRASE && printf '\n' && stty echo +fi + +for env in ASF_USERNAME GPG_PASSPHRASE GPG_KEY; do if [ -z "${!env}" ]; then echo "ERROR: $env must be set to run this script" exit_with_usage @@ -71,8 +80,7 @@ export LC_ALL=C # Commit ref to checkout when building GIT_REF=${GIT_REF:-master} -# Destination directory parent on remote server -REMOTE_PARENT_DIR=${REMOTE_PARENT_DIR:-/home/$ASF_USERNAME/public_html} +RELEASE_STAGING_LOCATION="https://dist.apache.org/repos/dist/dev/spark" GPG="gpg -u $GPG_KEY --no-tty --batch" NEXUS_ROOT=https://repository.apache.org/service/local/staging @@ -142,42 +150,16 @@ if [ -z "$SPARK_PACKAGE_VERSION" ]; then SPARK_PACKAGE_VERSION="${SPARK_VERSION}-$(date +%Y_%m_%d_%H_%M)-${git_hash}" fi -DEST_DIR_NAME="spark-$SPARK_PACKAGE_VERSION" - -function LFTP { - SSH="ssh -o ConnectTimeout=300 -o StrictHostKeyChecking=no -i $ASF_RSA_KEY" - COMMANDS=$(cat < \ spark-$SPARK_VERSION.tgz.md5 echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ - SHA512 spark-$SPARK_VERSION.tgz > spark-$SPARK_VERSION.tgz.sha + SHA512 spark-$SPARK_VERSION.tgz > spark-$SPARK_VERSION.tgz.sha512 rm -rf spark-$SPARK_VERSION # Updated for each binary build @@ -238,7 +220,7 @@ if [[ "$1" == "package" ]]; then $R_DIST_NAME.md5 echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ SHA512 $R_DIST_NAME > \ - $R_DIST_NAME.sha + $R_DIST_NAME.sha512 else echo "Creating distribution with PIP package" ./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz --pip $FLAGS \ @@ -257,7 +239,7 @@ if [[ "$1" == "package" ]]; then $PYTHON_DIST_NAME.md5 echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ SHA512 $PYTHON_DIST_NAME > \ - $PYTHON_DIST_NAME.sha + $PYTHON_DIST_NAME.sha512 fi echo "Copying and signing regular binary distribution" @@ -270,7 +252,7 @@ if [[ "$1" == "package" ]]; then spark-$SPARK_VERSION-bin-$NAME.tgz.md5 echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \ SHA512 spark-$SPARK_VERSION-bin-$NAME.tgz > \ - spark-$SPARK_VERSION-bin-$NAME.tgz.sha + spark-$SPARK_VERSION-bin-$NAME.tgz.sha512 } # TODO: Check exit codes of children here: @@ -284,22 +266,20 @@ if [[ "$1" == "package" ]]; then wait rm -rf spark-$SPARK_VERSION-bin-*/ - # Copy data - dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-bin" - echo "Copying release tarballs to $dest_dir" - # Put to new directory: - LFTP mkdir -p $dest_dir || true - LFTP mput -O $dest_dir 'spark-*' - LFTP mput -O $dest_dir 'pyspark-*' - LFTP mput -O $dest_dir 'SparkR_*' - # Delete /latest directory and rename new upload to /latest - LFTP "rm -r -f $REMOTE_PARENT_DIR/latest || exit 0" - LFTP mv $dest_dir "$REMOTE_PARENT_DIR/latest" - # Re-upload a second time and leave the files in the timestamped upload directory: - LFTP mkdir -p $dest_dir || true - LFTP mput -O $dest_dir 'spark-*' - LFTP mput -O $dest_dir 'pyspark-*' - LFTP mput -O $dest_dir 'SparkR_*' + svn co --depth=empty $RELEASE_STAGING_LOCATION svn-spark + rm -rf "svn-spark/${DEST_DIR_NAME}-bin" + mkdir -p "svn-spark/${DEST_DIR_NAME}-bin" + + echo "Copying release tarballs" + cp spark-* "svn-spark/${DEST_DIR_NAME}-bin/" + cp pyspark-* "svn-spark/${DEST_DIR_NAME}-bin/" + cp SparkR_* "svn-spark/${DEST_DIR_NAME}-bin/" + svn add "svn-spark/${DEST_DIR_NAME}-bin" + + cd svn-spark + svn ci --username $ASF_USERNAME --password "$ASF_PASSWORD" -m"Apache Spark $SPARK_PACKAGE_VERSION" + cd .. 
+ rm -rf svn-spark exit 0 fi @@ -307,21 +287,24 @@ if [[ "$1" == "docs" ]]; then # Documentation cd spark echo "Building Spark docs" - dest_dir="$REMOTE_PARENT_DIR/${DEST_DIR_NAME}-docs" cd docs # TODO: Make configurable to add this: PRODUCTION=1 PRODUCTION=1 RELEASE_VERSION="$SPARK_VERSION" jekyll build - echo "Copying release documentation to $dest_dir" - # Put to new directory: - LFTP mkdir -p $dest_dir || true - LFTP mirror -R _site $dest_dir - # Delete /latest directory and rename new upload to /latest - LFTP "rm -r -f $REMOTE_PARENT_DIR/latest || exit 0" - LFTP mv $dest_dir "$REMOTE_PARENT_DIR/latest" - # Re-upload a second time and leave the files in the timestamped upload directory: - LFTP mkdir -p $dest_dir || true - LFTP mirror -R _site $dest_dir cd .. + cd .. + + svn co --depth=empty $RELEASE_STAGING_LOCATION svn-spark + rm -rf "svn-spark/${DEST_DIR_NAME}-docs" + mkdir -p "svn-spark/${DEST_DIR_NAME}-docs" + + echo "Copying release documentation" + cp -R "spark/docs/_site" "svn-spark/${DEST_DIR_NAME}-docs/" + svn add "svn-spark/${DEST_DIR_NAME}-docs" + + cd svn-spark + svn ci --username $ASF_USERNAME --password "$ASF_PASSWORD" -m"Apache Spark $SPARK_PACKAGE_VERSION docs" + cd .. + rm -rf svn-spark exit 0 fi @@ -399,6 +382,7 @@ if [[ "$1" == "publish-release" ]]; then find . -type f |grep -v \.jar |grep -v \.pom | xargs rm echo "Creating hash and signature files" + # this must have .asc, .md5 and .sha1 - it really doesn't like anything else there for file in $(find . -type f) do echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --output $file.asc \ diff --git a/dev/create-release/release-tag.sh b/dev/create-release/release-tag.sh index 370a62ce15bc4..a05716a5f66bb 100755 --- a/dev/create-release/release-tag.sh +++ b/dev/create-release/release-tag.sh @@ -41,6 +41,12 @@ if [[ $@ == *"help"* ]]; then exit_with_usage fi +if [[ -z "$ASF_PASSWORD" ]]; then + echo 'The environment variable ASF_PASSWORD is not set. Enter the password.' + echo + stty -echo && printf "ASF password: " && read ASF_PASSWORD && printf '\n' && stty echo +fi + for env in ASF_USERNAME ASF_PASSWORD RELEASE_VERSION RELEASE_TAG NEXT_VERSION GIT_EMAIL GIT_NAME GIT_BRANCH; do if [ -z "${!env}" ]; then echo "$env must be set to run this script" @@ -52,7 +58,7 @@ ASF_SPARK_REPO="git-wip-us.apache.org/repos/asf/spark.git" MVN="build/mvn --force" rm -rf spark -git clone https://$ASF_USERNAME:$ASF_PASSWORD@$ASF_SPARK_REPO -b $GIT_BRANCH +git clone "https://$ASF_USERNAME:$ASF_PASSWORD@$ASF_SPARK_REPO" -b $GIT_BRANCH cd spark git config user.name "$GIT_NAME" From 4289ac9d8dbbc45fc2ee6d0250a2113107bf08d0 Mon Sep 17 00:00:00 2001 From: zouchenjun Date: Sun, 10 Dec 2017 20:36:14 -0800 Subject: [PATCH 078/356] [SPARK-22496][SQL] thrift server adds operation logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? since hive 2.0+ upgrades log4j to log4j2,a lot of [changes](https://issues.apache.org/jira/browse/HIVE-11304) are made working on it. as spark is not to ready to update its inner hive version(1.2.1) , so I manage to make little changes. the function registerCurrentOperationLog is moved from SQLOperstion to its parent class ExecuteStatementOperation so spark can use it. ## How was this patch tested? manual test Author: zouchenjun Closes #19721 from ChenjunZou/operation-log. 
--- .../cli/operation/ExecuteStatementOperation.java | 13 +++++++++++++ .../hive/service/cli/operation/SQLOperation.java | 12 ------------ .../SparkExecuteStatementOperation.scala | 1 + 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java index 3f2de108f069a..dc7de3c49554b 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.ql.processors.CommandProcessor; import org.apache.hadoop.hive.ql.processors.CommandProcessorFactory; +import org.apache.hadoop.hive.ql.session.OperationLog; import org.apache.hive.service.cli.HiveSQLException; import org.apache.hive.service.cli.OperationType; import org.apache.hive.service.cli.session.HiveSession; @@ -67,4 +68,16 @@ protected void setConfOverlay(Map confOverlay) { this.confOverlay = confOverlay; } } + + protected void registerCurrentOperationLog() { + if (isOperationLogEnabled) { + if (operationLog == null) { + LOG.warn("Failed to get current OperationLog object of Operation: " + + getHandle().getHandleIdentifier()); + isOperationLogEnabled = false; + return; + } + OperationLog.setCurrentOperationLog(operationLog); + } + } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java index 5014cedd870b6..fd9108eb53ca9 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java @@ -274,18 +274,6 @@ private Hive getSessionHive() throws HiveSQLException { } } - private void registerCurrentOperationLog() { - if (isOperationLogEnabled) { - if (operationLog == null) { - LOG.warn("Failed to get current OperationLog object of Operation: " + - getHandle().getHandleIdentifier()); - isOperationLogEnabled = false; - return; - } - OperationLog.setCurrentOperationLog(operationLog); - } - } - private void cleanup(OperationState state) throws HiveSQLException { setState(state); if (shouldRunAsync()) { diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index f5191fa9132bd..664bc20601eaa 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -170,6 +170,7 @@ private[hive] class SparkExecuteStatementOperation( override def run(): Unit = { val doAsAction = new PrivilegedExceptionAction[Unit]() { override def run(): Unit = { + registerCurrentOperationLog() try { execute() } catch { From ec873a4fd20a47cf0791456bfb301f25a34ae014 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Mon, 11 Dec 2017 06:35:31 -0600 Subject: [PATCH 079/356] [SPARK-14516][FOLLOWUP] Adding ClusteringEvaluator to examples ## What changes were proposed in this pull request? 
In SPARK-14516 we have introduced ClusteringEvaluator, but we didn't put any reference in the documentation and the examples were still relying on the sum of squared errors to show a way to evaluate the clustering model. The PR adds the ClusteringEvaluator in the examples. ## How was this patch tested? Manual runs of the examples. Author: Marco Gaido Closes #19676 from mgaido91/SPARK-14516_examples. --- .../apache/spark/examples/ml/JavaKMeansExample.java | 12 +++++++++--- examples/src/main/python/ml/kmeans_example.py | 12 +++++++++--- .../org/apache/spark/examples/ml/KMeansExample.scala | 12 +++++++++--- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java index d8f948ae38cb3..dc4b0bcb59657 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java @@ -20,6 +20,7 @@ // $example on$ import org.apache.spark.ml.clustering.KMeansModel; import org.apache.spark.ml.clustering.KMeans; +import org.apache.spark.ml.evaluation.ClusteringEvaluator; import org.apache.spark.ml.linalg.Vector; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -51,9 +52,14 @@ public static void main(String[] args) { KMeans kmeans = new KMeans().setK(2).setSeed(1L); KMeansModel model = kmeans.fit(dataset); - // Evaluate clustering by computing Within Set Sum of Squared Errors. - double WSSSE = model.computeCost(dataset); - System.out.println("Within Set Sum of Squared Errors = " + WSSSE); + // Make predictions + Dataset predictions = model.transform(dataset); + + // Evaluate clustering by computing Silhouette score + ClusteringEvaluator evaluator = new ClusteringEvaluator(); + + double silhouette = evaluator.evaluate(predictions); + System.out.println("Silhouette with squared euclidean distance = " + silhouette); // Shows the result. Vector[] centers = model.clusterCenters(); diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index 6846ec4599714..5f77843e3743a 100644 --- a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -19,6 +19,7 @@ # $example on$ from pyspark.ml.clustering import KMeans +from pyspark.ml.evaluation import ClusteringEvaluator # $example off$ from pyspark.sql import SparkSession @@ -45,9 +46,14 @@ kmeans = KMeans().setK(2).setSeed(1) model = kmeans.fit(dataset) - # Evaluate clustering by computing Within Set Sum of Squared Errors. - wssse = model.computeCost(dataset) - print("Within Set Sum of Squared Errors = " + str(wssse)) + # Make predictions + predictions = model.transform(dataset) + + # Evaluate clustering by computing Silhouette score + evaluator = ClusteringEvaluator() + + silhouette = evaluator.evaluate(predictions) + print("Silhouette with squared euclidean distance = " + str(silhouette)) # Shows the result. 
centers = model.clusterCenters() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala index a1d19e138dedb..2bc8184e623ff 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala @@ -21,6 +21,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.clustering.KMeans +import org.apache.spark.ml.evaluation.ClusteringEvaluator // $example off$ import org.apache.spark.sql.SparkSession @@ -47,9 +48,14 @@ object KMeansExample { val kmeans = new KMeans().setK(2).setSeed(1L) val model = kmeans.fit(dataset) - // Evaluate clustering by computing Within Set Sum of Squared Errors. - val WSSSE = model.computeCost(dataset) - println(s"Within Set Sum of Squared Errors = $WSSSE") + // Make predictions + val predictions = model.transform(dataset) + + // Evaluate clustering by computing Silhouette score + val evaluator = new ClusteringEvaluator() + + val silhouette = evaluator.evaluate(predictions) + println(s"Silhouette with squared euclidean distance = $silhouette") // Shows the result. println("Cluster Centers: ") From 6cc7021a40b64c41a51f337ec4be9545a25e838c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 11 Dec 2017 21:52:57 +0800 Subject: [PATCH 080/356] [SPARK-22267][SQL][TEST] Spark SQL incorrectly reads ORC files when column order is different ## What changes were proposed in this pull request? Until 2.2.1, with the default configuration, Apache Spark returns incorrect results when the ORC file schema order differs from the metastore schema order. This is due to the Hive 1.2.1 library and some issues with the `convertMetastoreOrc` option. ```scala scala> Seq(1 -> 2).toDF("c1", "c2").write.format("orc").mode("overwrite").save("/tmp/o") scala> sql("CREATE EXTERNAL TABLE o(c2 INT, c1 INT) STORED AS orc LOCATION '/tmp/o'") scala> spark.table("o").show // This is wrong. +---+---+ | c2| c1| +---+---+ | 1| 2| +---+---+ scala> spark.read.orc("/tmp/o").show // This is correct. +---+---+ | c1| c2| +---+---+ | 1| 2| +---+---+ ``` After [SPARK-22279](https://github.com/apache/spark/pull/19499), the default configuration doesn't have this bug. Although the Hive 1.2.1 library code path still has the problem, we should add test coverage for the current behavior in order to prevent future regressions. ## How was this patch tested? Pass the Jenkins with a newly added test. Author: Dongjoon Hyun Closes #19928 from dongjoon-hyun/SPARK-22267.
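To make the expected post-fix behavior concrete, here is a spark-shell style sketch of what the new regression test (below) pins down. The table name and path are illustrative, and both configs are set explicitly even though the values shown are the defaults the test targets (the test itself marks `convertMetastoreOrc=true` as the default since 2.3.0).

```scala
// Illustrative spark-shell session (not part of the patch).
// With the native ORC reader and metastore conversion enabled, columns are resolved
// by name, so a metastore schema whose column order differs from the ORC file's order
// still returns the right values.
spark.conf.set("spark.sql.orc.impl", "native")
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "true")

Seq(1 -> 2).toDF("c1", "c2").write.format("orc").mode("overwrite").save("/tmp/o")
spark.sql("CREATE EXTERNAL TABLE o(c2 INT, c1 INT) STORED AS ORC LOCATION '/tmp/o'")

// Expected: c2 = 2 and c1 = 1, matching spark.read.orc("/tmp/o") up to column order.
spark.table("o").show()
```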
--- .../sql/hive/execution/SQLQuerySuite.scala | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index c11e37a516646..f2562c33e2a6e 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2153,4 +2153,23 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } } + + test("SPARK-22267 Spark SQL incorrectly reads ORC files when column order is different") { + Seq("native", "hive").foreach { orcImpl => + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> orcImpl) { + withTempPath { f => + val path = f.getCanonicalPath + Seq(1 -> 2).toDF("c1", "c2").write.orc(path) + checkAnswer(spark.read.orc(path), Row(1, 2)) + + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { // default since 2.3.0 + withTable("t") { + sql(s"CREATE EXTERNAL TABLE t(c2 INT, c1 INT) STORED AS ORC LOCATION '$path'") + checkAnswer(spark.table("t"), Row(2, 1)) + } + } + } + } + } + } } From bf20abb2dc084b743289ea24fc8d6e47d6e6c0dd Mon Sep 17 00:00:00 2001 From: zuotingbing Date: Mon, 11 Dec 2017 13:36:15 -0600 Subject: [PATCH 081/356] [SPARK-22642][SQL] the createdTempDir will not be deleted if an exception occurs, should delete it with try-finally. ## What changes were proposed in this pull request? We found that staging directories are sometimes not dropped in our production environment. The createdTempDir will not be deleted if an exception occurs, so we should delete createdTempDir with try-finally. This PR is a follow-up to SPARK-18703. ## How was this patch tested? existing tests Author: zuotingbing Closes #19841 from zuotingbing/SPARK-stagedir. --- .../hive/execution/InsertIntoHiveTable.scala | 47 ++++++++++++------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala index 56e10bc457a00..b46addb6aa85b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/InsertIntoHiveTable.scala @@ -17,13 +17,14 @@ package org.apache.spark.sql.hive.execution +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.ErrorMsg import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, Dataset, Row, SparkSession} -import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, ExternalCatalog} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.command.CommandUtils @@ -91,6 +92,34 @@ case class InsertIntoHiveTable( ) val tableLocation = hiveQlTable.getDataLocation val tmpLocation = getExternalTmpPath(sparkSession, hadoopConf, tableLocation) + + try { + processInsert(sparkSession, externalCatalog, hadoopConf, tableDesc, tmpLocation) + } finally { + // Attempt to delete the staging directory and the inclusive files. If failed, the files are + // expected to be dropped at the normal termination of VM since deleteOnExit is used.
+ deleteExternalTmpPath(hadoopConf) + } + + // un-cache this table. + sparkSession.catalog.uncacheTable(table.identifier.quotedString) + sparkSession.sessionState.catalog.refreshTable(table.identifier) + + CommandUtils.updateTableStats(sparkSession, table) + + // It would be nice to just return the childRdd unchanged so insert operations could be chained, + // however for now we return an empty list to simplify compatibility checks with hive, which + // does not return anything for insert operations. + // TODO: implement hive compatibility as rules. + Seq.empty[Row] + } + + private def processInsert( + sparkSession: SparkSession, + externalCatalog: ExternalCatalog, + hadoopConf: Configuration, + tableDesc: TableDesc, + tmpLocation: Path): Unit = { val fileSinkConf = new FileSinkDesc(tmpLocation.toString, tableDesc, false) val numDynamicPartitions = partition.values.count(_.isEmpty) @@ -231,21 +260,5 @@ case class InsertIntoHiveTable( overwrite, isSrcLocal = false) } - - // Attempt to delete the staging directory and the inclusive files. If failed, the files are - // expected to be dropped at the normal termination of VM since deleteOnExit is used. - deleteExternalTmpPath(hadoopConf) - - // un-cache this table. - sparkSession.catalog.uncacheTable(table.identifier.quotedString) - sparkSession.sessionState.catalog.refreshTable(table.identifier) - - CommandUtils.updateTableStats(sparkSession, table) - - // It would be nice to just return the childRdd unchanged so insert operations could be chained, - // however for now we return an empty list to simplify compatibility checks with hive, which - // does not return anything for insert operations. - // TODO: implement hive compatibility as rules. - Seq.empty[Row] } } From a04f2bea67c9abf4149d33ac6c319cd4f85344d5 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 11 Dec 2017 13:08:42 -0800 Subject: [PATCH 082/356] Revert "[SPARK-22496][SQL] thrift server adds operation logs" This reverts commit 4289ac9d8dbbc45fc2ee6d0250a2113107bf08d0. 
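For context on what the reverted change did: the SPARK-22496 patch registered the per-statement operation log on the thread that actually executes the statement, so that log output produced during execution could be captured for that operation. A simplified, generic sketch of that thread-local registration pattern is shown below; it uses an invented `OperationLogContext` rather than Hive's real `OperationLog` API.

```scala
// Generic illustration of thread-local operation-log registration (not Hive's API).
object OperationLogContext {
  private val current = new ThreadLocal[StringBuilder]()

  def register(log: StringBuilder): Unit = current.set(log)
  def unregister(): Unit = current.remove()

  // Anything appended on the registered thread lands in that operation's buffer.
  def append(line: String): Unit =
    Option(current.get()).foreach(_.append(line).append('\n'))
}

object StatementRunner {
  // The registration must happen on the thread that runs the statement (for example,
  // inside a PrivilegedExceptionAction's run()); otherwise the worker thread's
  // ThreadLocal stays empty and the output is silently dropped.
  def runStatement(log: StringBuilder)(body: => Unit): Unit = {
    OperationLogContext.register(log)
    try body finally OperationLogContext.unregister()
  }
}
```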
--- .../cli/operation/ExecuteStatementOperation.java | 13 ------------- .../hive/service/cli/operation/SQLOperation.java | 12 ++++++++++++ .../SparkExecuteStatementOperation.scala | 1 - 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java index dc7de3c49554b..3f2de108f069a 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java @@ -23,7 +23,6 @@ import org.apache.hadoop.hive.ql.processors.CommandProcessor; import org.apache.hadoop.hive.ql.processors.CommandProcessorFactory; -import org.apache.hadoop.hive.ql.session.OperationLog; import org.apache.hive.service.cli.HiveSQLException; import org.apache.hive.service.cli.OperationType; import org.apache.hive.service.cli.session.HiveSession; @@ -68,16 +67,4 @@ protected void setConfOverlay(Map confOverlay) { this.confOverlay = confOverlay; } } - - protected void registerCurrentOperationLog() { - if (isOperationLogEnabled) { - if (operationLog == null) { - LOG.warn("Failed to get current OperationLog object of Operation: " + - getHandle().getHandleIdentifier()); - isOperationLogEnabled = false; - return; - } - OperationLog.setCurrentOperationLog(operationLog); - } - } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java index fd9108eb53ca9..5014cedd870b6 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java @@ -274,6 +274,18 @@ private Hive getSessionHive() throws HiveSQLException { } } + private void registerCurrentOperationLog() { + if (isOperationLogEnabled) { + if (operationLog == null) { + LOG.warn("Failed to get current OperationLog object of Operation: " + + getHandle().getHandleIdentifier()); + isOperationLogEnabled = false; + return; + } + OperationLog.setCurrentOperationLog(operationLog); + } + } + private void cleanup(OperationState state) throws HiveSQLException { setState(state); if (shouldRunAsync()) { diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index 664bc20601eaa..f5191fa9132bd 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -170,7 +170,6 @@ private[hive] class SparkExecuteStatementOperation( override def run(): Unit = { val doAsAction = new PrivilegedExceptionAction[Unit]() { override def run(): Unit = { - registerCurrentOperationLog() try { execute() } catch { From c235b5f9772be57122ca5109b9295cf8c85926df Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 11 Dec 2017 13:15:45 -0800 Subject: [PATCH 083/356] [SPARK-22746][SQL] Avoid the generation of useless mutable states by SortMergeJoin ## What changes were proposed in this pull request? 
This PR reduce the number of global mutable variables in generated code of `SortMergeJoin`. Before this PR, global mutable variables are used to extend lifetime of variables in the nested loop. This can be achieved by declaring variable at the outer most loop level where the variables are used. In the following example, `smj_value8`, `smj_value8`, and `smj_value9` are declared as local variable at lines 145-147 in `With this PR`. This PR fixes potential assertion error by #19865. Without this PR, a global mutable variable is potentially passed to arguments in generated code of split function. Without this PR ``` /* 010 */ int smj_value8; /* 011 */ boolean smj_value8; /* 012 */ int smj_value9; .. /* 143 */ protected void processNext() throws java.io.IOException { /* 144 */ while (findNextInnerJoinRows(smj_leftInput, smj_rightInput)) { /* 145 */ boolean smj_loaded = false; /* 146 */ smj_isNull6 = smj_leftRow.isNullAt(1); /* 147 */ smj_value9 = smj_isNull6 ? -1 : (smj_leftRow.getInt(1)); /* 148 */ scala.collection.Iterator smj_iterator = smj_matches.generateIterator(); /* 149 */ while (smj_iterator.hasNext()) { /* 150 */ InternalRow smj_rightRow1 = (InternalRow) smj_iterator.next(); /* 151 */ boolean smj_isNull8 = smj_rightRow1.isNullAt(1); /* 152 */ int smj_value11 = smj_isNull8 ? -1 : (smj_rightRow1.getInt(1)); /* 153 */ /* 154 */ boolean smj_value12 = (smj_isNull6 && smj_isNull8) || /* 155 */ (!smj_isNull6 && !smj_isNull8 && smj_value9 == smj_value11); /* 156 */ if (false || !smj_value12) continue; /* 157 */ if (!smj_loaded) { /* 158 */ smj_loaded = true; /* 159 */ smj_value8 = smj_leftRow.getInt(0); /* 160 */ } /* 161 */ int smj_value10 = smj_rightRow1.getInt(0); /* 162 */ smj_numOutputRows.add(1); /* 163 */ /* 164 */ smj_rowWriter.zeroOutNullBytes(); /* 165 */ /* 166 */ smj_rowWriter.write(0, smj_value8); /* 167 */ /* 168 */ if (smj_isNull6) { /* 169 */ smj_rowWriter.setNullAt(1); /* 170 */ } else { /* 171 */ smj_rowWriter.write(1, smj_value9); /* 172 */ } /* 173 */ /* 174 */ smj_rowWriter.write(2, smj_value10); /* 175 */ /* 176 */ if (smj_isNull8) { /* 177 */ smj_rowWriter.setNullAt(3); /* 178 */ } else { /* 179 */ smj_rowWriter.write(3, smj_value11); /* 180 */ } /* 181 */ append(smj_result.copy()); /* 182 */ /* 183 */ } /* 184 */ if (shouldStop()) return; /* 185 */ } /* 186 */ } ``` With this PR ``` /* 143 */ protected void processNext() throws java.io.IOException { /* 144 */ while (findNextInnerJoinRows(smj_leftInput, smj_rightInput)) { /* 145 */ int smj_value8 = -1; /* 146 */ boolean smj_isNull6 = false; /* 147 */ int smj_value9 = -1; /* 148 */ boolean smj_loaded = false; /* 149 */ smj_isNull6 = smj_leftRow.isNullAt(1); /* 150 */ smj_value9 = smj_isNull6 ? -1 : (smj_leftRow.getInt(1)); /* 151 */ scala.collection.Iterator smj_iterator = smj_matches.generateIterator(); /* 152 */ while (smj_iterator.hasNext()) { /* 153 */ InternalRow smj_rightRow1 = (InternalRow) smj_iterator.next(); /* 154 */ boolean smj_isNull8 = smj_rightRow1.isNullAt(1); /* 155 */ int smj_value11 = smj_isNull8 ? 
-1 : (smj_rightRow1.getInt(1)); /* 156 */ /* 157 */ boolean smj_value12 = (smj_isNull6 && smj_isNull8) || /* 158 */ (!smj_isNull6 && !smj_isNull8 && smj_value9 == smj_value11); /* 159 */ if (false || !smj_value12) continue; /* 160 */ if (!smj_loaded) { /* 161 */ smj_loaded = true; /* 162 */ smj_value8 = smj_leftRow.getInt(0); /* 163 */ } /* 164 */ int smj_value10 = smj_rightRow1.getInt(0); /* 165 */ smj_numOutputRows.add(1); /* 166 */ /* 167 */ smj_rowWriter.zeroOutNullBytes(); /* 168 */ /* 169 */ smj_rowWriter.write(0, smj_value8); /* 170 */ /* 171 */ if (smj_isNull6) { /* 172 */ smj_rowWriter.setNullAt(1); /* 173 */ } else { /* 174 */ smj_rowWriter.write(1, smj_value9); /* 175 */ } /* 176 */ /* 177 */ smj_rowWriter.write(2, smj_value10); /* 178 */ /* 179 */ if (smj_isNull8) { /* 180 */ smj_rowWriter.setNullAt(3); /* 181 */ } else { /* 182 */ smj_rowWriter.write(3, smj_value11); /* 183 */ } /* 184 */ append(smj_result.copy()); /* 185 */ /* 186 */ } /* 187 */ if (shouldStop()) return; /* 188 */ } /* 189 */ } ``` ## How was this patch tested? Existing test cases Author: Kazuaki Ishizaki Closes #19937 from kiszk/SPARK-22746. --- .../execution/joins/SortMergeJoinExec.scala | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 9c08ec71c1fde..554b73181116c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -507,32 +507,38 @@ case class SortMergeJoinExec( } /** - * Creates variables for left part of result row. + * Creates variables and declarations for left part of result row. * * In order to defer the access after condition and also only access once in the loop, * the variables should be declared separately from accessing the columns, we can't use the * codegen of BoundReference here. */ - private def createLeftVars(ctx: CodegenContext, leftRow: String): Seq[ExprCode] = { + private def createLeftVars(ctx: CodegenContext, leftRow: String): (Seq[ExprCode], Seq[String]) = { ctx.INPUT_ROW = leftRow left.output.zipWithIndex.map { case (a, i) => val value = ctx.freshName("value") val valueCode = ctx.getValue(leftRow, a.dataType, i.toString) - // declare it as class member, so we can access the column before or in the loop. - ctx.addMutableState(ctx.javaType(a.dataType), value) + val javaType = ctx.javaType(a.dataType) + val defaultValue = ctx.defaultValue(a.dataType) if (a.nullable) { val isNull = ctx.freshName("isNull") - ctx.addMutableState(ctx.JAVA_BOOLEAN, isNull) val code = s""" |$isNull = $leftRow.isNullAt($i); - |$value = $isNull ? ${ctx.defaultValue(a.dataType)} : ($valueCode); + |$value = $isNull ? $defaultValue : ($valueCode); """.stripMargin - ExprCode(code, isNull, value) + val leftVarsDecl = + s""" + |boolean $isNull = false; + |$javaType $value = $defaultValue; + """.stripMargin + (ExprCode(code, isNull, value), leftVarsDecl) } else { - ExprCode(s"$value = $valueCode;", "false", value) + val code = s"$value = $valueCode;" + val leftVarsDecl = s"""$javaType $value = $defaultValue;""" + (ExprCode(code, "false", value), leftVarsDecl) } - } + }.unzip } /** @@ -580,7 +586,7 @@ case class SortMergeJoinExec( val (leftRow, matches) = genScanner(ctx) // Create variables for row from both sides. 
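(Aside, before the remaining hunks: the essence of the change is replacing class-level mutable state with locals declared at the outermost loop where they are needed. The tiny Scala analogue below is purely illustrative; Spark's codegen emits Java source through `CodegenContext`, and the names here are invented.)

```scala
// Before: a field exists only so a value survives from the outer loop into the inner loop.
class JoinBefore {
  private var leftKey: Int = -1 // class-level mutable state, one more field per column
  def join(left: Seq[Int], matchesFor: Int => Seq[Int]): Seq[(Int, Int)] =
    left.flatMap { l =>
      leftKey = l
      matchesFor(l).map(r => (leftKey, r))
    }
}

// After: the value is declared where the outer loop iteration begins, so no field is needed
// and nothing mutable leaks out of the method.
class JoinAfter {
  def join(left: Seq[Int], matchesFor: Int => Seq[Int]): Seq[(Int, Int)] =
    left.flatMap { l =>
      val leftKey = l // local to the outer loop body
      matchesFor(l).map(r => (leftKey, r))
    }
}
```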
- val leftVars = createLeftVars(ctx, leftRow) + val (leftVars, leftVarDecl) = createLeftVars(ctx, leftRow) val rightRow = ctx.freshName("rightRow") val rightVars = createRightVar(ctx, rightRow) @@ -617,6 +623,7 @@ case class SortMergeJoinExec( s""" |while (findNextInnerJoinRows($leftInput, $rightInput)) { + | ${leftVarDecl.mkString("\n")} | ${beforeLoop.trim} | scala.collection.Iterator $iterator = $matches.generateIterator(); | while ($iterator.hasNext()) { From 3f4060c340d6bac412e8819c4388ccba226efcf3 Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Mon, 11 Dec 2017 15:14:59 -0800 Subject: [PATCH 084/356] [SPARK-22646][K8S] Spark on Kubernetes - basic submission client This PR contains implementation of the basic submission client for the cluster mode of Spark on Kubernetes. It's step 2 from the step-wise plan documented [here](https://github.com/apache-spark-on-k8s/spark/issues/441#issuecomment-330802935). This addition is covered by the [SPIP](http://apache-spark-developers-list.1001551.n3.nabble.com/SPIP-Spark-on-Kubernetes-td22147.html) vote which passed on Aug 31. This PR and #19468 together form a MVP of Spark on Kubernetes that allows users to run Spark applications that use resources locally within the driver and executor containers on Kubernetes 1.6 and up. Some changes on pom and build/test setup are copied over from #19468 to make this PR self contained and testable. The submission client is mainly responsible for creating the Kubernetes pod that runs the Spark driver. It follows a step-based approach to construct the driver pod, as the code under the `submit.steps` package shows. The steps are orchestrated by `DriverConfigurationStepsOrchestrator`. `Client` creates the driver pod and waits for the application to complete if it's configured to do so, which is the case by default. This PR also contains Dockerfiles of the driver and executor images. They are included because some of the environment variables set in the code would not make sense without referring to the Dockerfiles. * The patch contains unit tests which are passing. * Manual testing: ./build/mvn -Pkubernetes clean package succeeded. * It is a subset of the entire changelist hosted at http://github.com/apache-spark-on-k8s/spark which is in active use in several organizations. * There is integration testing enabled in the fork currently hosted by PepperData which is being moved over to RiseLAB CI. * Detailed documentation on trying out the patch in its entirety is in: https://apache-spark-on-k8s.github.io/userdocs/running-on-kubernetes.html cc rxin felixcheung mateiz (shepherd) k8s-big-data SIG members & contributors: mccheah foxish ash211 ssuchter varunkatta kimoonkim erikerlandson tnachen ifilonenko liyinan926 Author: Yinan Li Closes #19717 from liyinan926/spark-kubernetes-4. 
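The step-based construction described in the commit message can be pictured as a left fold of configuration steps over a driver specification. The sketch below uses simplified, invented types purely to illustrate the shape of the pipeline; the real interfaces are `DriverConfigurationStep` and `KubernetesDriverSpec` in the diff that follows, where the spec also carries the driver pod, the driver container, and any extra Kubernetes resources the steps create.

```scala
// Simplified illustration of the submission client's step pipeline (invented types).
case class DriverSpec(sparkConf: Map[String, String], podLabels: Map[String, String])

trait ConfigStep {
  def configureDriver(spec: DriverSpec): DriverSpec
}

class AddLabelsStep(labels: Map[String, String]) extends ConfigStep {
  override def configureDriver(spec: DriverSpec): DriverSpec =
    spec.copy(podLabels = spec.podLabels ++ labels)
}

class SetConfStep(key: String, value: String) extends ConfigStep {
  override def configureDriver(spec: DriverSpec): DriverSpec =
    spec.copy(sparkConf = spec.sparkConf + (key -> value))
}

object Orchestration {
  // The orchestrator decides which steps apply; the client then folds them over an
  // initial spec, and the final spec is what gets turned into the driver pod.
  def buildSpec(initial: DriverSpec, steps: Seq[ConfigStep]): DriverSpec =
    steps.foldLeft(initial)((spec, step) => step.configureDriver(spec))
}
```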
--- assembly/pom.xml | 10 + .../scala/org/apache/spark/SparkConf.scala | 6 +- .../scala/org/apache/spark/SparkContext.scala | 1 - .../org/apache/spark/deploy/SparkSubmit.scala | 48 +++- .../spark/deploy/SparkSubmitArguments.scala | 4 +- .../spark/internal/config/package.scala | 8 + .../scala/org/apache/spark/util/Utils.scala | 36 +++ .../org/apache/spark/SparkContextSuite.scala | 6 + .../spark/deploy/SparkSubmitSuite.scala | 27 ++ .../org/apache/spark/util/UtilsSuite.scala | 21 ++ docs/configuration.md | 20 ++ docs/running-on-yarn.md | 16 +- .../launcher/SparkSubmitOptionParser.java | 2 +- .../org/apache/spark/deploy/k8s/Config.scala | 85 ++++-- .../apache/spark/deploy/k8s/Constants.scala | 28 ++ ...DriverConfigurationStepsOrchestrator.scala | 125 +++++++++ .../submit/KubernetesClientApplication.scala | 240 +++++++++++++++++ .../k8s/submit/KubernetesDriverSpec.scala | 47 ++++ .../k8s/submit/KubernetesFileUtils.scala | 68 +++++ .../k8s/submit/LoggingPodStatusWatcher.scala | 180 +++++++++++++ .../deploy/k8s/submit/MainAppResource.scala | 21 ++ .../steps/BaseDriverConfigurationStep.scala | 165 ++++++++++++ .../steps/DependencyResolutionStep.scala | 62 +++++ .../steps/DriverConfigurationStep.scala | 30 +++ .../DriverKubernetesCredentialsStep.scala | 245 ++++++++++++++++++ .../steps/DriverServiceBootstrapStep.scala | 103 ++++++++ .../cluster/k8s/ExecutorPodFactory.scala | 13 +- .../k8s/KubernetesClusterManager.scala | 8 +- .../spark/deploy/k8s/submit/ClientSuite.scala | 234 +++++++++++++++++ ...rConfigurationStepsOrchestratorSuite.scala | 82 ++++++ .../BaseDriverConfigurationStepSuite.scala | 118 +++++++++ .../steps/DependencyResolutionStepSuite.scala | 81 ++++++ ...DriverKubernetesCredentialsStepSuite.scala | 153 +++++++++++ .../DriverServiceBootstrapStepSuite.scala | 180 +++++++++++++ .../src/main/dockerfiles/driver/Dockerfile | 34 +++ .../src/main/dockerfiles/executor/Dockerfile | 34 +++ .../main/dockerfiles/spark-base/Dockerfile | 47 ++++ .../main/dockerfiles/spark-base/entrypoint.sh | 37 +++ .../org/apache/spark/deploy/yarn/config.scala | 8 - 39 files changed, 2566 insertions(+), 67 deletions(-) create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverSpec.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesFileUtils.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStep.scala create mode 100644 
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/ClientSuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStepSuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStepSuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala create mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile create mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile create mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile create mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh diff --git a/assembly/pom.xml b/assembly/pom.xml index 01fe354235e5b..b3b4239771bc3 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -148,6 +148,16 @@
+ + kubernetes + + + org.apache.spark + spark-kubernetes_${scala.binary.version} + ${project.version} + + + hive diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 0e08ff65e4784..4b1286d91e8f3 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -668,7 +668,11 @@ private[spark] object SparkConf extends Logging { MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM.key -> Seq( AlternateConfig("spark.reducer.maxReqSizeShuffleToMem", "2.3")), LISTENER_BUS_EVENT_QUEUE_CAPACITY.key -> Seq( - AlternateConfig("spark.scheduler.listenerbus.eventqueue.size", "2.3")) + AlternateConfig("spark.scheduler.listenerbus.eventqueue.size", "2.3")), + DRIVER_MEMORY_OVERHEAD.key -> Seq( + AlternateConfig("spark.yarn.driver.memoryOverhead", "2.3")), + EXECUTOR_MEMORY_OVERHEAD.key -> Seq( + AlternateConfig("spark.yarn.executor.memoryOverhead", "2.3")) ) /** diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 71f1e7c7321bc..92e13ce1ba042 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -18,7 +18,6 @@ package org.apache.spark import java.io._ -import java.lang.reflect.Constructor import java.net.URI import java.util.{Arrays, Locale, Properties, ServiceLoader, UUID} import java.util.concurrent.{ConcurrentHashMap, ConcurrentMap} diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index cfcdce648d330..ab834bb682041 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -76,7 +76,8 @@ object SparkSubmit extends CommandLineUtils with Logging { private val STANDALONE = 2 private val MESOS = 4 private val LOCAL = 8 - private val ALL_CLUSTER_MGRS = YARN | STANDALONE | MESOS | LOCAL + private val KUBERNETES = 16 + private val ALL_CLUSTER_MGRS = YARN | STANDALONE | MESOS | LOCAL | KUBERNETES // Deploy modes private val CLIENT = 1 @@ -97,6 +98,8 @@ object SparkSubmit extends CommandLineUtils with Logging { "org.apache.spark.deploy.yarn.YarnClusterApplication" private[deploy] val REST_CLUSTER_SUBMIT_CLASS = classOf[RestSubmissionClientApp].getName() private[deploy] val STANDALONE_CLUSTER_SUBMIT_CLASS = classOf[ClientApp].getName() + private[deploy] val KUBERNETES_CLUSTER_SUBMIT_CLASS = + "org.apache.spark.deploy.k8s.submit.KubernetesClientApplication" // scalastyle:off println private[spark] def printVersionAndExit(): Unit = { @@ -257,9 +260,10 @@ object SparkSubmit extends CommandLineUtils with Logging { YARN case m if m.startsWith("spark") => STANDALONE case m if m.startsWith("mesos") => MESOS + case m if m.startsWith("k8s") => KUBERNETES case m if m.startsWith("local") => LOCAL case _ => - printErrorAndExit("Master must either be yarn or start with spark, mesos, local") + printErrorAndExit("Master must either be yarn or start with spark, mesos, k8s, or local") -1 } @@ -294,6 +298,16 @@ object SparkSubmit extends CommandLineUtils with Logging { } } + if (clusterManager == KUBERNETES) { + args.master = Utils.checkAndGetK8sMasterUrl(args.master) + // Make sure KUBERNETES is included in our build if we're trying to use it + if (!Utils.classIsLoadable(KUBERNETES_CLUSTER_SUBMIT_CLASS) && !Utils.isTesting) { + printErrorAndExit( + "Could not load KUBERNETES 
classes. " + + "This copy of Spark may not have been compiled with KUBERNETES support.") + } + } + // Fail fast, the following modes are not supported or applicable (clusterManager, deployMode) match { case (STANDALONE, CLUSTER) if args.isPython => @@ -302,6 +316,12 @@ object SparkSubmit extends CommandLineUtils with Logging { case (STANDALONE, CLUSTER) if args.isR => printErrorAndExit("Cluster deploy mode is currently not supported for R " + "applications on standalone clusters.") + case (KUBERNETES, _) if args.isPython => + printErrorAndExit("Python applications are currently not supported for Kubernetes.") + case (KUBERNETES, _) if args.isR => + printErrorAndExit("R applications are currently not supported for Kubernetes.") + case (KUBERNETES, CLIENT) => + printErrorAndExit("Client mode is currently not supported for Kubernetes.") case (LOCAL, CLUSTER) => printErrorAndExit("Cluster deploy mode is not compatible with master \"local\"") case (_, CLUSTER) if isShell(args.primaryResource) => @@ -322,6 +342,7 @@ object SparkSubmit extends CommandLineUtils with Logging { val isYarnCluster = clusterManager == YARN && deployMode == CLUSTER val isMesosCluster = clusterManager == MESOS && deployMode == CLUSTER val isStandAloneCluster = clusterManager == STANDALONE && deployMode == CLUSTER + val isKubernetesCluster = clusterManager == KUBERNETES && deployMode == CLUSTER if (!isMesosCluster && !isStandAloneCluster) { // Resolve maven dependencies if there are any and add classpath to jars. Add them to py-files @@ -557,19 +578,19 @@ object SparkSubmit extends CommandLineUtils with Logging { OptionAssigner(args.keytab, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.keytab"), // Other options - OptionAssigner(args.executorCores, STANDALONE | YARN, ALL_DEPLOY_MODES, + OptionAssigner(args.executorCores, STANDALONE | YARN | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.executor.cores"), - OptionAssigner(args.executorMemory, STANDALONE | MESOS | YARN, ALL_DEPLOY_MODES, + OptionAssigner(args.executorMemory, STANDALONE | MESOS | YARN | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.executor.memory"), - OptionAssigner(args.totalExecutorCores, STANDALONE | MESOS, ALL_DEPLOY_MODES, + OptionAssigner(args.totalExecutorCores, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.cores.max"), OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, ALL_DEPLOY_MODES, confKey = "spark.files"), OptionAssigner(args.jars, LOCAL, CLIENT, confKey = "spark.jars"), OptionAssigner(args.jars, STANDALONE | MESOS, ALL_DEPLOY_MODES, confKey = "spark.jars"), - OptionAssigner(args.driverMemory, STANDALONE | MESOS | YARN, CLUSTER, + OptionAssigner(args.driverMemory, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER, confKey = "spark.driver.memory"), - OptionAssigner(args.driverCores, STANDALONE | MESOS | YARN, CLUSTER, + OptionAssigner(args.driverCores, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER, confKey = "spark.driver.cores"), OptionAssigner(args.supervise.toString, STANDALONE | MESOS, CLUSTER, confKey = "spark.driver.supervise"), @@ -703,6 +724,19 @@ object SparkSubmit extends CommandLineUtils with Logging { } } + if (isKubernetesCluster) { + childMainClass = KUBERNETES_CLUSTER_SUBMIT_CLASS + if (args.primaryResource != SparkLauncher.NO_RESOURCE) { + childArgs ++= Array("--primary-java-resource", args.primaryResource) + } + childArgs ++= Array("--main-class", args.mainClass) + if (args.childArgs != null) { + args.childArgs.foreach { arg => + childArgs += ("--arg", arg) + } + } + } + // Load any properties 
specified through --conf and the default properties file for ((k, v) <- args.sparkProperties) { sparkConf.setIfMissing(k, v) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index a7722e4f86023..9db7a1fe3106d 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -515,8 +515,8 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S outStream.println( s""" |Options: - | --master MASTER_URL spark://host:port, mesos://host:port, yarn, or local - | (Default: local[*]). + | --master MASTER_URL spark://host:port, mesos://host:port, yarn, + | k8s://https://host:port, or local (Default: local[*]). | --deploy-mode DEPLOY_MODE Whether to launch the driver program locally ("client") or | on one of the worker machines inside the cluster ("cluster") | (Default: client). diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 8fa25c0281493..172ba85359da7 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -41,6 +41,10 @@ package object config { .bytesConf(ByteUnit.MiB) .createWithDefaultString("1g") + private[spark] val DRIVER_MEMORY_OVERHEAD = ConfigBuilder("spark.driver.memoryOverhead") + .bytesConf(ByteUnit.MiB) + .createOptional + private[spark] val EVENT_LOG_COMPRESS = ConfigBuilder("spark.eventLog.compress") .booleanConf @@ -80,6 +84,10 @@ package object config { .bytesConf(ByteUnit.MiB) .createWithDefaultString("1g") + private[spark] val EXECUTOR_MEMORY_OVERHEAD = ConfigBuilder("spark.executor.memoryOverhead") + .bytesConf(ByteUnit.MiB) + .createOptional + private[spark] val MEMORY_OFFHEAP_ENABLED = ConfigBuilder("spark.memory.offHeap.enabled") .doc("If true, Spark will attempt to use off-heap memory for certain operations. " + "If off-heap memory use is enabled, then spark.memory.offHeap.size must be positive.") diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 51bf91614c866..1ed09dc489440 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2744,6 +2744,42 @@ private[spark] object Utils extends Logging { } } + /** + * Check the validity of the given Kubernetes master URL and return the resolved URL. Prefix + * "k8s:" is appended to the resolved URL as the prefix is used by KubernetesClusterManager + * in canCreate to determine if the KubernetesClusterManager should be used. + */ + def checkAndGetK8sMasterUrl(rawMasterURL: String): String = { + require(rawMasterURL.startsWith("k8s://"), + "Kubernetes master URL must start with k8s://.") + val masterWithoutK8sPrefix = rawMasterURL.substring("k8s://".length) + + // To handle master URLs, e.g., k8s://host:port. + if (!masterWithoutK8sPrefix.contains("://")) { + val resolvedURL = s"https://$masterWithoutK8sPrefix" + logInfo("No scheme specified for kubernetes master URL, so defaulting to https. 
Resolved " + + s"URL is $resolvedURL.") + return s"k8s:$resolvedURL" + } + + val masterScheme = new URI(masterWithoutK8sPrefix).getScheme + val resolvedURL = masterScheme.toLowerCase match { + case "https" => + masterWithoutK8sPrefix + case "http" => + logWarning("Kubernetes master URL uses HTTP instead of HTTPS.") + masterWithoutK8sPrefix + case null => + val resolvedURL = s"https://$masterWithoutK8sPrefix" + logInfo("No scheme specified for kubernetes master URL, so defaulting to https. Resolved " + + s"URL is $resolvedURL.") + resolvedURL + case _ => + throw new IllegalArgumentException("Invalid Kubernetes master scheme: " + masterScheme) + } + + return s"k8s:$resolvedURL" + } } private[util] object CallerContext extends Logging { diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index 2bde8757dae5d..37fcc93c62fa8 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -550,6 +550,12 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } } + test("client mode with a k8s master url") { + intercept[SparkException] { + sc = new SparkContext("k8s://https://host:port", "test", new SparkConf()) + } + } + testCancellingTasks("that raise interrupted exception on cancel") { Thread.sleep(9999999) } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index e200755e639e1..35594ec47c941 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -388,6 +388,33 @@ class SparkSubmitSuite conf.get("spark.ui.enabled") should be ("false") } + test("handles k8s cluster mode") { + val clArgs = Seq( + "--deploy-mode", "cluster", + "--master", "k8s://host:port", + "--executor-memory", "5g", + "--class", "org.SomeClass", + "--driver-memory", "4g", + "--conf", "spark.kubernetes.namespace=spark", + "--conf", "spark.kubernetes.driver.docker.image=bar", + "/home/thejar.jar", + "arg1") + val appArgs = new SparkSubmitArguments(clArgs) + val (childArgs, classpath, conf, mainClass) = prepareSubmitEnvironment(appArgs) + + val childArgsMap = childArgs.grouped(2).map(a => a(0) -> a(1)).toMap + childArgsMap.get("--primary-java-resource") should be (Some("file:/home/thejar.jar")) + childArgsMap.get("--main-class") should be (Some("org.SomeClass")) + childArgsMap.get("--arg") should be (Some("arg1")) + mainClass should be (KUBERNETES_CLUSTER_SUBMIT_CLASS) + classpath should have length (0) + conf.get("spark.master") should be ("k8s:https://host:port") + conf.get("spark.executor.memory") should be ("5g") + conf.get("spark.driver.memory") should be ("4g") + conf.get("spark.kubernetes.namespace") should be ("spark") + conf.get("spark.kubernetes.driver.docker.image") should be ("bar") + } + test("handles confs with flag equivalents") { val clArgs = Seq( "--deploy-mode", "cluster", diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 4d3adeb968e84..5c4e4ca0cded6 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -1146,6 +1146,27 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { } } + test("check Kubernetes master URL") { + val 
k8sMasterURLHttps = Utils.checkAndGetK8sMasterUrl("k8s://https://host:port") + assert(k8sMasterURLHttps === "k8s:https://host:port") + + val k8sMasterURLHttp = Utils.checkAndGetK8sMasterUrl("k8s://http://host:port") + assert(k8sMasterURLHttp === "k8s:http://host:port") + + val k8sMasterURLWithoutScheme = Utils.checkAndGetK8sMasterUrl("k8s://127.0.0.1:8443") + assert(k8sMasterURLWithoutScheme === "k8s:https://127.0.0.1:8443") + + val k8sMasterURLWithoutScheme2 = Utils.checkAndGetK8sMasterUrl("k8s://127.0.0.1") + assert(k8sMasterURLWithoutScheme2 === "k8s:https://127.0.0.1") + + intercept[IllegalArgumentException] { + Utils.checkAndGetK8sMasterUrl("k8s:https://host:port") + } + + intercept[IllegalArgumentException] { + Utils.checkAndGetK8sMasterUrl("k8s://foo://host:port") + } + } } private class SimpleExtension diff --git a/docs/configuration.md b/docs/configuration.md index ef061dd39dcba..d70bac134808f 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -157,6 +157,16 @@ of the most common options to set are: or in your default properties file. + + spark.driver.memoryOverhead + driverMemory * 0.10, with minimum of 384 + + The amount of off-heap memory (in megabytes) to be allocated per driver in cluster mode. This is + memory that accounts for things like VM overheads, interned strings, other native overheads, etc. + This tends to grow with the container size (typically 6-10%). This option is currently supported + on YARN and Kubernetes. + + spark.executor.memory 1g @@ -164,6 +174,16 @@ of the most common options to set are: Amount of memory to use per executor process (e.g. 2g, 8g). + + spark.executor.memoryOverhead + executorMemory * 0.10, with minimum of 384 + + The amount of off-heap memory (in megabytes) to be allocated per executor. This is memory that + accounts for things like VM overheads, interned strings, other native overheads, etc. This tends + to grow with the executor size (typically 6-10%). This option is currently supported on YARN and + Kubernetes. + + spark.extraListeners (none) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 9599d40c545b2..7e2386f33b583 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -227,25 +227,11 @@ To use a custom metrics.properties for the application master and executors, upd The number of executors for static allocation. With spark.dynamicAllocation.enabled, the initial set of executors will be at least this large. - - spark.yarn.executor.memoryOverhead - executorMemory * 0.10, with minimum of 384 - - The amount of off-heap memory (in megabytes) to be allocated per executor. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the executor size (typically 6-10%). - - - - spark.yarn.driver.memoryOverhead - driverMemory * 0.10, with minimum of 384 - - The amount of off-heap memory (in megabytes) to be allocated per driver in cluster mode. This is memory that accounts for things like VM overheads, interned strings, other native overheads, etc. This tends to grow with the container size (typically 6-10%). - - spark.yarn.am.memoryOverhead AM memory * 0.10, with minimum of 384 - Same as spark.yarn.driver.memoryOverhead, but for the YARN Application Master in client mode. + Same as spark.driver.memoryOverhead, but for the YARN Application Master in client mode. 
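Since the YARN-specific overhead settings are now deprecated aliases, applications can set the generic keys directly. A minimal sketch follows; the values are illustrative.

```scala
import org.apache.spark.SparkConf

object OverheadConfExample {
  // The new keys work on YARN and Kubernetes alike; the deprecated
  // spark.yarn.{driver,executor}.memoryOverhead keys are remapped to them by
  // SparkConf's alternate-config handling shown earlier in this patch.
  val conf: SparkConf = new SparkConf()
    .set("spark.driver.memory", "4g")
    .set("spark.driver.memoryOverhead", "512") // plain numbers are read as MiB
    .set("spark.executor.memory", "8g")
    .set("spark.executor.memoryOverhead", "1g")
}
```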
diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java index 6767cc5079649..c57af92029460 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitOptionParser.java @@ -82,7 +82,7 @@ class SparkSubmitOptionParser { * name of the option, passed to {@link #handle(String, String)}. *

* Options not listed here nor in the "switch" list below will result in a call to - * {@link $#handleUnknown(String)}. + * {@link #handleUnknown(String)}. *

* These two arrays are visible for tests. */ diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index f0742b91987b6..f35fb38798218 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -16,6 +16,8 @@ */ package org.apache.spark.deploy.k8s +import java.util.concurrent.TimeUnit + import org.apache.spark.internal.Logging import org.apache.spark.internal.config.ConfigBuilder import org.apache.spark.network.util.ByteUnit @@ -24,12 +26,16 @@ private[spark] object Config extends Logging { val KUBERNETES_NAMESPACE = ConfigBuilder("spark.kubernetes.namespace") - .doc("The namespace that will be used for running the driver and executor pods. When using " + - "spark-submit in cluster mode, this can also be passed to spark-submit via the " + - "--kubernetes-namespace command line argument.") + .doc("The namespace that will be used for running the driver and executor pods.") .stringConf .createWithDefault("default") + val DRIVER_DOCKER_IMAGE = + ConfigBuilder("spark.kubernetes.driver.docker.image") + .doc("Docker image to use for the driver. Specify this using the standard Docker tag format.") + .stringConf + .createOptional + val EXECUTOR_DOCKER_IMAGE = ConfigBuilder("spark.kubernetes.executor.docker.image") .doc("Docker image to use for the executors. Specify this using the standard Docker tag " + @@ -44,9 +50,9 @@ private[spark] object Config extends Logging { .checkValues(Set("Always", "Never", "IfNotPresent")) .createWithDefault("IfNotPresent") - val APISERVER_AUTH_DRIVER_CONF_PREFIX = + val KUBERNETES_AUTH_DRIVER_CONF_PREFIX = "spark.kubernetes.authenticate.driver" - val APISERVER_AUTH_DRIVER_MOUNTED_CONF_PREFIX = + val KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX = "spark.kubernetes.authenticate.driver.mounted" val OAUTH_TOKEN_CONF_SUFFIX = "oauthToken" val OAUTH_TOKEN_FILE_CONF_SUFFIX = "oauthTokenFile" @@ -55,7 +61,7 @@ private[spark] object Config extends Logging { val CA_CERT_FILE_CONF_SUFFIX = "caCertFile" val KUBERNETES_SERVICE_ACCOUNT_NAME = - ConfigBuilder(s"$APISERVER_AUTH_DRIVER_CONF_PREFIX.serviceAccountName") + ConfigBuilder(s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.serviceAccountName") .doc("Service account that is used when running the driver pod. The driver pod uses " + "this service account when requesting executor pods from the API server. If specific " + "credentials are given for the driver pod to use, the driver will favor " + @@ -63,19 +69,17 @@ private[spark] object Config extends Logging { .stringConf .createOptional - // Note that while we set a default for this when we start up the - // scheduler, the specific default value is dynamically determined - // based on the executor memory. - val KUBERNETES_EXECUTOR_MEMORY_OVERHEAD = - ConfigBuilder("spark.kubernetes.executor.memoryOverhead") - .doc("The amount of off-heap memory (in megabytes) to be allocated per executor. This " + - "is memory that accounts for things like VM overheads, interned strings, other native " + - "overheads, etc. This tends to grow with the executor size. 
(typically 6-10%).") - .bytesConf(ByteUnit.MiB) + val KUBERNETES_DRIVER_LIMIT_CORES = + ConfigBuilder("spark.kubernetes.driver.limit.cores") + .doc("Specify the hard cpu limit for the driver pod") + .stringConf .createOptional - val KUBERNETES_EXECUTOR_LABEL_PREFIX = "spark.kubernetes.executor.label." - val KUBERNETES_EXECUTOR_ANNOTATION_PREFIX = "spark.kubernetes.executor.annotation." + val KUBERNETES_EXECUTOR_LIMIT_CORES = + ConfigBuilder("spark.kubernetes.executor.limit.cores") + .doc("Specify the hard cpu limit for each executor pod") + .stringConf + .createOptional val KUBERNETES_DRIVER_POD_NAME = ConfigBuilder("spark.kubernetes.driver.pod.name") @@ -104,12 +108,6 @@ private[spark] object Config extends Logging { .checkValue(value => value > 0, "Allocation batch delay should be a positive integer") .createWithDefault(1) - val KUBERNETES_EXECUTOR_LIMIT_CORES = - ConfigBuilder("spark.kubernetes.executor.limit.cores") - .doc("Specify the hard cpu limit for a single executor pod") - .stringConf - .createOptional - val KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS = ConfigBuilder("spark.kubernetes.executor.lostCheck.maxAttempts") .doc("Maximum number of attempts allowed for checking the reason of an executor loss " + @@ -119,5 +117,46 @@ private[spark] object Config extends Logging { "must be a positive integer") .createWithDefault(10) + val WAIT_FOR_APP_COMPLETION = + ConfigBuilder("spark.kubernetes.submission.waitAppCompletion") + .doc("In cluster mode, whether to wait for the application to finish before exiting the " + + "launcher process.") + .booleanConf + .createWithDefault(true) + + val REPORT_INTERVAL = + ConfigBuilder("spark.kubernetes.report.interval") + .doc("Interval between reports of the current app status in cluster mode.") + .timeConf(TimeUnit.MILLISECONDS) + .checkValue(interval => interval > 0, s"Logging interval must be a positive time value.") + .createWithDefaultString("1s") + + val JARS_DOWNLOAD_LOCATION = + ConfigBuilder("spark.kubernetes.mountDependencies.jarsDownloadDir") + .doc("Location to download jars to in the driver and executors. When using" + + " spark-submit, this directory must be empty and will be mounted as an empty directory" + + " volume on the driver and executor pod.") + .stringConf + .createWithDefault("/var/spark-data/spark-jars") + + val FILES_DOWNLOAD_LOCATION = + ConfigBuilder("spark.kubernetes.mountDependencies.filesDownloadDir") + .doc("Location to download files to in the driver and executors. When using" + + " spark-submit, this directory must be empty and will be mounted as an empty directory" + + " volume on the driver and executor pods.") + .stringConf + .createWithDefault("/var/spark-data/spark-files") + + val KUBERNETES_AUTH_SUBMISSION_CONF_PREFIX = + "spark.kubernetes.authenticate.submission" + val KUBERNETES_NODE_SELECTOR_PREFIX = "spark.kubernetes.node.selector." + + val KUBERNETES_DRIVER_LABEL_PREFIX = "spark.kubernetes.driver.label." + val KUBERNETES_DRIVER_ANNOTATION_PREFIX = "spark.kubernetes.driver.annotation." + + val KUBERNETES_EXECUTOR_LABEL_PREFIX = "spark.kubernetes.executor.label." + val KUBERNETES_EXECUTOR_ANNOTATION_PREFIX = "spark.kubernetes.executor.annotation." + + val KUBERNETES_DRIVER_ENV_KEY = "spark.kubernetes.driverEnv." 
} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala index 4ddeefb15a89d..0b91145405d3a 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala @@ -25,9 +25,30 @@ private[spark] object Constants { val SPARK_POD_DRIVER_ROLE = "driver" val SPARK_POD_EXECUTOR_ROLE = "executor" + // Annotations + val SPARK_APP_NAME_ANNOTATION = "spark-app-name" + + // Credentials secrets + val DRIVER_CREDENTIALS_SECRETS_BASE_DIR = + "/mnt/secrets/spark-kubernetes-credentials" + val DRIVER_CREDENTIALS_CA_CERT_SECRET_NAME = "ca-cert" + val DRIVER_CREDENTIALS_CA_CERT_PATH = + s"$DRIVER_CREDENTIALS_SECRETS_BASE_DIR/$DRIVER_CREDENTIALS_CA_CERT_SECRET_NAME" + val DRIVER_CREDENTIALS_CLIENT_KEY_SECRET_NAME = "client-key" + val DRIVER_CREDENTIALS_CLIENT_KEY_PATH = + s"$DRIVER_CREDENTIALS_SECRETS_BASE_DIR/$DRIVER_CREDENTIALS_CLIENT_KEY_SECRET_NAME" + val DRIVER_CREDENTIALS_CLIENT_CERT_SECRET_NAME = "client-cert" + val DRIVER_CREDENTIALS_CLIENT_CERT_PATH = + s"$DRIVER_CREDENTIALS_SECRETS_BASE_DIR/$DRIVER_CREDENTIALS_CLIENT_CERT_SECRET_NAME" + val DRIVER_CREDENTIALS_OAUTH_TOKEN_SECRET_NAME = "oauth-token" + val DRIVER_CREDENTIALS_OAUTH_TOKEN_PATH = + s"$DRIVER_CREDENTIALS_SECRETS_BASE_DIR/$DRIVER_CREDENTIALS_OAUTH_TOKEN_SECRET_NAME" + val DRIVER_CREDENTIALS_SECRET_VOLUME_NAME = "kubernetes-credentials" + // Default and fixed ports val DEFAULT_DRIVER_PORT = 7078 val DEFAULT_BLOCKMANAGER_PORT = 7079 + val DRIVER_PORT_NAME = "driver-rpc-port" val BLOCK_MANAGER_PORT_NAME = "blockmanager" val EXECUTOR_PORT_NAME = "executor" @@ -42,9 +63,16 @@ private[spark] object Constants { val ENV_EXECUTOR_EXTRA_CLASSPATH = "SPARK_EXECUTOR_EXTRA_CLASSPATH" val ENV_MOUNTED_CLASSPATH = "SPARK_MOUNTED_CLASSPATH" val ENV_JAVA_OPT_PREFIX = "SPARK_JAVA_OPT_" + val ENV_SUBMIT_EXTRA_CLASSPATH = "SPARK_SUBMIT_EXTRA_CLASSPATH" + val ENV_DRIVER_MAIN_CLASS = "SPARK_DRIVER_CLASS" + val ENV_DRIVER_ARGS = "SPARK_DRIVER_ARGS" + val ENV_DRIVER_JAVA_OPTS = "SPARK_DRIVER_JAVA_OPTS" + val ENV_DRIVER_BIND_ADDRESS = "SPARK_DRIVER_BIND_ADDRESS" + val ENV_DRIVER_MEMORY = "SPARK_DRIVER_MEMORY" // Miscellaneous val KUBERNETES_MASTER_INTERNAL_URL = "https://kubernetes.default.svc" + val DRIVER_CONTAINER_NAME = "spark-kubernetes-driver" val MEMORY_OVERHEAD_FACTOR = 0.10 val MEMORY_OVERHEAD_MIN_MIB = 384L } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala new file mode 100644 index 0000000000000..c563fc5bfbadf --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit + +import java.util.UUID + +import com.google.common.primitives.Longs + +import org.apache.spark.SparkConf +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.ConfigurationUtils +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.submit.steps._ +import org.apache.spark.launcher.SparkLauncher +import org.apache.spark.util.SystemClock + +/** + * Constructs the complete list of driver configuration steps to run to deploy the Spark driver. + */ +private[spark] class DriverConfigurationStepsOrchestrator( + namespace: String, + kubernetesAppId: String, + launchTime: Long, + mainAppResource: Option[MainAppResource], + appName: String, + mainClass: String, + appArgs: Array[String], + submissionSparkConf: SparkConf) { + + // The resource name prefix is derived from the Spark application name, making it easy to connect + // the names of the Kubernetes resources from e.g. kubectl or the Kubernetes dashboard to the + // application the user submitted. + private val kubernetesResourceNamePrefix = { + val uuid = UUID.nameUUIDFromBytes(Longs.toByteArray(launchTime)).toString.replaceAll("-", "") + s"$appName-$uuid".toLowerCase.replaceAll("\\.", "-") + } + + private val dockerImagePullPolicy = submissionSparkConf.get(DOCKER_IMAGE_PULL_POLICY) + private val jarsDownloadPath = submissionSparkConf.get(JARS_DOWNLOAD_LOCATION) + private val filesDownloadPath = submissionSparkConf.get(FILES_DOWNLOAD_LOCATION) + + def getAllConfigurationSteps(): Seq[DriverConfigurationStep] = { + val driverCustomLabels = ConfigurationUtils.parsePrefixedKeyValuePairs( + submissionSparkConf, + KUBERNETES_DRIVER_LABEL_PREFIX) + require(!driverCustomLabels.contains(SPARK_APP_ID_LABEL), "Label with key " + + s"$SPARK_APP_ID_LABEL is not allowed as it is reserved for Spark bookkeeping " + + "operations.") + require(!driverCustomLabels.contains(SPARK_ROLE_LABEL), "Label with key " + + s"$SPARK_ROLE_LABEL is not allowed as it is reserved for Spark bookkeeping " + + "operations.") + + val allDriverLabels = driverCustomLabels ++ Map( + SPARK_APP_ID_LABEL -> kubernetesAppId, + SPARK_ROLE_LABEL -> SPARK_POD_DRIVER_ROLE) + + val initialSubmissionStep = new BaseDriverConfigurationStep( + kubernetesAppId, + kubernetesResourceNamePrefix, + allDriverLabels, + dockerImagePullPolicy, + appName, + mainClass, + appArgs, + submissionSparkConf) + + val driverAddressStep = new DriverServiceBootstrapStep( + kubernetesResourceNamePrefix, + allDriverLabels, + submissionSparkConf, + new SystemClock) + + val kubernetesCredentialsStep = new DriverKubernetesCredentialsStep( + submissionSparkConf, kubernetesResourceNamePrefix) + + val additionalMainAppJar = if (mainAppResource.nonEmpty) { + val mayBeResource = mainAppResource.get match { + case JavaMainAppResource(resource) if resource != SparkLauncher.NO_RESOURCE => + Some(resource) + case _ => None + } + mayBeResource + } else { + None + } + + val sparkJars = submissionSparkConf.getOption("spark.jars") + .map(_.split(",")) + .getOrElse(Array.empty[String]) ++ + additionalMainAppJar.toSeq 
+ val sparkFiles = submissionSparkConf.getOption("spark.files") + .map(_.split(",")) + .getOrElse(Array.empty[String]) + + val maybeDependencyResolutionStep = if (sparkJars.nonEmpty || sparkFiles.nonEmpty) { + Some(new DependencyResolutionStep( + sparkJars, + sparkFiles, + jarsDownloadPath, + filesDownloadPath)) + } else { + None + } + + Seq( + initialSubmissionStep, + driverAddressStep, + kubernetesCredentialsStep) ++ + maybeDependencyResolutionStep.toSeq + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala new file mode 100644 index 0000000000000..4d17608c602d8 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit + +import java.util.{Collections, UUID} + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.util.control.NonFatal + +import io.fabric8.kubernetes.api.model._ +import io.fabric8.kubernetes.client.KubernetesClient + +import org.apache.spark.SparkConf +import org.apache.spark.deploy.SparkApplication +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.SparkKubernetesClientFactory +import org.apache.spark.deploy.k8s.submit.steps.DriverConfigurationStep +import org.apache.spark.internal.Logging +import org.apache.spark.util.Utils + +/** + * Encapsulates arguments to the submission client. 
+ * + * @param mainAppResource the main application resource if any + * @param mainClass the main class of the application to run + * @param driverArgs arguments to the driver + */ +private[spark] case class ClientArguments( + mainAppResource: Option[MainAppResource], + mainClass: String, + driverArgs: Array[String]) + +private[spark] object ClientArguments { + + def fromCommandLineArgs(args: Array[String]): ClientArguments = { + var mainAppResource: Option[MainAppResource] = None + var mainClass: Option[String] = None + val driverArgs = mutable.ArrayBuffer.empty[String] + + args.sliding(2, 2).toList.foreach { + case Array("--primary-java-resource", primaryJavaResource: String) => + mainAppResource = Some(JavaMainAppResource(primaryJavaResource)) + case Array("--main-class", clazz: String) => + mainClass = Some(clazz) + case Array("--arg", arg: String) => + driverArgs += arg + case other => + val invalid = other.mkString(" ") + throw new RuntimeException(s"Unknown arguments: $invalid") + } + + require(mainClass.isDefined, "Main class must be specified via --main-class") + + ClientArguments( + mainAppResource, + mainClass.get, + driverArgs.toArray) + } +} + +/** + * Submits a Spark application to run on Kubernetes by creating the driver pod and starting a + * watcher that monitors and logs the application status. Waits for the application to terminate if + * spark.kubernetes.submission.waitAppCompletion is true. + * + * @param submissionSteps steps that collectively configure the driver + * @param submissionSparkConf the submission client Spark configuration + * @param kubernetesClient the client to talk to the Kubernetes API server + * @param waitForAppCompletion a flag indicating whether the client should wait for the application + * to complete + * @param appName the application name + * @param loggingPodStatusWatcher a watcher that monitors and logs the application status + */ +private[spark] class Client( + submissionSteps: Seq[DriverConfigurationStep], + submissionSparkConf: SparkConf, + kubernetesClient: KubernetesClient, + waitForAppCompletion: Boolean, + appName: String, + loggingPodStatusWatcher: LoggingPodStatusWatcher) extends Logging { + + private val driverJavaOptions = submissionSparkConf.get( + org.apache.spark.internal.config.DRIVER_JAVA_OPTIONS) + + /** + * Run command that initializes a DriverSpec that will be updated after each + * DriverConfigurationStep in the sequence that is passed in. The final KubernetesDriverSpec + * will be used to build the Driver Container, Driver Pod, and Kubernetes Resources + */ + def run(): Unit = { + var currentDriverSpec = KubernetesDriverSpec.initialSpec(submissionSparkConf) + // submissionSteps contain steps necessary to take, to resolve varying + // client arguments that are passed in, created by orchestrator + for (nextStep <- submissionSteps) { + currentDriverSpec = nextStep.configureDriver(currentDriverSpec) + } + + val resolvedDriverJavaOpts = currentDriverSpec + .driverSparkConf + // Remove this as the options are instead extracted and set individually below using + // environment variables with prefix SPARK_JAVA_OPT_. 
+ .remove(org.apache.spark.internal.config.DRIVER_JAVA_OPTIONS) + .getAll + .map { + case (confKey, confValue) => s"-D$confKey=$confValue" + } ++ driverJavaOptions.map(Utils.splitCommandString).getOrElse(Seq.empty) + val driverJavaOptsEnvs: Seq[EnvVar] = resolvedDriverJavaOpts.zipWithIndex.map { + case (option, index) => + new EnvVarBuilder() + .withName(s"$ENV_JAVA_OPT_PREFIX$index") + .withValue(option) + .build() + } + + val resolvedDriverContainer = new ContainerBuilder(currentDriverSpec.driverContainer) + .addAllToEnv(driverJavaOptsEnvs.asJava) + .build() + val resolvedDriverPod = new PodBuilder(currentDriverSpec.driverPod) + .editSpec() + .addToContainers(resolvedDriverContainer) + .endSpec() + .build() + + Utils.tryWithResource( + kubernetesClient + .pods() + .withName(resolvedDriverPod.getMetadata.getName) + .watch(loggingPodStatusWatcher)) { _ => + val createdDriverPod = kubernetesClient.pods().create(resolvedDriverPod) + try { + if (currentDriverSpec.otherKubernetesResources.nonEmpty) { + val otherKubernetesResources = currentDriverSpec.otherKubernetesResources + addDriverOwnerReference(createdDriverPod, otherKubernetesResources) + kubernetesClient.resourceList(otherKubernetesResources: _*).createOrReplace() + } + } catch { + case NonFatal(e) => + kubernetesClient.pods().delete(createdDriverPod) + throw e + } + + if (waitForAppCompletion) { + logInfo(s"Waiting for application $appName to finish...") + loggingPodStatusWatcher.awaitCompletion() + logInfo(s"Application $appName finished.") + } else { + logInfo(s"Deployed Spark application $appName into Kubernetes.") + } + } + } + + // Add a OwnerReference to the given resources making the driver pod an owner of them so when + // the driver pod is deleted, the resources are garbage collected. + private def addDriverOwnerReference(driverPod: Pod, resources: Seq[HasMetadata]): Unit = { + val driverPodOwnerReference = new OwnerReferenceBuilder() + .withName(driverPod.getMetadata.getName) + .withApiVersion(driverPod.getApiVersion) + .withUid(driverPod.getMetadata.getUid) + .withKind(driverPod.getKind) + .withController(true) + .build() + resources.foreach { resource => + val originalMetadata = resource.getMetadata + originalMetadata.setOwnerReferences(Collections.singletonList(driverPodOwnerReference)) + } + } +} + +/** + * Main class and entry point of application submission in KUBERNETES mode. + */ +private[spark] class KubernetesClientApplication extends SparkApplication { + + override def start(args: Array[String], conf: SparkConf): Unit = { + val parsedArguments = ClientArguments.fromCommandLineArgs(args) + run(parsedArguments, conf) + } + + private def run(clientArguments: ClientArguments, sparkConf: SparkConf): Unit = { + val namespace = sparkConf.get(KUBERNETES_NAMESPACE) + // For constructing the app ID, we can't use the Spark application name, as the app ID is going + // to be added as a label to group resources belonging to the same application. Label values are + // considerably restrictive, e.g. must be no longer than 63 characters in length. So we generate + // a unique app ID (captured by spark.app.id) in the format below. + val kubernetesAppId = s"spark-${UUID.randomUUID().toString.replaceAll("-", "")}" + val launchTime = System.currentTimeMillis() + val waitForAppCompletion = sparkConf.get(WAIT_FOR_APP_COMPLETION) + val appName = sparkConf.getOption("spark.app.name").getOrElse("spark") + // The master URL has been checked for validity already in SparkSubmit. + // We just need to get rid of the "k8s:" prefix here. 
+ val master = sparkConf.get("spark.master").substring("k8s:".length) + val loggingInterval = if (waitForAppCompletion) Some(sparkConf.get(REPORT_INTERVAL)) else None + + val loggingPodStatusWatcher = new LoggingPodStatusWatcherImpl( + kubernetesAppId, loggingInterval) + + val configurationStepsOrchestrator = new DriverConfigurationStepsOrchestrator( + namespace, + kubernetesAppId, + launchTime, + clientArguments.mainAppResource, + appName, + clientArguments.mainClass, + clientArguments.driverArgs, + sparkConf) + + Utils.tryWithResource(SparkKubernetesClientFactory.createKubernetesClient( + master, + Some(namespace), + KUBERNETES_AUTH_SUBMISSION_CONF_PREFIX, + sparkConf, + None, + None)) { kubernetesClient => + val client = new Client( + configurationStepsOrchestrator.getAllConfigurationSteps(), + sparkConf, + kubernetesClient, + waitForAppCompletion, + appName, + loggingPodStatusWatcher) + client.run() + } + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverSpec.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverSpec.scala new file mode 100644 index 0000000000000..db13f09387ef9 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverSpec.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit + +import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, HasMetadata, Pod, PodBuilder} + +import org.apache.spark.SparkConf + +/** + * Represents the components and characteristics of a Spark driver. The driver can be considered + * as being comprised of the driver pod itself, any other Kubernetes resources that the driver + * pod depends on, and the SparkConf that should be supplied to the Spark application. The driver + * container should be operated on via the specific field of this case class as opposed to trying + * to edit the container directly on the pod. The driver container should be attached at the + * end of executing all submission steps. + */ +private[spark] case class KubernetesDriverSpec( + driverPod: Pod, + driverContainer: Container, + otherKubernetesResources: Seq[HasMetadata], + driverSparkConf: SparkConf) + +private[spark] object KubernetesDriverSpec { + def initialSpec(initialSparkConf: SparkConf): KubernetesDriverSpec = { + KubernetesDriverSpec( + // Set new metadata and a new spec so that submission steps can use + // PodBuilder#editMetadata() and/or PodBuilder#editSpec() safely. 
+ new PodBuilder().withNewMetadata().endMetadata().withNewSpec().endSpec().build(), + new ContainerBuilder().build(), + Seq.empty[HasMetadata], + initialSparkConf.clone()) + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesFileUtils.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesFileUtils.scala new file mode 100644 index 0000000000000..a38cf55fc3d58 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesFileUtils.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit + +import java.io.File + +import org.apache.spark.util.Utils + +private[spark] object KubernetesFileUtils { + + /** + * For the given collection of file URIs, resolves them as follows: + * - File URIs with scheme file:// are resolved to the given download path. + * - File URIs with scheme local:// resolve to just the path of the URI. + * - Otherwise, the URIs are returned as-is. + */ + def resolveFileUris( + fileUris: Iterable[String], + fileDownloadPath: String): Iterable[String] = { + fileUris.map { uri => + resolveFileUri(uri, fileDownloadPath, false) + } + } + + /** + * If any file uri has any scheme other than local:// it is mapped as if the file + * was downloaded to the file download path. Otherwise, it is mapped to the path + * part of the URI. + */ + def resolveFilePaths(fileUris: Iterable[String], fileDownloadPath: String): Iterable[String] = { + fileUris.map { uri => + resolveFileUri(uri, fileDownloadPath, true) + } + } + + private def resolveFileUri( + uri: String, + fileDownloadPath: String, + assumesDownloaded: Boolean): String = { + val fileUri = Utils.resolveURI(uri) + val fileScheme = Option(fileUri.getScheme).getOrElse("file") + fileScheme match { + case "local" => + fileUri.getPath + case _ => + if (assumesDownloaded || fileScheme == "file") { + val fileName = new File(fileUri.getPath).getName + s"$fileDownloadPath/$fileName" + } else { + uri + } + } + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala new file mode 100644 index 0000000000000..173ac541626a7 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit + +import java.util.concurrent.{CountDownLatch, TimeUnit} + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model.{ContainerStateRunning, ContainerStateTerminated, ContainerStateWaiting, ContainerStatus, Pod, Time} +import io.fabric8.kubernetes.client.{KubernetesClientException, Watcher} +import io.fabric8.kubernetes.client.Watcher.Action + +import org.apache.spark.SparkException +import org.apache.spark.internal.Logging +import org.apache.spark.util.ThreadUtils + +private[k8s] trait LoggingPodStatusWatcher extends Watcher[Pod] { + def awaitCompletion(): Unit +} + +/** + * A monitor for the running Kubernetes pod of a Spark application. Status logging occurs on + * every state change and also at an interval for liveness. + * + * @param appId application ID. + * @param maybeLoggingInterval ms between each state request. If provided, must be a positive + * number. + */ +private[k8s] class LoggingPodStatusWatcherImpl( + appId: String, + maybeLoggingInterval: Option[Long]) + extends LoggingPodStatusWatcher with Logging { + + private val podCompletedFuture = new CountDownLatch(1) + // start timer for periodic logging + private val scheduler = + ThreadUtils.newDaemonSingleThreadScheduledExecutor("logging-pod-status-watcher") + private val logRunnable: Runnable = new Runnable { + override def run() = logShortStatus() + } + + private var pod = Option.empty[Pod] + + private def phase: String = pod.map(_.getStatus.getPhase).getOrElse("unknown") + + def start(): Unit = { + maybeLoggingInterval.foreach { interval => + scheduler.scheduleAtFixedRate(logRunnable, 0, interval, TimeUnit.MILLISECONDS) + } + } + + override def eventReceived(action: Action, pod: Pod): Unit = { + this.pod = Option(pod) + action match { + case Action.DELETED | Action.ERROR => + closeWatch() + + case _ => + logLongStatus() + if (hasCompleted()) { + closeWatch() + } + } + } + + override def onClose(e: KubernetesClientException): Unit = { + logDebug(s"Stopping watching application $appId with last-observed phase $phase") + closeWatch() + } + + private def logShortStatus() = { + logInfo(s"Application status for $appId (phase: $phase)") + } + + private def logLongStatus() = { + logInfo("State changed, new state: " + pod.map(formatPodState).getOrElse("unknown")) + } + + private def hasCompleted(): Boolean = { + phase == "Succeeded" || phase == "Failed" + } + + private def closeWatch(): Unit = { + podCompletedFuture.countDown() + scheduler.shutdown() + } + + private def formatPodState(pod: Pod): String = { + val details = Seq[(String, String)]( + // pod metadata + ("pod name", pod.getMetadata.getName), + ("namespace", pod.getMetadata.getNamespace), + ("labels", pod.getMetadata.getLabels.asScala.mkString(", ")), + ("pod uid", pod.getMetadata.getUid), + ("creation time", formatTime(pod.getMetadata.getCreationTimestamp)), + + // spec details + ("service account name", 
pod.getSpec.getServiceAccountName), + ("volumes", pod.getSpec.getVolumes.asScala.map(_.getName).mkString(", ")), + ("node name", pod.getSpec.getNodeName), + + // status + ("start time", formatTime(pod.getStatus.getStartTime)), + ("container images", + pod.getStatus.getContainerStatuses + .asScala + .map(_.getImage) + .mkString(", ")), + ("phase", pod.getStatus.getPhase), + ("status", pod.getStatus.getContainerStatuses.toString) + ) + + formatPairsBundle(details) + } + + private def formatPairsBundle(pairs: Seq[(String, String)]) = { + // Use more loggable format if value is null or empty + pairs.map { + case (k, v) => s"\n\t $k: ${Option(v).filter(_.nonEmpty).getOrElse("N/A")}" + }.mkString("") + } + + override def awaitCompletion(): Unit = { + podCompletedFuture.await() + logInfo(pod.map { p => + s"Container final statuses:\n\n${containersDescription(p)}" + }.getOrElse("No containers were found in the driver pod.")) + } + + private def containersDescription(p: Pod): String = { + p.getStatus.getContainerStatuses.asScala.map { status => + Seq( + ("Container name", status.getName), + ("Container image", status.getImage)) ++ + containerStatusDescription(status) + }.map(formatPairsBundle).mkString("\n\n") + } + + private def containerStatusDescription( + containerStatus: ContainerStatus): Seq[(String, String)] = { + val state = containerStatus.getState + Option(state.getRunning) + .orElse(Option(state.getTerminated)) + .orElse(Option(state.getWaiting)) + .map { + case running: ContainerStateRunning => + Seq( + ("Container state", "Running"), + ("Container started at", formatTime(running.getStartedAt))) + case waiting: ContainerStateWaiting => + Seq( + ("Container state", "Waiting"), + ("Pending reason", waiting.getReason)) + case terminated: ContainerStateTerminated => + Seq( + ("Container state", "Terminated"), + ("Exit code", terminated.getExitCode.toString)) + case unknown => + throw new SparkException(s"Unexpected container status type ${unknown.getClass}.") + }.getOrElse(Seq(("Container state", "N/A"))) + } + + private def formatTime(time: Time): String = { + if (time != null) time.getTime else "N/A" + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala new file mode 100644 index 0000000000000..cca9f4627a1f6 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.deploy.k8s.submit + +private[spark] sealed trait MainAppResource + +private[spark] case class JavaMainAppResource(primaryResource: String) extends MainAppResource diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala new file mode 100644 index 0000000000000..ba2a11b9e6689 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model.{ContainerBuilder, EnvVarBuilder, EnvVarSourceBuilder, PodBuilder, QuantityBuilder} + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.ConfigurationUtils +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec +import org.apache.spark.internal.config.{DRIVER_CLASS_PATH, DRIVER_MEMORY, DRIVER_MEMORY_OVERHEAD} + +/** + * Represents the initial setup required for the driver. 
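+ *
+ * As a rough worked example (the 0.10 overhead factor and 384 MiB floor are assumed defaults
+ * behind MEMORY_OVERHEAD_FACTOR and MEMORY_OVERHEAD_MIN_MIB, not shown here): with
+ * spark.driver.memory=2g the container requests 2048Mi of memory and is limited to
+ * 2048 + max(204, 384) = 2432Mi.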
+ */ +private[spark] class BaseDriverConfigurationStep( + kubernetesAppId: String, + kubernetesResourceNamePrefix: String, + driverLabels: Map[String, String], + dockerImagePullPolicy: String, + appName: String, + mainClass: String, + appArgs: Array[String], + submissionSparkConf: SparkConf) extends DriverConfigurationStep { + + private val kubernetesDriverPodName = submissionSparkConf.get(KUBERNETES_DRIVER_POD_NAME) + .getOrElse(s"$kubernetesResourceNamePrefix-driver") + + private val driverExtraClasspath = submissionSparkConf.get( + DRIVER_CLASS_PATH) + + private val driverDockerImage = submissionSparkConf + .get(DRIVER_DOCKER_IMAGE) + .getOrElse(throw new SparkException("Must specify the driver Docker image")) + + // CPU settings + private val driverCpuCores = submissionSparkConf.getOption("spark.driver.cores").getOrElse("1") + private val driverLimitCores = submissionSparkConf.get(KUBERNETES_DRIVER_LIMIT_CORES) + + // Memory settings + private val driverMemoryMiB = submissionSparkConf.get( + DRIVER_MEMORY) + private val driverMemoryString = submissionSparkConf.get( + DRIVER_MEMORY.key, + DRIVER_MEMORY.defaultValueString) + private val memoryOverheadMiB = submissionSparkConf + .get(DRIVER_MEMORY_OVERHEAD) + .getOrElse(math.max((MEMORY_OVERHEAD_FACTOR * driverMemoryMiB).toInt, + MEMORY_OVERHEAD_MIN_MIB)) + private val driverContainerMemoryWithOverheadMiB = driverMemoryMiB + memoryOverheadMiB + + override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { + val driverExtraClasspathEnv = driverExtraClasspath.map { classPath => + new EnvVarBuilder() + .withName(ENV_SUBMIT_EXTRA_CLASSPATH) + .withValue(classPath) + .build() + } + + val driverCustomAnnotations = ConfigurationUtils + .parsePrefixedKeyValuePairs( + submissionSparkConf, + KUBERNETES_DRIVER_ANNOTATION_PREFIX) + require(!driverCustomAnnotations.contains(SPARK_APP_NAME_ANNOTATION), + s"Annotation with key $SPARK_APP_NAME_ANNOTATION is not allowed as it is reserved for" + + " Spark bookkeeping operations.") + + val driverCustomEnvs = submissionSparkConf.getAllWithPrefix(KUBERNETES_DRIVER_ENV_KEY).toSeq + .map { env => + new EnvVarBuilder() + .withName(env._1) + .withValue(env._2) + .build() + } + + val allDriverAnnotations = driverCustomAnnotations ++ Map(SPARK_APP_NAME_ANNOTATION -> appName) + + val nodeSelector = ConfigurationUtils.parsePrefixedKeyValuePairs( + submissionSparkConf, KUBERNETES_NODE_SELECTOR_PREFIX) + + val driverCpuQuantity = new QuantityBuilder(false) + .withAmount(driverCpuCores) + .build() + val driverMemoryQuantity = new QuantityBuilder(false) + .withAmount(s"${driverMemoryMiB}Mi") + .build() + val driverMemoryLimitQuantity = new QuantityBuilder(false) + .withAmount(s"${driverContainerMemoryWithOverheadMiB}Mi") + .build() + val maybeCpuLimitQuantity = driverLimitCores.map { limitCores => + ("cpu", new QuantityBuilder(false).withAmount(limitCores).build()) + } + + val driverContainer = new ContainerBuilder(driverSpec.driverContainer) + .withName(DRIVER_CONTAINER_NAME) + .withImage(driverDockerImage) + .withImagePullPolicy(dockerImagePullPolicy) + .addAllToEnv(driverCustomEnvs.asJava) + .addToEnv(driverExtraClasspathEnv.toSeq: _*) + .addNewEnv() + .withName(ENV_DRIVER_MEMORY) + .withValue(driverMemoryString) + .endEnv() + .addNewEnv() + .withName(ENV_DRIVER_MAIN_CLASS) + .withValue(mainClass) + .endEnv() + .addNewEnv() + .withName(ENV_DRIVER_ARGS) + .withValue(appArgs.map(arg => "\"" + arg + "\"").mkString(" ")) + .endEnv() + .addNewEnv() + .withName(ENV_DRIVER_BIND_ADDRESS) + 
.withValueFrom(new EnvVarSourceBuilder() + .withNewFieldRef("v1", "status.podIP") + .build()) + .endEnv() + .withNewResources() + .addToRequests("cpu", driverCpuQuantity) + .addToRequests("memory", driverMemoryQuantity) + .addToLimits("memory", driverMemoryLimitQuantity) + .addToLimits(maybeCpuLimitQuantity.toMap.asJava) + .endResources() + .build() + + val baseDriverPod = new PodBuilder(driverSpec.driverPod) + .editOrNewMetadata() + .withName(kubernetesDriverPodName) + .addToLabels(driverLabels.asJava) + .addToAnnotations(allDriverAnnotations.asJava) + .endMetadata() + .withNewSpec() + .withRestartPolicy("Never") + .withNodeSelector(nodeSelector.asJava) + .endSpec() + .build() + + val resolvedSparkConf = driverSpec.driverSparkConf.clone() + .setIfMissing(KUBERNETES_DRIVER_POD_NAME, kubernetesDriverPodName) + .set("spark.app.id", kubernetesAppId) + .set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, kubernetesResourceNamePrefix) + + driverSpec.copy( + driverPod = baseDriverPod, + driverSparkConf = resolvedSparkConf, + driverContainer = driverContainer) + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala new file mode 100644 index 0000000000000..44e0ecffc0e93 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import java.io.File + +import io.fabric8.kubernetes.api.model.ContainerBuilder + +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.submit.{KubernetesDriverSpec, KubernetesFileUtils} + +/** + * Step that configures the classpath, spark.jars, and spark.files for the driver given that the + * user may provide remote files or files with local:// schemes. 
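+ *
+ * Illustrative sketch of the resolution, assuming a hypothetical jarsDownloadPath of
+ * /var/spark-data/jars (paths and URIs below are examples only):
+ * {{{
+ *   KubernetesFileUtils.resolveFileUris(
+ *     Seq("local:///opt/app.jar", "https://host/dep.jar"), "/var/spark-data/jars")
+ *   // -> Seq("/opt/app.jar", "https://host/dep.jar")         (written to spark.jars)
+ *   KubernetesFileUtils.resolveFilePaths(
+ *     Seq("local:///opt/app.jar", "https://host/dep.jar"), "/var/spark-data/jars")
+ *   // -> Seq("/opt/app.jar", "/var/spark-data/jars/dep.jar") (placed on the driver classpath)
+ * }}}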
+ */ +private[spark] class DependencyResolutionStep( + sparkJars: Seq[String], + sparkFiles: Seq[String], + jarsDownloadPath: String, + localFilesDownloadPath: String) extends DriverConfigurationStep { + + override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { + val resolvedSparkJars = KubernetesFileUtils.resolveFileUris(sparkJars, jarsDownloadPath) + val resolvedSparkFiles = KubernetesFileUtils.resolveFileUris( + sparkFiles, localFilesDownloadPath) + val sparkConfResolvedSparkDependencies = driverSpec.driverSparkConf.clone() + if (resolvedSparkJars.nonEmpty) { + sparkConfResolvedSparkDependencies.set("spark.jars", resolvedSparkJars.mkString(",")) + } + if (resolvedSparkFiles.nonEmpty) { + sparkConfResolvedSparkDependencies.set("spark.files", resolvedSparkFiles.mkString(",")) + } + val resolvedClasspath = KubernetesFileUtils.resolveFilePaths(sparkJars, jarsDownloadPath) + val driverContainerWithResolvedClasspath = if (resolvedClasspath.nonEmpty) { + new ContainerBuilder(driverSpec.driverContainer) + .addNewEnv() + .withName(ENV_MOUNTED_CLASSPATH) + .withValue(resolvedClasspath.mkString(File.pathSeparator)) + .endEnv() + .build() + } else { + driverSpec.driverContainer + } + driverSpec.copy( + driverContainer = driverContainerWithResolvedClasspath, + driverSparkConf = sparkConfResolvedSparkDependencies) + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala new file mode 100644 index 0000000000000..c99c0436cf25f --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec + +/** + * Represents a step in preparing the Kubernetes driver. + */ +private[spark] trait DriverConfigurationStep { + + /** + * Apply some transformation to the previous state of the driver to add a new feature to it. 
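+   *
+   * A minimal hypothetical implementation (the label key and value are illustrative only) could
+   * attach an extra label to the driver pod and return the updated spec:
+   * {{{
+   *   override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = {
+   *     val labeledPod = new PodBuilder(driverSpec.driverPod)
+   *       .editOrNewMetadata().addToLabels("team", "data-eng").endMetadata()
+   *       .build()
+   *     driverSpec.copy(driverPod = labeledPod)
+   *   }
+   * }}}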
+ */ + def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStep.scala new file mode 100644 index 0000000000000..ccc18908658f1 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStep.scala @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import java.io.File +import java.nio.charset.StandardCharsets + +import scala.collection.JavaConverters._ +import scala.language.implicitConversions + +import com.google.common.io.{BaseEncoding, Files} +import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder, Secret, SecretBuilder} + +import org.apache.spark.SparkConf +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec + +/** + * Mounts Kubernetes credentials into the driver pod. The driver will use such mounted credentials + * to request executors. 
+ */ +private[spark] class DriverKubernetesCredentialsStep( + submissionSparkConf: SparkConf, + kubernetesResourceNamePrefix: String) extends DriverConfigurationStep { + + private val maybeMountedOAuthTokenFile = submissionSparkConf.getOption( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$OAUTH_TOKEN_FILE_CONF_SUFFIX") + private val maybeMountedClientKeyFile = submissionSparkConf.getOption( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX") + private val maybeMountedClientCertFile = submissionSparkConf.getOption( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX") + private val maybeMountedCaCertFile = submissionSparkConf.getOption( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX") + private val driverServiceAccount = submissionSparkConf.get(KUBERNETES_SERVICE_ACCOUNT_NAME) + + override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { + val driverSparkConf = driverSpec.driverSparkConf.clone() + + val oauthTokenBase64 = submissionSparkConf + .getOption(s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$OAUTH_TOKEN_CONF_SUFFIX") + .map { token => + BaseEncoding.base64().encode(token.getBytes(StandardCharsets.UTF_8)) + } + val caCertDataBase64 = safeFileConfToBase64( + s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX", + "Driver CA cert file") + val clientKeyDataBase64 = safeFileConfToBase64( + s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX", + "Driver client key file") + val clientCertDataBase64 = safeFileConfToBase64( + s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX", + "Driver client cert file") + + val driverSparkConfWithCredentialsLocations = setDriverPodKubernetesCredentialLocations( + driverSparkConf, + oauthTokenBase64, + caCertDataBase64, + clientKeyDataBase64, + clientCertDataBase64) + + val kubernetesCredentialsSecret = createCredentialsSecret( + oauthTokenBase64, + caCertDataBase64, + clientKeyDataBase64, + clientCertDataBase64) + + val driverPodWithMountedKubernetesCredentials = kubernetesCredentialsSecret.map { secret => + new PodBuilder(driverSpec.driverPod) + .editOrNewSpec() + .addNewVolume() + .withName(DRIVER_CREDENTIALS_SECRET_VOLUME_NAME) + .withNewSecret().withSecretName(secret.getMetadata.getName).endSecret() + .endVolume() + .endSpec() + .build() + }.getOrElse( + driverServiceAccount.map { account => + new PodBuilder(driverSpec.driverPod) + .editOrNewSpec() + .withServiceAccount(account) + .withServiceAccountName(account) + .endSpec() + .build() + }.getOrElse(driverSpec.driverPod) + ) + + val driverContainerWithMountedSecretVolume = kubernetesCredentialsSecret.map { secret => + new ContainerBuilder(driverSpec.driverContainer) + .addNewVolumeMount() + .withName(DRIVER_CREDENTIALS_SECRET_VOLUME_NAME) + .withMountPath(DRIVER_CREDENTIALS_SECRETS_BASE_DIR) + .endVolumeMount() + .build() + }.getOrElse(driverSpec.driverContainer) + + driverSpec.copy( + driverPod = driverPodWithMountedKubernetesCredentials, + otherKubernetesResources = + driverSpec.otherKubernetesResources ++ kubernetesCredentialsSecret.toSeq, + driverSparkConf = driverSparkConfWithCredentialsLocations, + driverContainer = driverContainerWithMountedSecretVolume) + } + + private def createCredentialsSecret( + driverOAuthTokenBase64: Option[String], + driverCaCertDataBase64: Option[String], + driverClientKeyDataBase64: Option[String], + driverClientCertDataBase64: Option[String]): Option[Secret] = { + val allSecretData = + resolveSecretData( + 
driverClientKeyDataBase64, + DRIVER_CREDENTIALS_CLIENT_KEY_SECRET_NAME) ++ + resolveSecretData( + driverClientCertDataBase64, + DRIVER_CREDENTIALS_CLIENT_CERT_SECRET_NAME) ++ + resolveSecretData( + driverCaCertDataBase64, + DRIVER_CREDENTIALS_CA_CERT_SECRET_NAME) ++ + resolveSecretData( + driverOAuthTokenBase64, + DRIVER_CREDENTIALS_OAUTH_TOKEN_SECRET_NAME) + + if (allSecretData.isEmpty) { + None + } else { + Some(new SecretBuilder() + .withNewMetadata() + .withName(s"$kubernetesResourceNamePrefix-kubernetes-credentials") + .endMetadata() + .withData(allSecretData.asJava) + .build()) + } + } + + private def setDriverPodKubernetesCredentialLocations( + driverSparkConf: SparkConf, + driverOauthTokenBase64: Option[String], + driverCaCertDataBase64: Option[String], + driverClientKeyDataBase64: Option[String], + driverClientCertDataBase64: Option[String]): SparkConf = { + val resolvedMountedOAuthTokenFile = resolveSecretLocation( + maybeMountedOAuthTokenFile, + driverOauthTokenBase64, + DRIVER_CREDENTIALS_OAUTH_TOKEN_PATH) + val resolvedMountedClientKeyFile = resolveSecretLocation( + maybeMountedClientKeyFile, + driverClientKeyDataBase64, + DRIVER_CREDENTIALS_CLIENT_KEY_PATH) + val resolvedMountedClientCertFile = resolveSecretLocation( + maybeMountedClientCertFile, + driverClientCertDataBase64, + DRIVER_CREDENTIALS_CLIENT_CERT_PATH) + val resolvedMountedCaCertFile = resolveSecretLocation( + maybeMountedCaCertFile, + driverCaCertDataBase64, + DRIVER_CREDENTIALS_CA_CERT_PATH) + + val sparkConfWithCredentialLocations = driverSparkConf + .setOption( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX", + resolvedMountedCaCertFile) + .setOption( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX", + resolvedMountedClientKeyFile) + .setOption( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX", + resolvedMountedClientCertFile) + .setOption( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$OAUTH_TOKEN_FILE_CONF_SUFFIX", + resolvedMountedOAuthTokenFile) + + // Redact all OAuth token values + sparkConfWithCredentialLocations + .getAll + .filter(_._1.endsWith(OAUTH_TOKEN_CONF_SUFFIX)).map(_._1) + .foreach { + sparkConfWithCredentialLocations.set(_, "") + } + sparkConfWithCredentialLocations + } + + private def safeFileConfToBase64(conf: String, fileType: String): Option[String] = { + submissionSparkConf.getOption(conf) + .map(new File(_)) + .map { file => + require(file.isFile, String.format("%s provided at %s does not exist or is not a file.", + fileType, file.getAbsolutePath)) + BaseEncoding.base64().encode(Files.toByteArray(file)) + } + } + + private def resolveSecretLocation( + mountedUserSpecified: Option[String], + valueMountedFromSubmitter: Option[String], + mountedCanonicalLocation: String): Option[String] = { + mountedUserSpecified.orElse(valueMountedFromSubmitter.map { _ => + mountedCanonicalLocation + }) + } + + /** + * Resolve a Kubernetes secret data entry from an optional client credential used by the + * driver to talk to the Kubernetes API server. + * + * @param userSpecifiedCredential the optional user-specified client credential. + * @param secretName name of the Kubernetes secret storing the client credential. + * @return a secret data entry in the form of a map from the secret name to the secret data, + * which may be empty if the user-specified credential is empty. 
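+   *         For example (values below are illustrative), Some("dGVzdA==") with secret name
+   *         "oauth-token" yields Map("oauth-token" -> "dGVzdA=="), while None yields an
+   *         empty map.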
+ */ + private def resolveSecretData( + userSpecifiedCredential: Option[String], + secretName: String): Map[String, String] = { + userSpecifiedCredential.map { valueBase64 => + Map(secretName -> valueBase64) + }.getOrElse(Map.empty[String, String]) + } + + private implicit def augmentSparkConf(sparkConf: SparkConf): OptionSettableSparkConf = { + new OptionSettableSparkConf(sparkConf) + } +} + +private class OptionSettableSparkConf(sparkConf: SparkConf) { + def setOption(configEntry: String, option: Option[String]): SparkConf = { + option.foreach { opt => + sparkConf.set(configEntry, opt) + } + sparkConf + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala new file mode 100644 index 0000000000000..696d11f15ed95 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model.ServiceBuilder + +import org.apache.spark.SparkConf +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec +import org.apache.spark.internal.Logging +import org.apache.spark.util.Clock + +/** + * Allows the driver to be reachable by executor pods through a headless service. The service's + * ports should correspond to the ports that the executor will reach the pod at for RPC. 
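+ *
+ * For example (names are illustrative), with resource name prefix spark-pi-1234 and namespace
+ * default, the headless service is named spark-pi-1234-driver-svc and spark.driver.host is set
+ * to spark-pi-1234-driver-svc.default.svc.cluster.local.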
+ */ +private[spark] class DriverServiceBootstrapStep( + kubernetesResourceNamePrefix: String, + driverLabels: Map[String, String], + submissionSparkConf: SparkConf, + clock: Clock) extends DriverConfigurationStep with Logging { + import DriverServiceBootstrapStep._ + + override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { + require(submissionSparkConf.getOption(DRIVER_BIND_ADDRESS_KEY).isEmpty, + s"$DRIVER_BIND_ADDRESS_KEY is not supported in Kubernetes mode, as the driver's bind " + + "address is managed and set to the driver pod's IP address.") + require(submissionSparkConf.getOption(DRIVER_HOST_KEY).isEmpty, + s"$DRIVER_HOST_KEY is not supported in Kubernetes mode, as the driver's hostname will be " + + "managed via a Kubernetes service.") + + val preferredServiceName = s"$kubernetesResourceNamePrefix$DRIVER_SVC_POSTFIX" + val resolvedServiceName = if (preferredServiceName.length <= MAX_SERVICE_NAME_LENGTH) { + preferredServiceName + } else { + val randomServiceId = clock.getTimeMillis() + val shorterServiceName = s"spark-$randomServiceId$DRIVER_SVC_POSTFIX" + logWarning(s"Driver's hostname would preferably be $preferredServiceName, but this is " + + s"too long (must be <= $MAX_SERVICE_NAME_LENGTH characters). Falling back to use " + + s"$shorterServiceName as the driver service's name.") + shorterServiceName + } + + val driverPort = submissionSparkConf.getInt("spark.driver.port", DEFAULT_DRIVER_PORT) + val driverBlockManagerPort = submissionSparkConf.getInt( + org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT.key, DEFAULT_BLOCKMANAGER_PORT) + val driverService = new ServiceBuilder() + .withNewMetadata() + .withName(resolvedServiceName) + .endMetadata() + .withNewSpec() + .withClusterIP("None") + .withSelector(driverLabels.asJava) + .addNewPort() + .withName(DRIVER_PORT_NAME) + .withPort(driverPort) + .withNewTargetPort(driverPort) + .endPort() + .addNewPort() + .withName(BLOCK_MANAGER_PORT_NAME) + .withPort(driverBlockManagerPort) + .withNewTargetPort(driverBlockManagerPort) + .endPort() + .endSpec() + .build() + + val namespace = submissionSparkConf.get(KUBERNETES_NAMESPACE) + val driverHostname = s"${driverService.getMetadata.getName}.$namespace.svc.cluster.local" + val resolvedSparkConf = driverSpec.driverSparkConf.clone() + .set(DRIVER_HOST_KEY, driverHostname) + .set("spark.driver.port", driverPort.toString) + .set( + org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT, driverBlockManagerPort) + + driverSpec.copy( + driverSparkConf = resolvedSparkConf, + otherKubernetesResources = driverSpec.otherKubernetesResources ++ Seq(driverService)) + } +} + +private[spark] object DriverServiceBootstrapStep { + val DRIVER_BIND_ADDRESS_KEY = org.apache.spark.internal.config.DRIVER_BIND_ADDRESS.key + val DRIVER_HOST_KEY = org.apache.spark.internal.config.DRIVER_HOST_ADDRESS.key + val DRIVER_SVC_POSTFIX = "-driver-svc" + val MAX_SERVICE_NAME_LENGTH = 63 +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index f79155b117b67..9d8f3b912c33d 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -24,6 +24,7 @@ import org.apache.spark.{SparkConf, SparkException} import 
org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.ConfigurationUtils import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.internal.config.{EXECUTOR_CLASS_PATH, EXECUTOR_JAVA_OPTIONS, EXECUTOR_MEMORY, EXECUTOR_MEMORY_OVERHEAD} import org.apache.spark.util.Utils /** @@ -46,8 +47,7 @@ private[spark] trait ExecutorPodFactory { private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) extends ExecutorPodFactory { - private val executorExtraClasspath = - sparkConf.get(org.apache.spark.internal.config.EXECUTOR_CLASS_PATH) + private val executorExtraClasspath = sparkConf.get(EXECUTOR_CLASS_PATH) private val executorLabels = ConfigurationUtils.parsePrefixedKeyValuePairs( sparkConf, @@ -81,13 +81,12 @@ private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) private val executorPodNamePrefix = sparkConf.get(KUBERNETES_EXECUTOR_POD_NAME_PREFIX) - private val executorMemoryMiB = sparkConf.get(org.apache.spark.internal.config.EXECUTOR_MEMORY) + private val executorMemoryMiB = sparkConf.get(EXECUTOR_MEMORY) private val executorMemoryString = sparkConf.get( - org.apache.spark.internal.config.EXECUTOR_MEMORY.key, - org.apache.spark.internal.config.EXECUTOR_MEMORY.defaultValueString) + EXECUTOR_MEMORY.key, EXECUTOR_MEMORY.defaultValueString) private val memoryOverheadMiB = sparkConf - .get(KUBERNETES_EXECUTOR_MEMORY_OVERHEAD) + .get(EXECUTOR_MEMORY_OVERHEAD) .getOrElse(math.max((MEMORY_OVERHEAD_FACTOR * executorMemoryMiB).toInt, MEMORY_OVERHEAD_MIN_MIB)) private val executorMemoryWithOverhead = executorMemoryMiB + memoryOverheadMiB @@ -129,7 +128,7 @@ private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) .build() } val executorExtraJavaOptionsEnv = sparkConf - .get(org.apache.spark.internal.config.EXECUTOR_JAVA_OPTIONS) + .get(EXECUTOR_JAVA_OPTIONS) .map { opts => val delimitedOpts = Utils.splitCommandString(opts) delimitedOpts.zipWithIndex.map { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala index 68ca6a7622171..b8bb152d17910 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala @@ -20,7 +20,7 @@ import java.io.File import io.fabric8.kubernetes.client.Config -import org.apache.spark.SparkContext +import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.SparkKubernetesClientFactory @@ -33,6 +33,10 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager wit override def canCreate(masterURL: String): Boolean = masterURL.startsWith("k8s") override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { + if (masterURL.startsWith("k8s") && sc.deployMode == "client") { + throw new SparkException("Client mode is currently not supported for Kubernetes.") + } + new TaskSchedulerImpl(sc) } @@ -45,7 +49,7 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager wit val kubernetesClient = SparkKubernetesClientFactory.createKubernetesClient( KUBERNETES_MASTER_INTERNAL_URL, Some(sparkConf.get(KUBERNETES_NAMESPACE)), - 
APISERVER_AUTH_DRIVER_MOUNTED_CONF_PREFIX, + KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX, sparkConf, Some(new File(Config.KUBERNETES_SERVICE_ACCOUNT_TOKEN_PATH)), Some(new File(Config.KUBERNETES_SERVICE_ACCOUNT_CA_CRT_PATH))) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/ClientSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/ClientSuite.scala new file mode 100644 index 0000000000000..bf4ec04893204 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/ClientSuite.scala @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit + +import scala.collection.JavaConverters._ + +import com.google.common.collect.Iterables +import io.fabric8.kubernetes.api.model._ +import io.fabric8.kubernetes.client.{KubernetesClient, Watch} +import io.fabric8.kubernetes.client.dsl.{MixedOperation, NamespaceListVisitFromServerGetDeleteRecreateWaitApplicable, PodResource} +import org.mockito.{ArgumentCaptor, Mock, MockitoAnnotations} +import org.mockito.Mockito.{doReturn, verify, when} +import org.mockito.invocation.InvocationOnMock +import org.mockito.stubbing.Answer +import org.scalatest.BeforeAndAfter +import org.scalatest.mockito.MockitoSugar._ + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.submit.steps.DriverConfigurationStep + +class ClientSuite extends SparkFunSuite with BeforeAndAfter { + + private val DRIVER_POD_UID = "pod-id" + private val DRIVER_POD_API_VERSION = "v1" + private val DRIVER_POD_KIND = "pod" + + private type ResourceList = NamespaceListVisitFromServerGetDeleteRecreateWaitApplicable[ + HasMetadata, Boolean] + private type Pods = MixedOperation[Pod, PodList, DoneablePod, PodResource[Pod, DoneablePod]] + + @Mock + private var kubernetesClient: KubernetesClient = _ + + @Mock + private var podOperations: Pods = _ + + @Mock + private var namedPods: PodResource[Pod, DoneablePod] = _ + + @Mock + private var loggingPodStatusWatcher: LoggingPodStatusWatcher = _ + + @Mock + private var resourceList: ResourceList = _ + + private val submissionSteps = Seq(FirstTestConfigurationStep, SecondTestConfigurationStep) + private var createdPodArgumentCaptor: ArgumentCaptor[Pod] = _ + private var createdResourcesArgumentCaptor: ArgumentCaptor[HasMetadata] = _ + + before { + MockitoAnnotations.initMocks(this) + when(kubernetesClient.pods()).thenReturn(podOperations) + when(podOperations.withName(FirstTestConfigurationStep.podName)).thenReturn(namedPods) + + createdPodArgumentCaptor = ArgumentCaptor.forClass(classOf[Pod]) + createdResourcesArgumentCaptor = 
ArgumentCaptor.forClass(classOf[HasMetadata]) + when(podOperations.create(createdPodArgumentCaptor.capture())).thenAnswer(new Answer[Pod] { + override def answer(invocation: InvocationOnMock): Pod = { + new PodBuilder(invocation.getArgumentAt(0, classOf[Pod])) + .editMetadata() + .withUid(DRIVER_POD_UID) + .endMetadata() + .withApiVersion(DRIVER_POD_API_VERSION) + .withKind(DRIVER_POD_KIND) + .build() + } + }) + when(podOperations.withName(FirstTestConfigurationStep.podName)).thenReturn(namedPods) + when(namedPods.watch(loggingPodStatusWatcher)).thenReturn(mock[Watch]) + doReturn(resourceList) + .when(kubernetesClient) + .resourceList(createdResourcesArgumentCaptor.capture()) + } + + test("The client should configure the pod with the submission steps.") { + val submissionClient = new Client( + submissionSteps, + new SparkConf(false), + kubernetesClient, + false, + "spark", + loggingPodStatusWatcher) + submissionClient.run() + val createdPod = createdPodArgumentCaptor.getValue + assert(createdPod.getMetadata.getName === FirstTestConfigurationStep.podName) + assert(createdPod.getMetadata.getLabels.asScala === + Map(FirstTestConfigurationStep.labelKey -> FirstTestConfigurationStep.labelValue)) + assert(createdPod.getMetadata.getAnnotations.asScala === + Map(SecondTestConfigurationStep.annotationKey -> + SecondTestConfigurationStep.annotationValue)) + assert(createdPod.getSpec.getContainers.size() === 1) + assert(createdPod.getSpec.getContainers.get(0).getName === + SecondTestConfigurationStep.containerName) + } + + test("The client should create the secondary Kubernetes resources.") { + val submissionClient = new Client( + submissionSteps, + new SparkConf(false), + kubernetesClient, + false, + "spark", + loggingPodStatusWatcher) + submissionClient.run() + val createdPod = createdPodArgumentCaptor.getValue + val otherCreatedResources = createdResourcesArgumentCaptor.getAllValues + assert(otherCreatedResources.size === 1) + val createdResource = Iterables.getOnlyElement(otherCreatedResources).asInstanceOf[Secret] + assert(createdResource.getMetadata.getName === FirstTestConfigurationStep.secretName) + assert(createdResource.getData.asScala === + Map(FirstTestConfigurationStep.secretKey -> FirstTestConfigurationStep.secretData)) + val ownerReference = Iterables.getOnlyElement(createdResource.getMetadata.getOwnerReferences) + assert(ownerReference.getName === createdPod.getMetadata.getName) + assert(ownerReference.getKind === DRIVER_POD_KIND) + assert(ownerReference.getUid === DRIVER_POD_UID) + assert(ownerReference.getApiVersion === DRIVER_POD_API_VERSION) + } + + test("The client should attach the driver container with the appropriate JVM options.") { + val sparkConf = new SparkConf(false) + .set("spark.logConf", "true") + .set( + org.apache.spark.internal.config.DRIVER_JAVA_OPTIONS, + "-XX:+HeapDumpOnOutOfMemoryError -XX:+PrintGCDetails") + val submissionClient = new Client( + submissionSteps, + sparkConf, + kubernetesClient, + false, + "spark", + loggingPodStatusWatcher) + submissionClient.run() + val createdPod = createdPodArgumentCaptor.getValue + val driverContainer = Iterables.getOnlyElement(createdPod.getSpec.getContainers) + assert(driverContainer.getName === SecondTestConfigurationStep.containerName) + val driverJvmOptsEnvs = driverContainer.getEnv.asScala.filter { env => + env.getName.startsWith(ENV_JAVA_OPT_PREFIX) + }.sortBy(_.getName) + assert(driverJvmOptsEnvs.size === 4) + + val expectedJvmOptsValues = Seq( + "-Dspark.logConf=true", + 
s"-D${SecondTestConfigurationStep.sparkConfKey}=" + + s"${SecondTestConfigurationStep.sparkConfValue}", + "-XX:+HeapDumpOnOutOfMemoryError", + "-XX:+PrintGCDetails") + driverJvmOptsEnvs.zip(expectedJvmOptsValues).zipWithIndex.foreach { + case ((resolvedEnv, expectedJvmOpt), index) => + assert(resolvedEnv.getName === s"$ENV_JAVA_OPT_PREFIX$index") + assert(resolvedEnv.getValue === expectedJvmOpt) + } + } + + test("Waiting for app completion should stall on the watcher") { + val submissionClient = new Client( + submissionSteps, + new SparkConf(false), + kubernetesClient, + true, + "spark", + loggingPodStatusWatcher) + submissionClient.run() + verify(loggingPodStatusWatcher).awaitCompletion() + } + +} + +private object FirstTestConfigurationStep extends DriverConfigurationStep { + + val podName = "test-pod" + val secretName = "test-secret" + val labelKey = "first-submit" + val labelValue = "true" + val secretKey = "secretKey" + val secretData = "secretData" + + override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { + val modifiedPod = new PodBuilder(driverSpec.driverPod) + .editMetadata() + .withName(podName) + .addToLabels(labelKey, labelValue) + .endMetadata() + .build() + val additionalResource = new SecretBuilder() + .withNewMetadata() + .withName(secretName) + .endMetadata() + .addToData(secretKey, secretData) + .build() + driverSpec.copy( + driverPod = modifiedPod, + otherKubernetesResources = driverSpec.otherKubernetesResources ++ Seq(additionalResource)) + } +} + +private object SecondTestConfigurationStep extends DriverConfigurationStep { + + val annotationKey = "second-submit" + val annotationValue = "submitted" + val sparkConfKey = "spark.custom-conf" + val sparkConfValue = "custom-conf-value" + val containerName = "driverContainer" + + override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { + val modifiedPod = new PodBuilder(driverSpec.driverPod) + .editMetadata() + .addToAnnotations(annotationKey, annotationValue) + .endMetadata() + .build() + val resolvedSparkConf = driverSpec.driverSparkConf.clone().set(sparkConfKey, sparkConfValue) + val modifiedContainer = new ContainerBuilder(driverSpec.driverContainer) + .withName(containerName) + .build() + driverSpec.copy( + driverPod = modifiedPod, + driverSparkConf = resolvedSparkConf, + driverContainer = modifiedContainer) + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala new file mode 100644 index 0000000000000..c7291d49b465e --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.Config.DRIVER_DOCKER_IMAGE +import org.apache.spark.deploy.k8s.submit.steps._ + +class DriverConfigurationStepsOrchestratorSuite extends SparkFunSuite { + + private val NAMESPACE = "default" + private val DRIVER_IMAGE = "driver-image" + private val APP_ID = "spark-app-id" + private val LAUNCH_TIME = 975256L + private val APP_NAME = "spark" + private val MAIN_CLASS = "org.apache.spark.examples.SparkPi" + private val APP_ARGS = Array("arg1", "arg2") + + test("Base submission steps with a main app resource.") { + val sparkConf = new SparkConf(false) + .set(DRIVER_DOCKER_IMAGE, DRIVER_IMAGE) + val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") + val orchestrator = new DriverConfigurationStepsOrchestrator( + NAMESPACE, + APP_ID, + LAUNCH_TIME, + Some(mainAppResource), + APP_NAME, + MAIN_CLASS, + APP_ARGS, + sparkConf) + validateStepTypes( + orchestrator, + classOf[BaseDriverConfigurationStep], + classOf[DriverServiceBootstrapStep], + classOf[DriverKubernetesCredentialsStep], + classOf[DependencyResolutionStep] + ) + } + + test("Base submission steps without a main app resource.") { + val sparkConf = new SparkConf(false) + .set(DRIVER_DOCKER_IMAGE, DRIVER_IMAGE) + val orchestrator = new DriverConfigurationStepsOrchestrator( + NAMESPACE, + APP_ID, + LAUNCH_TIME, + Option.empty, + APP_NAME, + MAIN_CLASS, + APP_ARGS, + sparkConf) + validateStepTypes( + orchestrator, + classOf[BaseDriverConfigurationStep], + classOf[DriverServiceBootstrapStep], + classOf[DriverKubernetesCredentialsStep] + ) + } + + private def validateStepTypes( + orchestrator: DriverConfigurationStepsOrchestrator, + types: Class[_ <: DriverConfigurationStep]*): Unit = { + val steps = orchestrator.getAllConfigurationSteps() + assert(steps.size === types.size) + assert(steps.map(_.getClass) === types) + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala new file mode 100644 index 0000000000000..83c5f98254829 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model.{ContainerBuilder, HasMetadata, PodBuilder} + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec + +class BaseDriverConfigurationStepSuite extends SparkFunSuite { + + private val APP_ID = "spark-app-id" + private val RESOURCE_NAME_PREFIX = "spark" + private val DRIVER_LABELS = Map("labelkey" -> "labelvalue") + private val DOCKER_IMAGE_PULL_POLICY = "IfNotPresent" + private val APP_NAME = "spark-test" + private val MAIN_CLASS = "org.apache.spark.examples.SparkPi" + private val APP_ARGS = Array("arg1", "arg2", "arg 3") + private val CUSTOM_ANNOTATION_KEY = "customAnnotation" + private val CUSTOM_ANNOTATION_VALUE = "customAnnotationValue" + private val DRIVER_CUSTOM_ENV_KEY1 = "customDriverEnv1" + private val DRIVER_CUSTOM_ENV_KEY2 = "customDriverEnv2" + + test("Set all possible configurations from the user.") { + val sparkConf = new SparkConf() + .set(KUBERNETES_DRIVER_POD_NAME, "spark-driver-pod") + .set(org.apache.spark.internal.config.DRIVER_CLASS_PATH, "/opt/spark/spark-examples.jar") + .set("spark.driver.cores", "2") + .set(KUBERNETES_DRIVER_LIMIT_CORES, "4") + .set(org.apache.spark.internal.config.DRIVER_MEMORY.key, "256M") + .set(org.apache.spark.internal.config.DRIVER_MEMORY_OVERHEAD, 200L) + .set(DRIVER_DOCKER_IMAGE, "spark-driver:latest") + .set(s"$KUBERNETES_DRIVER_ANNOTATION_PREFIX$CUSTOM_ANNOTATION_KEY", CUSTOM_ANNOTATION_VALUE) + .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY1", "customDriverEnv1") + .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY2", "customDriverEnv2") + + val submissionStep = new BaseDriverConfigurationStep( + APP_ID, + RESOURCE_NAME_PREFIX, + DRIVER_LABELS, + DOCKER_IMAGE_PULL_POLICY, + APP_NAME, + MAIN_CLASS, + APP_ARGS, + sparkConf) + val basePod = new PodBuilder().withNewMetadata().endMetadata().withNewSpec().endSpec().build() + val baseDriverSpec = KubernetesDriverSpec( + driverPod = basePod, + driverContainer = new ContainerBuilder().build(), + driverSparkConf = new SparkConf(false), + otherKubernetesResources = Seq.empty[HasMetadata]) + val preparedDriverSpec = submissionStep.configureDriver(baseDriverSpec) + + assert(preparedDriverSpec.driverContainer.getName === DRIVER_CONTAINER_NAME) + assert(preparedDriverSpec.driverContainer.getImage === "spark-driver:latest") + assert(preparedDriverSpec.driverContainer.getImagePullPolicy === DOCKER_IMAGE_PULL_POLICY) + + assert(preparedDriverSpec.driverContainer.getEnv.size === 7) + val envs = preparedDriverSpec.driverContainer + .getEnv + .asScala + .map(env => (env.getName, env.getValue)) + .toMap + assert(envs(ENV_SUBMIT_EXTRA_CLASSPATH) === "/opt/spark/spark-examples.jar") + assert(envs(ENV_DRIVER_MEMORY) === "256M") + assert(envs(ENV_DRIVER_MAIN_CLASS) === MAIN_CLASS) + assert(envs(ENV_DRIVER_ARGS) === "\"arg1\" \"arg2\" \"arg 3\"") + assert(envs(DRIVER_CUSTOM_ENV_KEY1) === 
"customDriverEnv1") + assert(envs(DRIVER_CUSTOM_ENV_KEY2) === "customDriverEnv2") + + assert(preparedDriverSpec.driverContainer.getEnv.asScala.exists(envVar => + envVar.getName.equals(ENV_DRIVER_BIND_ADDRESS) && + envVar.getValueFrom.getFieldRef.getApiVersion.equals("v1") && + envVar.getValueFrom.getFieldRef.getFieldPath.equals("status.podIP"))) + + val resourceRequirements = preparedDriverSpec.driverContainer.getResources + val requests = resourceRequirements.getRequests.asScala + assert(requests("cpu").getAmount === "2") + assert(requests("memory").getAmount === "256Mi") + val limits = resourceRequirements.getLimits.asScala + assert(limits("memory").getAmount === "456Mi") + assert(limits("cpu").getAmount === "4") + + val driverPodMetadata = preparedDriverSpec.driverPod.getMetadata + assert(driverPodMetadata.getName === "spark-driver-pod") + assert(driverPodMetadata.getLabels.asScala === DRIVER_LABELS) + val expectedAnnotations = Map( + CUSTOM_ANNOTATION_KEY -> CUSTOM_ANNOTATION_VALUE, + SPARK_APP_NAME_ANNOTATION -> APP_NAME) + assert(driverPodMetadata.getAnnotations.asScala === expectedAnnotations) + assert(preparedDriverSpec.driverPod.getSpec.getRestartPolicy === "Never") + + val resolvedSparkConf = preparedDriverSpec.driverSparkConf.getAll.toMap + val expectedSparkConf = Map( + KUBERNETES_DRIVER_POD_NAME.key -> "spark-driver-pod", + "spark.app.id" -> APP_ID, + KUBERNETES_EXECUTOR_POD_NAME_PREFIX.key -> RESOURCE_NAME_PREFIX) + assert(resolvedSparkConf === expectedSparkConf) + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStepSuite.scala new file mode 100644 index 0000000000000..991b03cafb76c --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStepSuite.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.deploy.k8s.submit.steps + +import java.io.File + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model.{ContainerBuilder, HasMetadata, PodBuilder} + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec + +class DependencyResolutionStepSuite extends SparkFunSuite { + + private val SPARK_JARS = Seq( + "hdfs://localhost:9000/apps/jars/jar1.jar", + "file:///home/user/apps/jars/jar2.jar", + "local:///var/apps/jars/jar3.jar") + + private val SPARK_FILES = Seq( + "file:///home/user/apps/files/file1.txt", + "hdfs://localhost:9000/apps/files/file2.txt", + "local:///var/apps/files/file3.txt") + + private val JARS_DOWNLOAD_PATH = "/mnt/spark-data/jars" + private val FILES_DOWNLOAD_PATH = "/mnt/spark-data/files" + + test("Added dependencies should be resolved in Spark configuration and environment") { + val dependencyResolutionStep = new DependencyResolutionStep( + SPARK_JARS, + SPARK_FILES, + JARS_DOWNLOAD_PATH, + FILES_DOWNLOAD_PATH) + val driverPod = new PodBuilder().build() + val baseDriverSpec = KubernetesDriverSpec( + driverPod = driverPod, + driverContainer = new ContainerBuilder().build(), + driverSparkConf = new SparkConf(false), + otherKubernetesResources = Seq.empty[HasMetadata]) + val preparedDriverSpec = dependencyResolutionStep.configureDriver(baseDriverSpec) + assert(preparedDriverSpec.driverPod === driverPod) + assert(preparedDriverSpec.otherKubernetesResources.isEmpty) + val resolvedSparkJars = preparedDriverSpec.driverSparkConf.get("spark.jars").split(",").toSet + val expectedResolvedSparkJars = Set( + "hdfs://localhost:9000/apps/jars/jar1.jar", + s"$JARS_DOWNLOAD_PATH/jar2.jar", + "/var/apps/jars/jar3.jar") + assert(resolvedSparkJars === expectedResolvedSparkJars) + val resolvedSparkFiles = preparedDriverSpec.driverSparkConf.get("spark.files").split(",").toSet + val expectedResolvedSparkFiles = Set( + s"$FILES_DOWNLOAD_PATH/file1.txt", + s"hdfs://localhost:9000/apps/files/file2.txt", + s"/var/apps/files/file3.txt") + assert(resolvedSparkFiles === expectedResolvedSparkFiles) + val driverEnv = preparedDriverSpec.driverContainer.getEnv.asScala + assert(driverEnv.size === 1) + assert(driverEnv.head.getName === ENV_MOUNTED_CLASSPATH) + val resolvedDriverClasspath = driverEnv.head.getValue.split(File.pathSeparator).toSet + val expectedResolvedDriverClasspath = Set( + s"$JARS_DOWNLOAD_PATH/jar1.jar", + s"$JARS_DOWNLOAD_PATH/jar2.jar", + "/var/apps/jars/jar3.jar") + assert(resolvedDriverClasspath === expectedResolvedDriverClasspath) + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStepSuite.scala new file mode 100644 index 0000000000000..64553d25883bb --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStepSuite.scala @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import java.io.File + +import scala.collection.JavaConverters._ + +import com.google.common.base.Charsets +import com.google.common.io.{BaseEncoding, Files} +import io.fabric8.kubernetes.api.model.{ContainerBuilder, HasMetadata, PodBuilder, Secret} +import org.scalatest.BeforeAndAfter + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec +import org.apache.spark.util.Utils + +class DriverKubernetesCredentialsStepSuite extends SparkFunSuite with BeforeAndAfter { + + private val KUBERNETES_RESOURCE_NAME_PREFIX = "spark" + private var credentialsTempDirectory: File = _ + private val BASE_DRIVER_SPEC = new KubernetesDriverSpec( + driverPod = new PodBuilder().build(), + driverContainer = new ContainerBuilder().build(), + driverSparkConf = new SparkConf(false), + otherKubernetesResources = Seq.empty[HasMetadata]) + + before { + credentialsTempDirectory = Utils.createTempDir() + } + + after { + credentialsTempDirectory.delete() + } + + test("Don't set any credentials") { + val kubernetesCredentialsStep = new DriverKubernetesCredentialsStep( + new SparkConf(false), KUBERNETES_RESOURCE_NAME_PREFIX) + val preparedDriverSpec = kubernetesCredentialsStep.configureDriver(BASE_DRIVER_SPEC) + assert(preparedDriverSpec.driverPod === BASE_DRIVER_SPEC.driverPod) + assert(preparedDriverSpec.driverContainer === BASE_DRIVER_SPEC.driverContainer) + assert(preparedDriverSpec.otherKubernetesResources.isEmpty) + assert(preparedDriverSpec.driverSparkConf.getAll.isEmpty) + } + + test("Only set credentials that are manually mounted.") { + val submissionSparkConf = new SparkConf(false) + .set( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$OAUTH_TOKEN_FILE_CONF_SUFFIX", + "/mnt/secrets/my-token.txt") + .set( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX", + "/mnt/secrets/my-key.pem") + .set( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX", + "/mnt/secrets/my-cert.pem") + .set( + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX", + "/mnt/secrets/my-ca.pem") + + val kubernetesCredentialsStep = new DriverKubernetesCredentialsStep( + submissionSparkConf, KUBERNETES_RESOURCE_NAME_PREFIX) + val preparedDriverSpec = kubernetesCredentialsStep.configureDriver(BASE_DRIVER_SPEC) + assert(preparedDriverSpec.driverPod === BASE_DRIVER_SPEC.driverPod) + assert(preparedDriverSpec.driverContainer === BASE_DRIVER_SPEC.driverContainer) + assert(preparedDriverSpec.otherKubernetesResources.isEmpty) + assert(preparedDriverSpec.driverSparkConf.getAll.toMap === submissionSparkConf.getAll.toMap) + } + + test("Mount credentials from the submission client as a secret.") { + val caCertFile = writeCredentials("ca.pem", "ca-cert") + val clientKeyFile = writeCredentials("key.pem", "key") + val clientCertFile = writeCredentials("cert.pem", "cert") + val submissionSparkConf = new SparkConf(false) + .set( + 
s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$OAUTH_TOKEN_CONF_SUFFIX", + "token") + .set( + s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX", + clientKeyFile.getAbsolutePath) + .set( + s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX", + clientCertFile.getAbsolutePath) + .set( + s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX", + caCertFile.getAbsolutePath) + val kubernetesCredentialsStep = new DriverKubernetesCredentialsStep( + submissionSparkConf, KUBERNETES_RESOURCE_NAME_PREFIX) + val preparedDriverSpec = kubernetesCredentialsStep.configureDriver( + BASE_DRIVER_SPEC.copy(driverSparkConf = submissionSparkConf)) + val expectedSparkConf = Map( + s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$OAUTH_TOKEN_CONF_SUFFIX" -> "", + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$OAUTH_TOKEN_FILE_CONF_SUFFIX" -> + DRIVER_CREDENTIALS_OAUTH_TOKEN_PATH, + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX" -> + DRIVER_CREDENTIALS_CLIENT_KEY_PATH, + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX" -> + DRIVER_CREDENTIALS_CLIENT_CERT_PATH, + s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX" -> + DRIVER_CREDENTIALS_CA_CERT_PATH, + s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX" -> + clientKeyFile.getAbsolutePath, + s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX" -> + clientCertFile.getAbsolutePath, + s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX" -> + caCertFile.getAbsolutePath) + assert(preparedDriverSpec.driverSparkConf.getAll.toMap === expectedSparkConf) + assert(preparedDriverSpec.otherKubernetesResources.size === 1) + val credentialsSecret = preparedDriverSpec.otherKubernetesResources.head.asInstanceOf[Secret] + assert(credentialsSecret.getMetadata.getName === + s"$KUBERNETES_RESOURCE_NAME_PREFIX-kubernetes-credentials") + val decodedSecretData = credentialsSecret.getData.asScala.map { data => + (data._1, new String(BaseEncoding.base64().decode(data._2), Charsets.UTF_8)) + } + val expectedSecretData = Map( + DRIVER_CREDENTIALS_CA_CERT_SECRET_NAME -> "ca-cert", + DRIVER_CREDENTIALS_OAUTH_TOKEN_SECRET_NAME -> "token", + DRIVER_CREDENTIALS_CLIENT_KEY_SECRET_NAME -> "key", + DRIVER_CREDENTIALS_CLIENT_CERT_SECRET_NAME -> "cert") + assert(decodedSecretData === expectedSecretData) + val driverPodVolumes = preparedDriverSpec.driverPod.getSpec.getVolumes.asScala + assert(driverPodVolumes.size === 1) + assert(driverPodVolumes.head.getName === DRIVER_CREDENTIALS_SECRET_VOLUME_NAME) + assert(driverPodVolumes.head.getSecret != null) + assert(driverPodVolumes.head.getSecret.getSecretName === credentialsSecret.getMetadata.getName) + val driverContainerVolumeMount = preparedDriverSpec.driverContainer.getVolumeMounts.asScala + assert(driverContainerVolumeMount.size === 1) + assert(driverContainerVolumeMount.head.getName === DRIVER_CREDENTIALS_SECRET_VOLUME_NAME) + assert(driverContainerVolumeMount.head.getMountPath === DRIVER_CREDENTIALS_SECRETS_BASE_DIR) + } + + private def writeCredentials(credentialsFileName: String, credentialsContents: String): File = { + val credentialsFile = new File(credentialsTempDirectory, credentialsFileName) + Files.write(credentialsContents, credentialsFile, Charsets.UTF_8) + credentialsFile + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala 
b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala new file mode 100644 index 0000000000000..006ce2668f8a0 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model.Service +import org.mockito.{Mock, MockitoAnnotations} +import org.mockito.Mockito.when +import org.scalatest.BeforeAndAfter + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec +import org.apache.spark.util.Clock + +class DriverServiceBootstrapStepSuite extends SparkFunSuite with BeforeAndAfter { + + private val SHORT_RESOURCE_NAME_PREFIX = + "a" * (DriverServiceBootstrapStep.MAX_SERVICE_NAME_LENGTH - + DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX.length) + + private val LONG_RESOURCE_NAME_PREFIX = + "a" * (DriverServiceBootstrapStep.MAX_SERVICE_NAME_LENGTH - + DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX.length + 1) + private val DRIVER_LABELS = Map( + "label1key" -> "label1value", + "label2key" -> "label2value") + + @Mock + private var clock: Clock = _ + + private var sparkConf: SparkConf = _ + + before { + MockitoAnnotations.initMocks(this) + sparkConf = new SparkConf(false) + } + + test("Headless service has a port for the driver RPC and the block manager.") { + val configurationStep = new DriverServiceBootstrapStep( + SHORT_RESOURCE_NAME_PREFIX, + DRIVER_LABELS, + sparkConf + .set("spark.driver.port", "9000") + .set(org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT, 8080), + clock) + val baseDriverSpec = KubernetesDriverSpec.initialSpec(sparkConf.clone()) + val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) + assert(resolvedDriverSpec.otherKubernetesResources.size === 1) + assert(resolvedDriverSpec.otherKubernetesResources.head.isInstanceOf[Service]) + val driverService = resolvedDriverSpec.otherKubernetesResources.head.asInstanceOf[Service] + verifyService( + 9000, + 8080, + s"$SHORT_RESOURCE_NAME_PREFIX${DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX}", + driverService) + } + + test("Hostname and ports are set according to the service name.") { + val configurationStep = new DriverServiceBootstrapStep( + SHORT_RESOURCE_NAME_PREFIX, + DRIVER_LABELS, + sparkConf + .set("spark.driver.port", "9000") + .set(org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT, 8080) + .set(KUBERNETES_NAMESPACE, "my-namespace"), + 
clock) + val baseDriverSpec = KubernetesDriverSpec.initialSpec(sparkConf.clone()) + val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) + val expectedServiceName = SHORT_RESOURCE_NAME_PREFIX + + DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX + val expectedHostName = s"$expectedServiceName.my-namespace.svc.cluster.local" + verifySparkConfHostNames(resolvedDriverSpec.driverSparkConf, expectedHostName) + } + + test("Ports should resolve to defaults in SparkConf and in the service.") { + val configurationStep = new DriverServiceBootstrapStep( + SHORT_RESOURCE_NAME_PREFIX, + DRIVER_LABELS, + sparkConf, + clock) + val baseDriverSpec = KubernetesDriverSpec.initialSpec(sparkConf.clone()) + val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) + verifyService( + DEFAULT_DRIVER_PORT, + DEFAULT_BLOCKMANAGER_PORT, + s"$SHORT_RESOURCE_NAME_PREFIX${DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX}", + resolvedDriverSpec.otherKubernetesResources.head.asInstanceOf[Service]) + assert(resolvedDriverSpec.driverSparkConf.get("spark.driver.port") === + DEFAULT_DRIVER_PORT.toString) + assert(resolvedDriverSpec.driverSparkConf.get( + org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT) === DEFAULT_BLOCKMANAGER_PORT) + } + + test("Long prefixes should switch to using a generated name.") { + val configurationStep = new DriverServiceBootstrapStep( + LONG_RESOURCE_NAME_PREFIX, + DRIVER_LABELS, + sparkConf.set(KUBERNETES_NAMESPACE, "my-namespace"), + clock) + when(clock.getTimeMillis()).thenReturn(10000) + val baseDriverSpec = KubernetesDriverSpec.initialSpec(sparkConf.clone()) + val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) + val driverService = resolvedDriverSpec.otherKubernetesResources.head.asInstanceOf[Service] + val expectedServiceName = s"spark-10000${DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX}" + assert(driverService.getMetadata.getName === expectedServiceName) + val expectedHostName = s"$expectedServiceName.my-namespace.svc.cluster.local" + verifySparkConfHostNames(resolvedDriverSpec.driverSparkConf, expectedHostName) + } + + test("Disallow bind address and driver host to be set explicitly.") { + val configurationStep = new DriverServiceBootstrapStep( + LONG_RESOURCE_NAME_PREFIX, + DRIVER_LABELS, + sparkConf.set(org.apache.spark.internal.config.DRIVER_BIND_ADDRESS, "host"), + clock) + try { + configurationStep.configureDriver(KubernetesDriverSpec.initialSpec(sparkConf)) + fail("The driver bind address should not be allowed.") + } catch { + case e: Throwable => + assert(e.getMessage === + s"requirement failed: ${DriverServiceBootstrapStep.DRIVER_BIND_ADDRESS_KEY} is" + + " not supported in Kubernetes mode, as the driver's bind address is managed" + + " and set to the driver pod's IP address.") + } + sparkConf.remove(org.apache.spark.internal.config.DRIVER_BIND_ADDRESS) + sparkConf.set(org.apache.spark.internal.config.DRIVER_HOST_ADDRESS, "host") + try { + configurationStep.configureDriver(KubernetesDriverSpec.initialSpec(sparkConf)) + fail("The driver host address should not be allowed.") + } catch { + case e: Throwable => + assert(e.getMessage === + s"requirement failed: ${DriverServiceBootstrapStep.DRIVER_HOST_KEY} is" + + " not supported in Kubernetes mode, as the driver's hostname will be managed via" + + " a Kubernetes service.") + } + } + + private def verifyService( + driverPort: Int, + blockManagerPort: Int, + expectedServiceName: String, + service: Service): Unit = { + assert(service.getMetadata.getName === 
expectedServiceName) + assert(service.getSpec.getClusterIP === "None") + assert(service.getSpec.getSelector.asScala === DRIVER_LABELS) + assert(service.getSpec.getPorts.size() === 2) + val driverServicePorts = service.getSpec.getPorts.asScala + assert(driverServicePorts.head.getName === DRIVER_PORT_NAME) + assert(driverServicePorts.head.getPort.intValue() === driverPort) + assert(driverServicePorts.head.getTargetPort.getIntVal === driverPort) + assert(driverServicePorts(1).getName === BLOCK_MANAGER_PORT_NAME) + assert(driverServicePorts(1).getPort.intValue() === blockManagerPort) + assert(driverServicePorts(1).getTargetPort.getIntVal === blockManagerPort) + } + + private def verifySparkConfHostNames( + driverSparkConf: SparkConf, expectedHostName: String): Unit = { + assert(driverSparkConf.get( + org.apache.spark.internal.config.DRIVER_HOST_ADDRESS) === expectedHostName) + } +} diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile new file mode 100644 index 0000000000000..d16349559466d --- /dev/null +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile @@ -0,0 +1,34 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM spark-base + +# Before building the docker image, first build and make a Spark distribution following +# the instructions in http://spark.apache.org/docs/latest/building-spark.html. +# If this docker file is being used in the context of building your images from a Spark +# distribution, the docker build command should be invoked from the top level directory +# of the Spark distribution. E.g.: +# docker build -t spark-driver:latest -f dockerfiles/driver/Dockerfile . + +COPY examples /opt/spark/examples + +CMD SPARK_CLASSPATH="${SPARK_HOME}/jars/*" && \ + env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt && \ + readarray -t SPARK_DRIVER_JAVA_OPTS < /tmp/java_opts.txt && \ + if ! [ -z ${SPARK_MOUNTED_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_MOUNTED_CLASSPATH:$SPARK_CLASSPATH"; fi && \ + if ! 
[ -z ${SPARK_SUBMIT_EXTRA_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_SUBMIT_EXTRA_CLASSPATH:$SPARK_CLASSPATH"; fi && \ + ${JAVA_HOME}/bin/java "${SPARK_DRIVER_JAVA_OPTS[@]}" -cp "$SPARK_CLASSPATH" -Xms$SPARK_DRIVER_MEMORY -Xmx$SPARK_DRIVER_MEMORY -Dspark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS $SPARK_DRIVER_CLASS $SPARK_DRIVER_ARGS diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile new file mode 100644 index 0000000000000..0e38169b8efdc --- /dev/null +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile @@ -0,0 +1,34 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM spark-base + +# Before building the docker image, first build and make a Spark distribution following +# the instructions in http://spark.apache.org/docs/latest/building-spark.html. +# If this docker file is being used in the context of building your images from a Spark +# distribution, the docker build command should be invoked from the top level directory +# of the Spark distribution. E.g.: +# docker build -t spark-executor:latest -f dockerfiles/executor/Dockerfile . + +COPY examples /opt/spark/examples + +CMD SPARK_CLASSPATH="${SPARK_HOME}/jars/*" && \ + env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt && \ + readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt && \ + if ! [ -z ${SPARK_MOUNTED_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_MOUNTED_CLASSPATH:$SPARK_CLASSPATH"; fi && \ + if ! [ -z ${SPARK_EXECUTOR_EXTRA_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_EXECUTOR_EXTRA_CLASSPATH:$SPARK_CLASSPATH"; fi && \ + ${JAVA_HOME}/bin/java "${SPARK_EXECUTOR_JAVA_OPTS[@]}" -Xms$SPARK_EXECUTOR_MEMORY -Xmx$SPARK_EXECUTOR_MEMORY -cp "$SPARK_CLASSPATH" org.apache.spark.executor.CoarseGrainedExecutorBackend --driver-url $SPARK_DRIVER_URL --executor-id $SPARK_EXECUTOR_ID --cores $SPARK_EXECUTOR_CORES --app-id $SPARK_APPLICATION_ID --hostname $SPARK_EXECUTOR_POD_IP diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile new file mode 100644 index 0000000000000..20316c9c5098a --- /dev/null +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile @@ -0,0 +1,47 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM openjdk:8-alpine + +# Before building the docker image, first build and make a Spark distribution following +# the instructions in http://spark.apache.org/docs/latest/building-spark.html. +# If this docker file is being used in the context of building your images from a Spark +# distribution, the docker build command should be invoked from the top level directory +# of the Spark distribution. E.g.: +# docker build -t spark-base:latest -f dockerfiles/spark-base/Dockerfile . + +RUN set -ex && \ + apk upgrade --no-cache && \ + apk add --no-cache bash tini libc6-compat && \ + mkdir -p /opt/spark && \ + mkdir -p /opt/spark/work-dir && \ + touch /opt/spark/RELEASE && \ + rm /bin/sh && \ + ln -sv /bin/bash /bin/sh && \ + chgrp root /etc/passwd && chmod ug+rw /etc/passwd + +COPY jars /opt/spark/jars +COPY bin /opt/spark/bin +COPY sbin /opt/spark/sbin +COPY conf /opt/spark/conf +COPY dockerfiles/spark-base/entrypoint.sh /opt/ + +ENV SPARK_HOME /opt/spark + +WORKDIR /opt/spark/work-dir + +ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh new file mode 100644 index 0000000000000..82559889f4beb --- /dev/null +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# echo commands to the terminal output +set -ex + +# Check whether there is a passwd entry for the container UID +myuid=$(id -u) +mygid=$(id -g) +uidentry=$(getent passwd $myuid) + +# If there is no passwd entry for the container UID, attempt to create one +if [ -z "$uidentry" ] ; then + if [ -w /etc/passwd ] ; then + echo "$myuid:x:$myuid:$mygid:anonymous uid:$SPARK_HOME:/bin/false" >> /etc/passwd + else + echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" + fi +fi + +# Execute the container CMD under tini for better hygiene +/sbin/tini -s -- "$@" diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala index e1af8ba087d6e..3ba3ae5ab4401 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config.scala @@ -217,20 +217,12 @@ package object config { .intConf .createWithDefault(1) - private[spark] val DRIVER_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.driver.memoryOverhead") - .bytesConf(ByteUnit.MiB) - .createOptional - /* Executor configuration. */ private[spark] val EXECUTOR_CORES = ConfigBuilder("spark.executor.cores") .intConf .createWithDefault(1) - private[spark] val EXECUTOR_MEMORY_OVERHEAD = ConfigBuilder("spark.yarn.executor.memoryOverhead") - .bytesConf(ByteUnit.MiB) - .createOptional - private[spark] val EXECUTOR_NODE_LABEL_EXPRESSION = ConfigBuilder("spark.yarn.executor.nodeLabelExpression") .doc("Node label expression for executors.") From 3d82f6eb782315b05453b3a0334d3bc05ab4298a Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 11 Dec 2017 15:55:23 -0800 Subject: [PATCH 085/356] [SPARK-22726][TEST] Basic tests for Binary Comparison and ImplicitTypeCasts ## What changes were proposed in this pull request? Before we deliver the Hive compatibility mode, we plan to write a set of test cases that can be easily run in both Spark and Hive sides. We can easily compare whether they are the same or not. When new typeCoercion rules are added, we also can easily track the changes. These test cases can also be backported to the previous Spark versions for determining the changes we made. This PR is the first attempt for improving the test coverage for type coercion compatibility. We generate these test cases for our binary comparison and ImplicitTypeCasts based on the Apache Derby test cases in https://github.com/apache/derby/blob/10.14/java/testing/org/apache/derbyTesting/functionTests/tests/lang/implicitConversions.sql ## How was this patch tested? N/A Author: gatorsmile Closes #19918 from gatorsmile/typeCoercionTests. 
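As an illustration of the golden-file format these inputs feed into: `SQLQueryTestSuite` runs every statement in a `.sql` input file and records the query, its analyzed schema, and its result in the matching `.sql.out` file under `sql-tests/results`. For example, the regenerated `predicate-functions.sql.out` below contains entries of this shape (the query, schema, and output here are copied verbatim from the diff that follows):

```
-- !query 4
select 1 > '1'
-- !query 4 schema
struct<(1 > CAST(1 AS INT)):boolean>
-- !query 4 output
false
```

Assuming the suite's usual golden-file workflow, the `.sql.out` files can be regenerated by re-running `SQLQueryTestSuite` with the `SPARK_GENERATE_GOLDEN_FILES=1` environment variable set.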
--- .../typeCoercion/native/binaryComparison.sql | 287 +++ .../typeCoercion/native/implicitTypeCasts.sql | 72 + .../sql-tests/results/datetime.sql.out | 2 + .../results/predicate-functions.sql.out | 152 +- .../native/binaryComparison.sql.out | 2146 +++++++++++++++++ .../native/implicitTypeCasts.sql.out | 354 +++ .../apache/spark/sql/SQLQueryTestSuite.scala | 2 +- .../apache/spark/sql/TPCDSQuerySuite.scala | 2 +- .../execution/HiveCompatibilitySuite.scala | 2 +- 9 files changed, 2940 insertions(+), 79 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/binaryComparison.sql create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/implicitTypeCasts.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/binaryComparison.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/binaryComparison.sql new file mode 100644 index 0000000000000..522322ac480be --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/binaryComparison.sql @@ -0,0 +1,287 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +-- Binary Comparison + +CREATE TEMPORARY VIEW t AS SELECT 1; + +SELECT cast(1 as binary) = '1' FROM t; +SELECT cast(1 as binary) > '2' FROM t; +SELECT cast(1 as binary) >= '2' FROM t; +SELECT cast(1 as binary) < '2' FROM t; +SELECT cast(1 as binary) <= '2' FROM t; +SELECT cast(1 as binary) <> '2' FROM t; +SELECT cast(1 as binary) = cast(null as string) FROM t; +SELECT cast(1 as binary) > cast(null as string) FROM t; +SELECT cast(1 as binary) >= cast(null as string) FROM t; +SELECT cast(1 as binary) < cast(null as string) FROM t; +SELECT cast(1 as binary) <= cast(null as string) FROM t; +SELECT cast(1 as binary) <> cast(null as string) FROM t; +SELECT '1' = cast(1 as binary) FROM t; +SELECT '2' > cast(1 as binary) FROM t; +SELECT '2' >= cast(1 as binary) FROM t; +SELECT '2' < cast(1 as binary) FROM t; +SELECT '2' <= cast(1 as binary) FROM t; +SELECT '2' <> cast(1 as binary) FROM t; +SELECT cast(null as string) = cast(1 as binary) FROM t; +SELECT cast(null as string) > cast(1 as binary) FROM t; +SELECT cast(null as string) >= cast(1 as binary) FROM t; +SELECT cast(null as string) < cast(1 as binary) FROM t; +SELECT cast(null as string) <= cast(1 as binary) FROM t; +SELECT cast(null as string) <> cast(1 as binary) FROM t; +SELECT cast(1 as tinyint) = '1' FROM t; +SELECT cast(1 as tinyint) > '2' FROM t; +SELECT cast(1 as tinyint) >= '2' FROM t; +SELECT cast(1 as tinyint) < '2' FROM t; +SELECT cast(1 as tinyint) <= '2' FROM t; +SELECT cast(1 as tinyint) <> '2' FROM t; +SELECT cast(1 as tinyint) = cast(null as string) FROM t; +SELECT cast(1 as tinyint) > cast(null as string) FROM t; +SELECT cast(1 as tinyint) >= cast(null as string) FROM t; +SELECT cast(1 as tinyint) < cast(null as string) FROM t; +SELECT cast(1 as tinyint) <= cast(null as string) FROM t; +SELECT cast(1 as tinyint) <> cast(null as string) FROM t; +SELECT '1' = cast(1 as tinyint) FROM t; +SELECT '2' > cast(1 as tinyint) FROM t; +SELECT '2' >= cast(1 as tinyint) FROM t; +SELECT '2' < cast(1 as tinyint) FROM t; +SELECT '2' <= cast(1 as tinyint) FROM t; +SELECT '2' <> cast(1 as tinyint) FROM t; +SELECT cast(null as string) = cast(1 as tinyint) FROM t; +SELECT cast(null as string) > cast(1 as tinyint) FROM t; +SELECT cast(null as string) >= cast(1 as tinyint) FROM t; +SELECT cast(null as string) < cast(1 as tinyint) FROM t; +SELECT cast(null as string) <= cast(1 as tinyint) FROM t; +SELECT cast(null as string) <> cast(1 as tinyint) FROM t; +SELECT cast(1 as smallint) = '1' FROM t; +SELECT cast(1 as smallint) > '2' FROM t; +SELECT cast(1 as smallint) >= '2' FROM t; +SELECT cast(1 as smallint) < '2' FROM t; +SELECT cast(1 as smallint) <= '2' FROM t; +SELECT cast(1 as smallint) <> '2' FROM t; +SELECT cast(1 as smallint) = cast(null as string) FROM t; +SELECT cast(1 as smallint) > cast(null as string) FROM t; +SELECT cast(1 as smallint) >= cast(null as string) FROM t; +SELECT cast(1 as smallint) < cast(null as string) FROM t; +SELECT cast(1 as smallint) <= cast(null as string) FROM t; +SELECT cast(1 as smallint) <> cast(null as string) FROM t; +SELECT '1' = cast(1 as smallint) FROM t; +SELECT '2' > cast(1 as smallint) FROM t; +SELECT '2' >= cast(1 as smallint) FROM t; +SELECT '2' < cast(1 as smallint) FROM t; +SELECT '2' <= cast(1 as smallint) FROM t; +SELECT '2' <> cast(1 as smallint) FROM t; +SELECT cast(null as string) = cast(1 as smallint) FROM t; +SELECT cast(null as string) > cast(1 as smallint) FROM t; +SELECT cast(null as string) >= cast(1 as smallint) FROM t; +SELECT cast(null as string) < cast(1 as smallint) FROM t; 
+SELECT cast(null as string) <= cast(1 as smallint) FROM t; +SELECT cast(null as string) <> cast(1 as smallint) FROM t; +SELECT cast(1 as int) = '1' FROM t; +SELECT cast(1 as int) > '2' FROM t; +SELECT cast(1 as int) >= '2' FROM t; +SELECT cast(1 as int) < '2' FROM t; +SELECT cast(1 as int) <= '2' FROM t; +SELECT cast(1 as int) <> '2' FROM t; +SELECT cast(1 as int) = cast(null as string) FROM t; +SELECT cast(1 as int) > cast(null as string) FROM t; +SELECT cast(1 as int) >= cast(null as string) FROM t; +SELECT cast(1 as int) < cast(null as string) FROM t; +SELECT cast(1 as int) <= cast(null as string) FROM t; +SELECT cast(1 as int) <> cast(null as string) FROM t; +SELECT '1' = cast(1 as int) FROM t; +SELECT '2' > cast(1 as int) FROM t; +SELECT '2' >= cast(1 as int) FROM t; +SELECT '2' < cast(1 as int) FROM t; +SELECT '2' <> cast(1 as int) FROM t; +SELECT '2' <= cast(1 as int) FROM t; +SELECT cast(null as string) = cast(1 as int) FROM t; +SELECT cast(null as string) > cast(1 as int) FROM t; +SELECT cast(null as string) >= cast(1 as int) FROM t; +SELECT cast(null as string) < cast(1 as int) FROM t; +SELECT cast(null as string) <> cast(1 as int) FROM t; +SELECT cast(null as string) <= cast(1 as int) FROM t; +SELECT cast(1 as bigint) = '1' FROM t; +SELECT cast(1 as bigint) > '2' FROM t; +SELECT cast(1 as bigint) >= '2' FROM t; +SELECT cast(1 as bigint) < '2' FROM t; +SELECT cast(1 as bigint) <= '2' FROM t; +SELECT cast(1 as bigint) <> '2' FROM t; +SELECT cast(1 as bigint) = cast(null as string) FROM t; +SELECT cast(1 as bigint) > cast(null as string) FROM t; +SELECT cast(1 as bigint) >= cast(null as string) FROM t; +SELECT cast(1 as bigint) < cast(null as string) FROM t; +SELECT cast(1 as bigint) <= cast(null as string) FROM t; +SELECT cast(1 as bigint) <> cast(null as string) FROM t; +SELECT '1' = cast(1 as bigint) FROM t; +SELECT '2' > cast(1 as bigint) FROM t; +SELECT '2' >= cast(1 as bigint) FROM t; +SELECT '2' < cast(1 as bigint) FROM t; +SELECT '2' <= cast(1 as bigint) FROM t; +SELECT '2' <> cast(1 as bigint) FROM t; +SELECT cast(null as string) = cast(1 as bigint) FROM t; +SELECT cast(null as string) > cast(1 as bigint) FROM t; +SELECT cast(null as string) >= cast(1 as bigint) FROM t; +SELECT cast(null as string) < cast(1 as bigint) FROM t; +SELECT cast(null as string) <= cast(1 as bigint) FROM t; +SELECT cast(null as string) <> cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) = '1' FROM t; +SELECT cast(1 as decimal(10, 0)) > '2' FROM t; +SELECT cast(1 as decimal(10, 0)) >= '2' FROM t; +SELECT cast(1 as decimal(10, 0)) < '2' FROM t; +SELECT cast(1 as decimal(10, 0)) <> '2' FROM t; +SELECT cast(1 as decimal(10, 0)) <= '2' FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(null as string) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(null as string) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(null as string) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(null as string) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(null as string) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(null as string) FROM t; +SELECT '1' = cast(1 as decimal(10, 0)) FROM t; +SELECT '2' > cast(1 as decimal(10, 0)) FROM t; +SELECT '2' >= cast(1 as decimal(10, 0)) FROM t; +SELECT '2' < cast(1 as decimal(10, 0)) FROM t; +SELECT '2' <= cast(1 as decimal(10, 0)) FROM t; +SELECT '2' <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(null as string) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(null as string) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(null as string) >= cast(1 as 
decimal(10, 0)) FROM t; +SELECT cast(null as string) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(null as string) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(null as string) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) = '1' FROM t; +SELECT cast(1 as double) > '2' FROM t; +SELECT cast(1 as double) >= '2' FROM t; +SELECT cast(1 as double) < '2' FROM t; +SELECT cast(1 as double) <= '2' FROM t; +SELECT cast(1 as double) <> '2' FROM t; +SELECT cast(1 as double) = cast(null as string) FROM t; +SELECT cast(1 as double) > cast(null as string) FROM t; +SELECT cast(1 as double) >= cast(null as string) FROM t; +SELECT cast(1 as double) < cast(null as string) FROM t; +SELECT cast(1 as double) <= cast(null as string) FROM t; +SELECT cast(1 as double) <> cast(null as string) FROM t; +SELECT '1' = cast(1 as double) FROM t; +SELECT '2' > cast(1 as double) FROM t; +SELECT '2' >= cast(1 as double) FROM t; +SELECT '2' < cast(1 as double) FROM t; +SELECT '2' <= cast(1 as double) FROM t; +SELECT '2' <> cast(1 as double) FROM t; +SELECT cast(null as string) = cast(1 as double) FROM t; +SELECT cast(null as string) > cast(1 as double) FROM t; +SELECT cast(null as string) >= cast(1 as double) FROM t; +SELECT cast(null as string) < cast(1 as double) FROM t; +SELECT cast(null as string) <= cast(1 as double) FROM t; +SELECT cast(null as string) <> cast(1 as double) FROM t; +SELECT cast(1 as float) = '1' FROM t; +SELECT cast(1 as float) > '2' FROM t; +SELECT cast(1 as float) >= '2' FROM t; +SELECT cast(1 as float) < '2' FROM t; +SELECT cast(1 as float) <= '2' FROM t; +SELECT cast(1 as float) <> '2' FROM t; +SELECT cast(1 as float) = cast(null as string) FROM t; +SELECT cast(1 as float) > cast(null as string) FROM t; +SELECT cast(1 as float) >= cast(null as string) FROM t; +SELECT cast(1 as float) < cast(null as string) FROM t; +SELECT cast(1 as float) <= cast(null as string) FROM t; +SELECT cast(1 as float) <> cast(null as string) FROM t; +SELECT '1' = cast(1 as float) FROM t; +SELECT '2' > cast(1 as float) FROM t; +SELECT '2' >= cast(1 as float) FROM t; +SELECT '2' < cast(1 as float) FROM t; +SELECT '2' <= cast(1 as float) FROM t; +SELECT '2' <> cast(1 as float) FROM t; +SELECT cast(null as string) = cast(1 as float) FROM t; +SELECT cast(null as string) > cast(1 as float) FROM t; +SELECT cast(null as string) >= cast(1 as float) FROM t; +SELECT cast(null as string) < cast(1 as float) FROM t; +SELECT cast(null as string) <= cast(1 as float) FROM t; +SELECT cast(null as string) <> cast(1 as float) FROM t; +-- the following queries return 1 if the search condition is satisfied +-- and returns nothing if the search condition is not satisfied +SELECT '1996-09-09' = date('1996-09-09') FROM t; +SELECT '1996-9-10' > date('1996-09-09') FROM t; +SELECT '1996-9-10' >= date('1996-09-09') FROM t; +SELECT '1996-9-10' < date('1996-09-09') FROM t; +SELECT '1996-9-10' <= date('1996-09-09') FROM t; +SELECT '1996-9-10' <> date('1996-09-09') FROM t; +SELECT cast(null as string) = date('1996-09-09') FROM t; +SELECT cast(null as string)> date('1996-09-09') FROM t; +SELECT cast(null as string)>= date('1996-09-09') FROM t; +SELECT cast(null as string)< date('1996-09-09') FROM t; +SELECT cast(null as string)<= date('1996-09-09') FROM t; +SELECT cast(null as string)<> date('1996-09-09') FROM t; +SELECT date('1996-09-09') = '1996-09-09' FROM t; +SELECT date('1996-9-10') > '1996-09-09' FROM t; +SELECT date('1996-9-10') >= '1996-09-09' FROM t; +SELECT date('1996-9-10') < '1996-09-09' FROM t; +SELECT date('1996-9-10') 
<= '1996-09-09' FROM t; +SELECT date('1996-9-10') <> '1996-09-09' FROM t; +SELECT date('1996-09-09') = cast(null as string) FROM t; +SELECT date('1996-9-10') > cast(null as string) FROM t; +SELECT date('1996-9-10') >= cast(null as string) FROM t; +SELECT date('1996-9-10') < cast(null as string) FROM t; +SELECT date('1996-9-10') <= cast(null as string) FROM t; +SELECT date('1996-9-10') <> cast(null as string) FROM t; +SELECT '1996-09-09 12:12:12.4' = timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT '1996-09-09 12:12:12.5' > timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT '1996-09-09 12:12:12.5' >= timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT '1996-09-09 12:12:12.5' < timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT '1996-09-09 12:12:12.5' <= timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT '1996-09-09 12:12:12.5' <> timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT cast(null as string) = timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT cast(null as string) > timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT cast(null as string) >= timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT cast(null as string) < timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT cast(null as string) <= timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT cast(null as string) <> timestamp('1996-09-09 12:12:12.4') FROM t; +SELECT timestamp('1996-09-09 12:12:12.4' )= '1996-09-09 12:12:12.4' FROM t; +SELECT timestamp('1996-09-09 12:12:12.5' )> '1996-09-09 12:12:12.4' FROM t; +SELECT timestamp('1996-09-09 12:12:12.5' )>= '1996-09-09 12:12:12.4' FROM t; +SELECT timestamp('1996-09-09 12:12:12.5' )< '1996-09-09 12:12:12.4' FROM t; +SELECT timestamp('1996-09-09 12:12:12.5' )<= '1996-09-09 12:12:12.4' FROM t; +SELECT timestamp('1996-09-09 12:12:12.5' )<> '1996-09-09 12:12:12.4' FROM t; +SELECT timestamp('1996-09-09 12:12:12.4' )= cast(null as string) FROM t; +SELECT timestamp('1996-09-09 12:12:12.5' )> cast(null as string) FROM t; +SELECT timestamp('1996-09-09 12:12:12.5' )>= cast(null as string) FROM t; +SELECT timestamp('1996-09-09 12:12:12.5' )< cast(null as string) FROM t; +SELECT timestamp('1996-09-09 12:12:12.5' )<= cast(null as string) FROM t; +SELECT timestamp('1996-09-09 12:12:12.5' )<> cast(null as string) FROM t; +SELECT ' ' = X'0020' FROM t; +SELECT ' ' > X'001F' FROM t; +SELECT ' ' >= X'001F' FROM t; +SELECT ' ' < X'001F' FROM t; +SELECT ' ' <= X'001F' FROM t; +SELECT ' ' <> X'001F' FROM t; +SELECT cast(null as string) = X'0020' FROM t; +SELECT cast(null as string) > X'001F' FROM t; +SELECT cast(null as string) >= X'001F' FROM t; +SELECT cast(null as string) < X'001F' FROM t; +SELECT cast(null as string) <= X'001F' FROM t; +SELECT cast(null as string) <> X'001F' FROM t; +SELECT X'0020' = ' ' FROM t; +SELECT X'001F' > ' ' FROM t; +SELECT X'001F' >= ' ' FROM t; +SELECT X'001F' < ' ' FROM t; +SELECT X'001F' <= ' ' FROM t; +SELECT X'001F' <> ' ' FROM t; +SELECT X'0020' = cast(null as string) FROM t; +SELECT X'001F' > cast(null as string) FROM t; +SELECT X'001F' >= cast(null as string) FROM t; +SELECT X'001F' < cast(null as string) FROM t; +SELECT X'001F' <= cast(null as string) FROM t; +SELECT X'001F' <> cast(null as string) FROM t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/implicitTypeCasts.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/implicitTypeCasts.sql new file mode 100644 index 0000000000000..58866f4b18112 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/implicitTypeCasts.sql @@ -0,0 +1,72 @@ +-- 
+-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +-- ImplicitTypeCasts + +CREATE TEMPORARY VIEW t AS SELECT 1; + +SELECT 1 + '2' FROM t; +SELECT 1 - '2' FROM t; +SELECT 1 * '2' FROM t; +SELECT 4 / '2' FROM t; +SELECT 1.1 + '2' FROM t; +SELECT 1.1 - '2' FROM t; +SELECT 1.1 * '2' FROM t; +SELECT 4.4 / '2' FROM t; +SELECT 1.1 + '2.2' FROM t; +SELECT 1.1 - '2.2' FROM t; +SELECT 1.1 * '2.2' FROM t; +SELECT 4.4 / '2.2' FROM t; + +-- concatenation +SELECT '$' || cast(1 as smallint) || '$' FROM t; +SELECT '$' || 1 || '$' FROM t; +SELECT '$' || cast(1 as bigint) || '$' FROM t; +SELECT '$' || cast(1.1 as float) || '$' FROM t; +SELECT '$' || cast(1.1 as double) || '$' FROM t; +SELECT '$' || 1.1 || '$' FROM t; +SELECT '$' || cast(1.1 as decimal(8,3)) || '$' FROM t; +SELECT '$' || 'abcd' || '$' FROM t; +SELECT '$' || date('1996-09-09') || '$' FROM t; +SELECT '$' || timestamp('1996-09-09 10:11:12.4' )|| '$' FROM t; + +-- length functions +SELECT length(cast(1 as smallint)) FROM t; +SELECT length(cast(1 as int)) FROM t; +SELECT length(cast(1 as bigint)) FROM t; +SELECT length(cast(1.1 as float)) FROM t; +SELECT length(cast(1.1 as double)) FROM t; +SELECT length(1.1) FROM t; +SELECT length(cast(1.1 as decimal(8,3))) FROM t; +SELECT length('four') FROM t; +SELECT length(date('1996-09-10')) FROM t; +SELECT length(timestamp('1996-09-10 10:11:12.4')) FROM t; + +-- extract +SELECT year( '1996-01-10') FROM t; +SELECT month( '1996-01-10') FROM t; +SELECT day( '1996-01-10') FROM t; +SELECT hour( '10:11:12') FROM t; +SELECT minute( '10:11:12') FROM t; +SELECT second( '10:11:12') FROM t; + +-- like +select 1 like '%' FROM t; +select date('1996-09-10') like '19%' FROM t; +select '1' like 1 FROM t; +select '1 ' like 1 FROM t; +select '1996-09-10' like date('1996-09-10') FROM t; diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out index 7b2f46f6c2a66..bbb6851e69c7e 100644 --- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out @@ -44,6 +44,7 @@ struct<> -- !query 4 output + -- !query 5 select current_date, current_timestamp from ttf1 -- !query 5 schema @@ -63,6 +64,7 @@ struct<> -- !query 6 output + -- !query 7 select current_date = current_date(), current_timestamp = current_timestamp(), a, b from ttf2 -- !query 7 schema diff --git a/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out index 8cd0d51da64f5..d51f6d37e4b41 100644 --- a/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out @@ -1,5 +1,5 @@ -- 
Automatically generated by SQLQueryTestSuite --- Number of queries: 31 +-- Number of queries: 32 -- !query 0 @@ -34,225 +34,225 @@ struct<(CAST(1.5 AS DOUBLE) = CAST(1.51 AS DOUBLE)):boolean> false --- !query 3 -select 1 > '1' --- !query 3 schema -struct<(1 > CAST(1 AS INT)):boolean> --- !query 3 output -false - - -- !query 4 -select 2 > '1.0' +select 1 > '1' -- !query 4 schema -struct<(2 > CAST(1.0 AS INT)):boolean> +struct<(1 > CAST(1 AS INT)):boolean> -- !query 4 output -true +false -- !query 5 -select 2 > '2.0' +select 2 > '1.0' -- !query 5 schema -struct<(2 > CAST(2.0 AS INT)):boolean> +struct<(2 > CAST(1.0 AS INT)):boolean> -- !query 5 output -false +true -- !query 6 -select 2 > '2.2' +select 2 > '2.0' -- !query 6 schema -struct<(2 > CAST(2.2 AS INT)):boolean> +struct<(2 > CAST(2.0 AS INT)):boolean> -- !query 6 output false -- !query 7 -select '1.5' > 0.5 +select 2 > '2.2' -- !query 7 schema -struct<(CAST(1.5 AS DOUBLE) > CAST(0.5 AS DOUBLE)):boolean> +struct<(2 > CAST(2.2 AS INT)):boolean> -- !query 7 output -true +false -- !query 8 -select to_date('2009-07-30 04:17:52') > to_date('2009-07-30 04:17:52') +select '1.5' > 0.5 -- !query 8 schema -struct<(to_date('2009-07-30 04:17:52') > to_date('2009-07-30 04:17:52')):boolean> +struct<(CAST(1.5 AS DOUBLE) > CAST(0.5 AS DOUBLE)):boolean> -- !query 8 output -false +true -- !query 9 -select to_date('2009-07-30 04:17:52') > '2009-07-30 04:17:52' +select to_date('2009-07-30 04:17:52') > to_date('2009-07-30 04:17:52') -- !query 9 schema -struct<(CAST(to_date('2009-07-30 04:17:52') AS STRING) > 2009-07-30 04:17:52):boolean> +struct<(to_date('2009-07-30 04:17:52') > to_date('2009-07-30 04:17:52')):boolean> -- !query 9 output false -- !query 10 -select 1 >= '1' +select to_date('2009-07-30 04:17:52') > '2009-07-30 04:17:52' -- !query 10 schema -struct<(1 >= CAST(1 AS INT)):boolean> +struct<(CAST(to_date('2009-07-30 04:17:52') AS STRING) > 2009-07-30 04:17:52):boolean> -- !query 10 output -true +false -- !query 11 -select 2 >= '1.0' +select 1 >= '1' -- !query 11 schema -struct<(2 >= CAST(1.0 AS INT)):boolean> +struct<(1 >= CAST(1 AS INT)):boolean> -- !query 11 output true -- !query 12 -select 2 >= '2.0' +select 2 >= '1.0' -- !query 12 schema -struct<(2 >= CAST(2.0 AS INT)):boolean> +struct<(2 >= CAST(1.0 AS INT)):boolean> -- !query 12 output true -- !query 13 -select 2.0 >= '2.2' +select 2 >= '2.0' -- !query 13 schema -struct<(CAST(2.0 AS DOUBLE) >= CAST(2.2 AS DOUBLE)):boolean> +struct<(2 >= CAST(2.0 AS INT)):boolean> -- !query 13 output -false +true -- !query 14 -select '1.5' >= 0.5 +select 2.0 >= '2.2' -- !query 14 schema -struct<(CAST(1.5 AS DOUBLE) >= CAST(0.5 AS DOUBLE)):boolean> +struct<(CAST(2.0 AS DOUBLE) >= CAST(2.2 AS DOUBLE)):boolean> -- !query 14 output -true +false -- !query 15 -select to_date('2009-07-30 04:17:52') >= to_date('2009-07-30 04:17:52') +select '1.5' >= 0.5 -- !query 15 schema -struct<(to_date('2009-07-30 04:17:52') >= to_date('2009-07-30 04:17:52')):boolean> +struct<(CAST(1.5 AS DOUBLE) >= CAST(0.5 AS DOUBLE)):boolean> -- !query 15 output true -- !query 16 -select to_date('2009-07-30 04:17:52') >= '2009-07-30 04:17:52' +select to_date('2009-07-30 04:17:52') >= to_date('2009-07-30 04:17:52') -- !query 16 schema -struct<(CAST(to_date('2009-07-30 04:17:52') AS STRING) >= 2009-07-30 04:17:52):boolean> +struct<(to_date('2009-07-30 04:17:52') >= to_date('2009-07-30 04:17:52')):boolean> -- !query 16 output -false +true -- !query 17 -select 1 < '1' +select to_date('2009-07-30 04:17:52') >= '2009-07-30 04:17:52' -- !query 17 
schema -struct<(1 < CAST(1 AS INT)):boolean> +struct<(CAST(to_date('2009-07-30 04:17:52') AS STRING) >= 2009-07-30 04:17:52):boolean> -- !query 17 output false -- !query 18 -select 2 < '1.0' +select 1 < '1' -- !query 18 schema -struct<(2 < CAST(1.0 AS INT)):boolean> +struct<(1 < CAST(1 AS INT)):boolean> -- !query 18 output false -- !query 19 -select 2 < '2.0' +select 2 < '1.0' -- !query 19 schema -struct<(2 < CAST(2.0 AS INT)):boolean> +struct<(2 < CAST(1.0 AS INT)):boolean> -- !query 19 output false -- !query 20 -select 2.0 < '2.2' +select 2 < '2.0' -- !query 20 schema -struct<(CAST(2.0 AS DOUBLE) < CAST(2.2 AS DOUBLE)):boolean> +struct<(2 < CAST(2.0 AS INT)):boolean> -- !query 20 output -true +false -- !query 21 -select 0.5 < '1.5' +select 2.0 < '2.2' -- !query 21 schema -struct<(CAST(0.5 AS DOUBLE) < CAST(1.5 AS DOUBLE)):boolean> +struct<(CAST(2.0 AS DOUBLE) < CAST(2.2 AS DOUBLE)):boolean> -- !query 21 output true -- !query 22 -select to_date('2009-07-30 04:17:52') < to_date('2009-07-30 04:17:52') +select 0.5 < '1.5' -- !query 22 schema -struct<(to_date('2009-07-30 04:17:52') < to_date('2009-07-30 04:17:52')):boolean> +struct<(CAST(0.5 AS DOUBLE) < CAST(1.5 AS DOUBLE)):boolean> -- !query 22 output -false +true -- !query 23 -select to_date('2009-07-30 04:17:52') < '2009-07-30 04:17:52' +select to_date('2009-07-30 04:17:52') < to_date('2009-07-30 04:17:52') -- !query 23 schema -struct<(CAST(to_date('2009-07-30 04:17:52') AS STRING) < 2009-07-30 04:17:52):boolean> +struct<(to_date('2009-07-30 04:17:52') < to_date('2009-07-30 04:17:52')):boolean> -- !query 23 output -true +false -- !query 24 -select 1 <= '1' +select to_date('2009-07-30 04:17:52') < '2009-07-30 04:17:52' -- !query 24 schema -struct<(1 <= CAST(1 AS INT)):boolean> +struct<(CAST(to_date('2009-07-30 04:17:52') AS STRING) < 2009-07-30 04:17:52):boolean> -- !query 24 output true -- !query 25 -select 2 <= '1.0' +select 1 <= '1' -- !query 25 schema -struct<(2 <= CAST(1.0 AS INT)):boolean> +struct<(1 <= CAST(1 AS INT)):boolean> -- !query 25 output -false +true -- !query 26 -select 2 <= '2.0' +select 2 <= '1.0' -- !query 26 schema -struct<(2 <= CAST(2.0 AS INT)):boolean> +struct<(2 <= CAST(1.0 AS INT)):boolean> -- !query 26 output -true +false -- !query 27 -select 2.0 <= '2.2' +select 2 <= '2.0' -- !query 27 schema -struct<(CAST(2.0 AS DOUBLE) <= CAST(2.2 AS DOUBLE)):boolean> +struct<(2 <= CAST(2.0 AS INT)):boolean> -- !query 27 output true -- !query 28 -select 0.5 <= '1.5' +select 2.0 <= '2.2' -- !query 28 schema -struct<(CAST(0.5 AS DOUBLE) <= CAST(1.5 AS DOUBLE)):boolean> +struct<(CAST(2.0 AS DOUBLE) <= CAST(2.2 AS DOUBLE)):boolean> -- !query 28 output true -- !query 29 -select to_date('2009-07-30 04:17:52') <= to_date('2009-07-30 04:17:52') +select 0.5 <= '1.5' -- !query 29 schema -struct<(to_date('2009-07-30 04:17:52') <= to_date('2009-07-30 04:17:52')):boolean> +struct<(CAST(0.5 AS DOUBLE) <= CAST(1.5 AS DOUBLE)):boolean> -- !query 29 output true -- !query 30 -select to_date('2009-07-30 04:17:52') <= '2009-07-30 04:17:52' +select to_date('2009-07-30 04:17:52') <= to_date('2009-07-30 04:17:52') -- !query 30 schema -struct<(CAST(to_date('2009-07-30 04:17:52') AS STRING) <= 2009-07-30 04:17:52):boolean> +struct<(to_date('2009-07-30 04:17:52') <= to_date('2009-07-30 04:17:52')):boolean> -- !query 30 output true + + +-- !query 31 +select to_date('2009-07-30 04:17:52') <= '2009-07-30 04:17:52' +-- !query 31 schema +struct<(CAST(to_date('2009-07-30 04:17:52') AS STRING) <= 2009-07-30 04:17:52):boolean> +-- !query 31 output +true diff 
--git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out new file mode 100644 index 0000000000000..fe7bde040707c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out @@ -0,0 +1,2146 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 265 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT cast(1 as binary) = '1' FROM t +-- !query 1 schema +struct<> +-- !query 1 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 2 +SELECT cast(1 as binary) > '2' FROM t +-- !query 2 schema +struct<> +-- !query 2 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 3 +SELECT cast(1 as binary) >= '2' FROM t +-- !query 3 schema +struct<> +-- !query 3 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 4 +SELECT cast(1 as binary) < '2' FROM t +-- !query 4 schema +struct<> +-- !query 4 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 5 +SELECT cast(1 as binary) <= '2' FROM t +-- !query 5 schema +struct<> +-- !query 5 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 6 +SELECT cast(1 as binary) <> '2' FROM t +-- !query 6 schema +struct<> +-- !query 6 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 7 +SELECT cast(1 as binary) = cast(null as string) FROM t +-- !query 7 schema +struct<> +-- !query 7 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 8 +SELECT cast(1 as binary) > cast(null as string) FROM t +-- !query 8 schema +struct<> +-- !query 8 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 9 +SELECT cast(1 as binary) >= cast(null as string) FROM t +-- !query 9 schema +struct<> +-- !query 9 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 10 +SELECT cast(1 as binary) < cast(null as string) FROM t +-- !query 10 schema +struct<> +-- !query 10 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 11 +SELECT cast(1 as binary) <= cast(null as string) FROM t +-- !query 11 schema +struct<> +-- !query 11 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 12 +SELECT 
cast(1 as binary) <> cast(null as string) FROM t +-- !query 12 schema +struct<> +-- !query 12 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 + + +-- !query 13 +SELECT '1' = cast(1 as binary) FROM t +-- !query 13 schema +struct<> +-- !query 13 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 13 + + +-- !query 14 +SELECT '2' > cast(1 as binary) FROM t +-- !query 14 schema +struct<> +-- !query 14 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 13 + + +-- !query 15 +SELECT '2' >= cast(1 as binary) FROM t +-- !query 15 schema +struct<> +-- !query 15 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 14 + + +-- !query 16 +SELECT '2' < cast(1 as binary) FROM t +-- !query 16 schema +struct<> +-- !query 16 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 13 + + +-- !query 17 +SELECT '2' <= cast(1 as binary) FROM t +-- !query 17 schema +struct<> +-- !query 17 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 14 + + +-- !query 18 +SELECT '2' <> cast(1 as binary) FROM t +-- !query 18 schema +struct<> +-- !query 18 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 14 + + +-- !query 19 +SELECT cast(null as string) = cast(1 as binary) FROM t +-- !query 19 schema +struct<> +-- !query 19 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 30 + + +-- !query 20 +SELECT cast(null as string) > cast(1 as binary) FROM t +-- !query 20 schema +struct<> +-- !query 20 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 30 + + +-- !query 21 +SELECT cast(null as string) >= cast(1 as binary) FROM t +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 31 + + +-- !query 22 +SELECT cast(null as string) < cast(1 as binary) FROM t +-- !query 22 schema +struct<> +-- !query 22 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 30 + + +-- !query 23 +SELECT cast(null as string) <= cast(1 as binary) FROM t +-- !query 23 schema +struct<> +-- !query 23 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 31 + + +-- !query 24 +SELECT cast(null as string) <> cast(1 as binary) FROM t +-- !query 24 schema +struct<> +-- !query 24 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 31 + + +-- 
!query 25 +SELECT cast(1 as tinyint) = '1' FROM t +-- !query 25 schema +struct<(CAST(1 AS TINYINT) = CAST(1 AS TINYINT)):boolean> +-- !query 25 output +true + + +-- !query 26 +SELECT cast(1 as tinyint) > '2' FROM t +-- !query 26 schema +struct<(CAST(1 AS TINYINT) > CAST(2 AS TINYINT)):boolean> +-- !query 26 output +false + + +-- !query 27 +SELECT cast(1 as tinyint) >= '2' FROM t +-- !query 27 schema +struct<(CAST(1 AS TINYINT) >= CAST(2 AS TINYINT)):boolean> +-- !query 27 output +false + + +-- !query 28 +SELECT cast(1 as tinyint) < '2' FROM t +-- !query 28 schema +struct<(CAST(1 AS TINYINT) < CAST(2 AS TINYINT)):boolean> +-- !query 28 output +true + + +-- !query 29 +SELECT cast(1 as tinyint) <= '2' FROM t +-- !query 29 schema +struct<(CAST(1 AS TINYINT) <= CAST(2 AS TINYINT)):boolean> +-- !query 29 output +true + + +-- !query 30 +SELECT cast(1 as tinyint) <> '2' FROM t +-- !query 30 schema +struct<(NOT (CAST(1 AS TINYINT) = CAST(2 AS TINYINT))):boolean> +-- !query 30 output +true + + +-- !query 31 +SELECT cast(1 as tinyint) = cast(null as string) FROM t +-- !query 31 schema +struct<(CAST(1 AS TINYINT) = CAST(CAST(NULL AS STRING) AS TINYINT)):boolean> +-- !query 31 output +NULL + + +-- !query 32 +SELECT cast(1 as tinyint) > cast(null as string) FROM t +-- !query 32 schema +struct<(CAST(1 AS TINYINT) > CAST(CAST(NULL AS STRING) AS TINYINT)):boolean> +-- !query 32 output +NULL + + +-- !query 33 +SELECT cast(1 as tinyint) >= cast(null as string) FROM t +-- !query 33 schema +struct<(CAST(1 AS TINYINT) >= CAST(CAST(NULL AS STRING) AS TINYINT)):boolean> +-- !query 33 output +NULL + + +-- !query 34 +SELECT cast(1 as tinyint) < cast(null as string) FROM t +-- !query 34 schema +struct<(CAST(1 AS TINYINT) < CAST(CAST(NULL AS STRING) AS TINYINT)):boolean> +-- !query 34 output +NULL + + +-- !query 35 +SELECT cast(1 as tinyint) <= cast(null as string) FROM t +-- !query 35 schema +struct<(CAST(1 AS TINYINT) <= CAST(CAST(NULL AS STRING) AS TINYINT)):boolean> +-- !query 35 output +NULL + + +-- !query 36 +SELECT cast(1 as tinyint) <> cast(null as string) FROM t +-- !query 36 schema +struct<(NOT (CAST(1 AS TINYINT) = CAST(CAST(NULL AS STRING) AS TINYINT))):boolean> +-- !query 36 output +NULL + + +-- !query 37 +SELECT '1' = cast(1 as tinyint) FROM t +-- !query 37 schema +struct<(CAST(1 AS TINYINT) = CAST(1 AS TINYINT)):boolean> +-- !query 37 output +true + + +-- !query 38 +SELECT '2' > cast(1 as tinyint) FROM t +-- !query 38 schema +struct<(CAST(2 AS TINYINT) > CAST(1 AS TINYINT)):boolean> +-- !query 38 output +true + + +-- !query 39 +SELECT '2' >= cast(1 as tinyint) FROM t +-- !query 39 schema +struct<(CAST(2 AS TINYINT) >= CAST(1 AS TINYINT)):boolean> +-- !query 39 output +true + + +-- !query 40 +SELECT '2' < cast(1 as tinyint) FROM t +-- !query 40 schema +struct<(CAST(2 AS TINYINT) < CAST(1 AS TINYINT)):boolean> +-- !query 40 output +false + + +-- !query 41 +SELECT '2' <= cast(1 as tinyint) FROM t +-- !query 41 schema +struct<(CAST(2 AS TINYINT) <= CAST(1 AS TINYINT)):boolean> +-- !query 41 output +false + + +-- !query 42 +SELECT '2' <> cast(1 as tinyint) FROM t +-- !query 42 schema +struct<(NOT (CAST(2 AS TINYINT) = CAST(1 AS TINYINT))):boolean> +-- !query 42 output +true + + +-- !query 43 +SELECT cast(null as string) = cast(1 as tinyint) FROM t +-- !query 43 schema +struct<(CAST(CAST(NULL AS STRING) AS TINYINT) = CAST(1 AS TINYINT)):boolean> +-- !query 43 output +NULL + + +-- !query 44 +SELECT cast(null as string) > cast(1 as tinyint) FROM t +-- !query 44 schema +struct<(CAST(CAST(NULL AS STRING) AS 
TINYINT) > CAST(1 AS TINYINT)):boolean> +-- !query 44 output +NULL + + +-- !query 45 +SELECT cast(null as string) >= cast(1 as tinyint) FROM t +-- !query 45 schema +struct<(CAST(CAST(NULL AS STRING) AS TINYINT) >= CAST(1 AS TINYINT)):boolean> +-- !query 45 output +NULL + + +-- !query 46 +SELECT cast(null as string) < cast(1 as tinyint) FROM t +-- !query 46 schema +struct<(CAST(CAST(NULL AS STRING) AS TINYINT) < CAST(1 AS TINYINT)):boolean> +-- !query 46 output +NULL + + +-- !query 47 +SELECT cast(null as string) <= cast(1 as tinyint) FROM t +-- !query 47 schema +struct<(CAST(CAST(NULL AS STRING) AS TINYINT) <= CAST(1 AS TINYINT)):boolean> +-- !query 47 output +NULL + + +-- !query 48 +SELECT cast(null as string) <> cast(1 as tinyint) FROM t +-- !query 48 schema +struct<(NOT (CAST(CAST(NULL AS STRING) AS TINYINT) = CAST(1 AS TINYINT))):boolean> +-- !query 48 output +NULL + + +-- !query 49 +SELECT cast(1 as smallint) = '1' FROM t +-- !query 49 schema +struct<(CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> +-- !query 49 output +true + + +-- !query 50 +SELECT cast(1 as smallint) > '2' FROM t +-- !query 50 schema +struct<(CAST(1 AS SMALLINT) > CAST(2 AS SMALLINT)):boolean> +-- !query 50 output +false + + +-- !query 51 +SELECT cast(1 as smallint) >= '2' FROM t +-- !query 51 schema +struct<(CAST(1 AS SMALLINT) >= CAST(2 AS SMALLINT)):boolean> +-- !query 51 output +false + + +-- !query 52 +SELECT cast(1 as smallint) < '2' FROM t +-- !query 52 schema +struct<(CAST(1 AS SMALLINT) < CAST(2 AS SMALLINT)):boolean> +-- !query 52 output +true + + +-- !query 53 +SELECT cast(1 as smallint) <= '2' FROM t +-- !query 53 schema +struct<(CAST(1 AS SMALLINT) <= CAST(2 AS SMALLINT)):boolean> +-- !query 53 output +true + + +-- !query 54 +SELECT cast(1 as smallint) <> '2' FROM t +-- !query 54 schema +struct<(NOT (CAST(1 AS SMALLINT) = CAST(2 AS SMALLINT))):boolean> +-- !query 54 output +true + + +-- !query 55 +SELECT cast(1 as smallint) = cast(null as string) FROM t +-- !query 55 schema +struct<(CAST(1 AS SMALLINT) = CAST(CAST(NULL AS STRING) AS SMALLINT)):boolean> +-- !query 55 output +NULL + + +-- !query 56 +SELECT cast(1 as smallint) > cast(null as string) FROM t +-- !query 56 schema +struct<(CAST(1 AS SMALLINT) > CAST(CAST(NULL AS STRING) AS SMALLINT)):boolean> +-- !query 56 output +NULL + + +-- !query 57 +SELECT cast(1 as smallint) >= cast(null as string) FROM t +-- !query 57 schema +struct<(CAST(1 AS SMALLINT) >= CAST(CAST(NULL AS STRING) AS SMALLINT)):boolean> +-- !query 57 output +NULL + + +-- !query 58 +SELECT cast(1 as smallint) < cast(null as string) FROM t +-- !query 58 schema +struct<(CAST(1 AS SMALLINT) < CAST(CAST(NULL AS STRING) AS SMALLINT)):boolean> +-- !query 58 output +NULL + + +-- !query 59 +SELECT cast(1 as smallint) <= cast(null as string) FROM t +-- !query 59 schema +struct<(CAST(1 AS SMALLINT) <= CAST(CAST(NULL AS STRING) AS SMALLINT)):boolean> +-- !query 59 output +NULL + + +-- !query 60 +SELECT cast(1 as smallint) <> cast(null as string) FROM t +-- !query 60 schema +struct<(NOT (CAST(1 AS SMALLINT) = CAST(CAST(NULL AS STRING) AS SMALLINT))):boolean> +-- !query 60 output +NULL + + +-- !query 61 +SELECT '1' = cast(1 as smallint) FROM t +-- !query 61 schema +struct<(CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> +-- !query 61 output +true + + +-- !query 62 +SELECT '2' > cast(1 as smallint) FROM t +-- !query 62 schema +struct<(CAST(2 AS SMALLINT) > CAST(1 AS SMALLINT)):boolean> +-- !query 62 output +true + + +-- !query 63 +SELECT '2' >= cast(1 as smallint) FROM t +-- !query 63 schema 
+struct<(CAST(2 AS SMALLINT) >= CAST(1 AS SMALLINT)):boolean> +-- !query 63 output +true + + +-- !query 64 +SELECT '2' < cast(1 as smallint) FROM t +-- !query 64 schema +struct<(CAST(2 AS SMALLINT) < CAST(1 AS SMALLINT)):boolean> +-- !query 64 output +false + + +-- !query 65 +SELECT '2' <= cast(1 as smallint) FROM t +-- !query 65 schema +struct<(CAST(2 AS SMALLINT) <= CAST(1 AS SMALLINT)):boolean> +-- !query 65 output +false + + +-- !query 66 +SELECT '2' <> cast(1 as smallint) FROM t +-- !query 66 schema +struct<(NOT (CAST(2 AS SMALLINT) = CAST(1 AS SMALLINT))):boolean> +-- !query 66 output +true + + +-- !query 67 +SELECT cast(null as string) = cast(1 as smallint) FROM t +-- !query 67 schema +struct<(CAST(CAST(NULL AS STRING) AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> +-- !query 67 output +NULL + + +-- !query 68 +SELECT cast(null as string) > cast(1 as smallint) FROM t +-- !query 68 schema +struct<(CAST(CAST(NULL AS STRING) AS SMALLINT) > CAST(1 AS SMALLINT)):boolean> +-- !query 68 output +NULL + + +-- !query 69 +SELECT cast(null as string) >= cast(1 as smallint) FROM t +-- !query 69 schema +struct<(CAST(CAST(NULL AS STRING) AS SMALLINT) >= CAST(1 AS SMALLINT)):boolean> +-- !query 69 output +NULL + + +-- !query 70 +SELECT cast(null as string) < cast(1 as smallint) FROM t +-- !query 70 schema +struct<(CAST(CAST(NULL AS STRING) AS SMALLINT) < CAST(1 AS SMALLINT)):boolean> +-- !query 70 output +NULL + + +-- !query 71 +SELECT cast(null as string) <= cast(1 as smallint) FROM t +-- !query 71 schema +struct<(CAST(CAST(NULL AS STRING) AS SMALLINT) <= CAST(1 AS SMALLINT)):boolean> +-- !query 71 output +NULL + + +-- !query 72 +SELECT cast(null as string) <> cast(1 as smallint) FROM t +-- !query 72 schema +struct<(NOT (CAST(CAST(NULL AS STRING) AS SMALLINT) = CAST(1 AS SMALLINT))):boolean> +-- !query 72 output +NULL + + +-- !query 73 +SELECT cast(1 as int) = '1' FROM t +-- !query 73 schema +struct<(CAST(1 AS INT) = CAST(1 AS INT)):boolean> +-- !query 73 output +true + + +-- !query 74 +SELECT cast(1 as int) > '2' FROM t +-- !query 74 schema +struct<(CAST(1 AS INT) > CAST(2 AS INT)):boolean> +-- !query 74 output +false + + +-- !query 75 +SELECT cast(1 as int) >= '2' FROM t +-- !query 75 schema +struct<(CAST(1 AS INT) >= CAST(2 AS INT)):boolean> +-- !query 75 output +false + + +-- !query 76 +SELECT cast(1 as int) < '2' FROM t +-- !query 76 schema +struct<(CAST(1 AS INT) < CAST(2 AS INT)):boolean> +-- !query 76 output +true + + +-- !query 77 +SELECT cast(1 as int) <= '2' FROM t +-- !query 77 schema +struct<(CAST(1 AS INT) <= CAST(2 AS INT)):boolean> +-- !query 77 output +true + + +-- !query 78 +SELECT cast(1 as int) <> '2' FROM t +-- !query 78 schema +struct<(NOT (CAST(1 AS INT) = CAST(2 AS INT))):boolean> +-- !query 78 output +true + + +-- !query 79 +SELECT cast(1 as int) = cast(null as string) FROM t +-- !query 79 schema +struct<(CAST(1 AS INT) = CAST(CAST(NULL AS STRING) AS INT)):boolean> +-- !query 79 output +NULL + + +-- !query 80 +SELECT cast(1 as int) > cast(null as string) FROM t +-- !query 80 schema +struct<(CAST(1 AS INT) > CAST(CAST(NULL AS STRING) AS INT)):boolean> +-- !query 80 output +NULL + + +-- !query 81 +SELECT cast(1 as int) >= cast(null as string) FROM t +-- !query 81 schema +struct<(CAST(1 AS INT) >= CAST(CAST(NULL AS STRING) AS INT)):boolean> +-- !query 81 output +NULL + + +-- !query 82 +SELECT cast(1 as int) < cast(null as string) FROM t +-- !query 82 schema +struct<(CAST(1 AS INT) < CAST(CAST(NULL AS STRING) AS INT)):boolean> +-- !query 82 output +NULL + + +-- !query 83 +SELECT 
cast(1 as int) <= cast(null as string) FROM t +-- !query 83 schema +struct<(CAST(1 AS INT) <= CAST(CAST(NULL AS STRING) AS INT)):boolean> +-- !query 83 output +NULL + + +-- !query 84 +SELECT cast(1 as int) <> cast(null as string) FROM t +-- !query 84 schema +struct<(NOT (CAST(1 AS INT) = CAST(CAST(NULL AS STRING) AS INT))):boolean> +-- !query 84 output +NULL + + +-- !query 85 +SELECT '1' = cast(1 as int) FROM t +-- !query 85 schema +struct<(CAST(1 AS INT) = CAST(1 AS INT)):boolean> +-- !query 85 output +true + + +-- !query 86 +SELECT '2' > cast(1 as int) FROM t +-- !query 86 schema +struct<(CAST(2 AS INT) > CAST(1 AS INT)):boolean> +-- !query 86 output +true + + +-- !query 87 +SELECT '2' >= cast(1 as int) FROM t +-- !query 87 schema +struct<(CAST(2 AS INT) >= CAST(1 AS INT)):boolean> +-- !query 87 output +true + + +-- !query 88 +SELECT '2' < cast(1 as int) FROM t +-- !query 88 schema +struct<(CAST(2 AS INT) < CAST(1 AS INT)):boolean> +-- !query 88 output +false + + +-- !query 89 +SELECT '2' <> cast(1 as int) FROM t +-- !query 89 schema +struct<(NOT (CAST(2 AS INT) = CAST(1 AS INT))):boolean> +-- !query 89 output +true + + +-- !query 90 +SELECT '2' <= cast(1 as int) FROM t +-- !query 90 schema +struct<(CAST(2 AS INT) <= CAST(1 AS INT)):boolean> +-- !query 90 output +false + + +-- !query 91 +SELECT cast(null as string) = cast(1 as int) FROM t +-- !query 91 schema +struct<(CAST(CAST(NULL AS STRING) AS INT) = CAST(1 AS INT)):boolean> +-- !query 91 output +NULL + + +-- !query 92 +SELECT cast(null as string) > cast(1 as int) FROM t +-- !query 92 schema +struct<(CAST(CAST(NULL AS STRING) AS INT) > CAST(1 AS INT)):boolean> +-- !query 92 output +NULL + + +-- !query 93 +SELECT cast(null as string) >= cast(1 as int) FROM t +-- !query 93 schema +struct<(CAST(CAST(NULL AS STRING) AS INT) >= CAST(1 AS INT)):boolean> +-- !query 93 output +NULL + + +-- !query 94 +SELECT cast(null as string) < cast(1 as int) FROM t +-- !query 94 schema +struct<(CAST(CAST(NULL AS STRING) AS INT) < CAST(1 AS INT)):boolean> +-- !query 94 output +NULL + + +-- !query 95 +SELECT cast(null as string) <> cast(1 as int) FROM t +-- !query 95 schema +struct<(NOT (CAST(CAST(NULL AS STRING) AS INT) = CAST(1 AS INT))):boolean> +-- !query 95 output +NULL + + +-- !query 96 +SELECT cast(null as string) <= cast(1 as int) FROM t +-- !query 96 schema +struct<(CAST(CAST(NULL AS STRING) AS INT) <= CAST(1 AS INT)):boolean> +-- !query 96 output +NULL + + +-- !query 97 +SELECT cast(1 as bigint) = '1' FROM t +-- !query 97 schema +struct<(CAST(1 AS BIGINT) = CAST(1 AS BIGINT)):boolean> +-- !query 97 output +true + + +-- !query 98 +SELECT cast(1 as bigint) > '2' FROM t +-- !query 98 schema +struct<(CAST(1 AS BIGINT) > CAST(2 AS BIGINT)):boolean> +-- !query 98 output +false + + +-- !query 99 +SELECT cast(1 as bigint) >= '2' FROM t +-- !query 99 schema +struct<(CAST(1 AS BIGINT) >= CAST(2 AS BIGINT)):boolean> +-- !query 99 output +false + + +-- !query 100 +SELECT cast(1 as bigint) < '2' FROM t +-- !query 100 schema +struct<(CAST(1 AS BIGINT) < CAST(2 AS BIGINT)):boolean> +-- !query 100 output +true + + +-- !query 101 +SELECT cast(1 as bigint) <= '2' FROM t +-- !query 101 schema +struct<(CAST(1 AS BIGINT) <= CAST(2 AS BIGINT)):boolean> +-- !query 101 output +true + + +-- !query 102 +SELECT cast(1 as bigint) <> '2' FROM t +-- !query 102 schema +struct<(NOT (CAST(1 AS BIGINT) = CAST(2 AS BIGINT))):boolean> +-- !query 102 output +true + + +-- !query 103 +SELECT cast(1 as bigint) = cast(null as string) FROM t +-- !query 103 schema +struct<(CAST(1 AS BIGINT) 
= CAST(CAST(NULL AS STRING) AS BIGINT)):boolean> +-- !query 103 output +NULL + + +-- !query 104 +SELECT cast(1 as bigint) > cast(null as string) FROM t +-- !query 104 schema +struct<(CAST(1 AS BIGINT) > CAST(CAST(NULL AS STRING) AS BIGINT)):boolean> +-- !query 104 output +NULL + + +-- !query 105 +SELECT cast(1 as bigint) >= cast(null as string) FROM t +-- !query 105 schema +struct<(CAST(1 AS BIGINT) >= CAST(CAST(NULL AS STRING) AS BIGINT)):boolean> +-- !query 105 output +NULL + + +-- !query 106 +SELECT cast(1 as bigint) < cast(null as string) FROM t +-- !query 106 schema +struct<(CAST(1 AS BIGINT) < CAST(CAST(NULL AS STRING) AS BIGINT)):boolean> +-- !query 106 output +NULL + + +-- !query 107 +SELECT cast(1 as bigint) <= cast(null as string) FROM t +-- !query 107 schema +struct<(CAST(1 AS BIGINT) <= CAST(CAST(NULL AS STRING) AS BIGINT)):boolean> +-- !query 107 output +NULL + + +-- !query 108 +SELECT cast(1 as bigint) <> cast(null as string) FROM t +-- !query 108 schema +struct<(NOT (CAST(1 AS BIGINT) = CAST(CAST(NULL AS STRING) AS BIGINT))):boolean> +-- !query 108 output +NULL + + +-- !query 109 +SELECT '1' = cast(1 as bigint) FROM t +-- !query 109 schema +struct<(CAST(1 AS BIGINT) = CAST(1 AS BIGINT)):boolean> +-- !query 109 output +true + + +-- !query 110 +SELECT '2' > cast(1 as bigint) FROM t +-- !query 110 schema +struct<(CAST(2 AS BIGINT) > CAST(1 AS BIGINT)):boolean> +-- !query 110 output +true + + +-- !query 111 +SELECT '2' >= cast(1 as bigint) FROM t +-- !query 111 schema +struct<(CAST(2 AS BIGINT) >= CAST(1 AS BIGINT)):boolean> +-- !query 111 output +true + + +-- !query 112 +SELECT '2' < cast(1 as bigint) FROM t +-- !query 112 schema +struct<(CAST(2 AS BIGINT) < CAST(1 AS BIGINT)):boolean> +-- !query 112 output +false + + +-- !query 113 +SELECT '2' <= cast(1 as bigint) FROM t +-- !query 113 schema +struct<(CAST(2 AS BIGINT) <= CAST(1 AS BIGINT)):boolean> +-- !query 113 output +false + + +-- !query 114 +SELECT '2' <> cast(1 as bigint) FROM t +-- !query 114 schema +struct<(NOT (CAST(2 AS BIGINT) = CAST(1 AS BIGINT))):boolean> +-- !query 114 output +true + + +-- !query 115 +SELECT cast(null as string) = cast(1 as bigint) FROM t +-- !query 115 schema +struct<(CAST(CAST(NULL AS STRING) AS BIGINT) = CAST(1 AS BIGINT)):boolean> +-- !query 115 output +NULL + + +-- !query 116 +SELECT cast(null as string) > cast(1 as bigint) FROM t +-- !query 116 schema +struct<(CAST(CAST(NULL AS STRING) AS BIGINT) > CAST(1 AS BIGINT)):boolean> +-- !query 116 output +NULL + + +-- !query 117 +SELECT cast(null as string) >= cast(1 as bigint) FROM t +-- !query 117 schema +struct<(CAST(CAST(NULL AS STRING) AS BIGINT) >= CAST(1 AS BIGINT)):boolean> +-- !query 117 output +NULL + + +-- !query 118 +SELECT cast(null as string) < cast(1 as bigint) FROM t +-- !query 118 schema +struct<(CAST(CAST(NULL AS STRING) AS BIGINT) < CAST(1 AS BIGINT)):boolean> +-- !query 118 output +NULL + + +-- !query 119 +SELECT cast(null as string) <= cast(1 as bigint) FROM t +-- !query 119 schema +struct<(CAST(CAST(NULL AS STRING) AS BIGINT) <= CAST(1 AS BIGINT)):boolean> +-- !query 119 output +NULL + + +-- !query 120 +SELECT cast(null as string) <> cast(1 as bigint) FROM t +-- !query 120 schema +struct<(NOT (CAST(CAST(NULL AS STRING) AS BIGINT) = CAST(1 AS BIGINT))):boolean> +-- !query 120 output +NULL + + +-- !query 121 +SELECT cast(1 as decimal(10, 0)) = '1' FROM t +-- !query 121 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 121 output +true + + +-- !query 122 +SELECT cast(1 as 
decimal(10, 0)) > '2' FROM t +-- !query 122 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(2 AS DOUBLE)):boolean> +-- !query 122 output +false + + +-- !query 123 +SELECT cast(1 as decimal(10, 0)) >= '2' FROM t +-- !query 123 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(2 AS DOUBLE)):boolean> +-- !query 123 output +false + + +-- !query 124 +SELECT cast(1 as decimal(10, 0)) < '2' FROM t +-- !query 124 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(2 AS DOUBLE)):boolean> +-- !query 124 output +true + + +-- !query 125 +SELECT cast(1 as decimal(10, 0)) <> '2' FROM t +-- !query 125 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(2 AS DOUBLE))):boolean> +-- !query 125 output +true + + +-- !query 126 +SELECT cast(1 as decimal(10, 0)) <= '2' FROM t +-- !query 126 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(2 AS DOUBLE)):boolean> +-- !query 126 output +true + + +-- !query 127 +SELECT cast(1 as decimal(10, 0)) = cast(null as string) FROM t +-- !query 127 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> +-- !query 127 output +NULL + + +-- !query 128 +SELECT cast(1 as decimal(10, 0)) > cast(null as string) FROM t +-- !query 128 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> +-- !query 128 output +NULL + + +-- !query 129 +SELECT cast(1 as decimal(10, 0)) >= cast(null as string) FROM t +-- !query 129 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> +-- !query 129 output +NULL + + +-- !query 130 +SELECT cast(1 as decimal(10, 0)) < cast(null as string) FROM t +-- !query 130 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> +-- !query 130 output +NULL + + +-- !query 131 +SELECT cast(1 as decimal(10, 0)) <> cast(null as string) FROM t +-- !query 131 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(NULL AS STRING) AS DOUBLE))):boolean> +-- !query 131 output +NULL + + +-- !query 132 +SELECT cast(1 as decimal(10, 0)) <= cast(null as string) FROM t +-- !query 132 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> +-- !query 132 output +NULL + + +-- !query 133 +SELECT '1' = cast(1 as decimal(10, 0)) FROM t +-- !query 133 schema +struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 133 output +true + + +-- !query 134 +SELECT '2' > cast(1 as decimal(10, 0)) FROM t +-- !query 134 schema +struct<(CAST(2 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 134 output +true + + +-- !query 135 +SELECT '2' >= cast(1 as decimal(10, 0)) FROM t +-- !query 135 schema +struct<(CAST(2 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 135 output +true + + +-- !query 136 +SELECT '2' < cast(1 as decimal(10, 0)) FROM t +-- !query 136 schema +struct<(CAST(2 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 136 output +false + + +-- !query 137 +SELECT '2' <= cast(1 as decimal(10, 0)) FROM t +-- !query 137 schema +struct<(CAST(2 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 137 output +false + + +-- !query 138 +SELECT '2' <> cast(1 as decimal(10, 0)) FROM t +-- !query 138 schema +struct<(NOT (CAST(2 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> +-- !query 138 output +true + + 
+-- !query 139 +SELECT cast(null as string) = cast(1 as decimal(10, 0)) FROM t +-- !query 139 schema +struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 139 output +NULL + + +-- !query 140 +SELECT cast(null as string) > cast(1 as decimal(10, 0)) FROM t +-- !query 140 schema +struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) > CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 140 output +NULL + + +-- !query 141 +SELECT cast(null as string) >= cast(1 as decimal(10, 0)) FROM t +-- !query 141 schema +struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 141 output +NULL + + +-- !query 142 +SELECT cast(null as string) < cast(1 as decimal(10, 0)) FROM t +-- !query 142 schema +struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) < CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 142 output +NULL + + +-- !query 143 +SELECT cast(null as string) <= cast(1 as decimal(10, 0)) FROM t +-- !query 143 schema +struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 143 output +NULL + + +-- !query 144 +SELECT cast(null as string) <> cast(1 as decimal(10, 0)) FROM t +-- !query 144 schema +struct<(NOT (CAST(CAST(NULL AS STRING) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> +-- !query 144 output +NULL + + +-- !query 145 +SELECT cast(1 as double) = '1' FROM t +-- !query 145 schema +struct<(CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 145 output +true + + +-- !query 146 +SELECT cast(1 as double) > '2' FROM t +-- !query 146 schema +struct<(CAST(1 AS DOUBLE) > CAST(2 AS DOUBLE)):boolean> +-- !query 146 output +false + + +-- !query 147 +SELECT cast(1 as double) >= '2' FROM t +-- !query 147 schema +struct<(CAST(1 AS DOUBLE) >= CAST(2 AS DOUBLE)):boolean> +-- !query 147 output +false + + +-- !query 148 +SELECT cast(1 as double) < '2' FROM t +-- !query 148 schema +struct<(CAST(1 AS DOUBLE) < CAST(2 AS DOUBLE)):boolean> +-- !query 148 output +true + + +-- !query 149 +SELECT cast(1 as double) <= '2' FROM t +-- !query 149 schema +struct<(CAST(1 AS DOUBLE) <= CAST(2 AS DOUBLE)):boolean> +-- !query 149 output +true + + +-- !query 150 +SELECT cast(1 as double) <> '2' FROM t +-- !query 150 schema +struct<(NOT (CAST(1 AS DOUBLE) = CAST(2 AS DOUBLE))):boolean> +-- !query 150 output +true + + +-- !query 151 +SELECT cast(1 as double) = cast(null as string) FROM t +-- !query 151 schema +struct<(CAST(1 AS DOUBLE) = CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> +-- !query 151 output +NULL + + +-- !query 152 +SELECT cast(1 as double) > cast(null as string) FROM t +-- !query 152 schema +struct<(CAST(1 AS DOUBLE) > CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> +-- !query 152 output +NULL + + +-- !query 153 +SELECT cast(1 as double) >= cast(null as string) FROM t +-- !query 153 schema +struct<(CAST(1 AS DOUBLE) >= CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> +-- !query 153 output +NULL + + +-- !query 154 +SELECT cast(1 as double) < cast(null as string) FROM t +-- !query 154 schema +struct<(CAST(1 AS DOUBLE) < CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> +-- !query 154 output +NULL + + +-- !query 155 +SELECT cast(1 as double) <= cast(null as string) FROM t +-- !query 155 schema +struct<(CAST(1 AS DOUBLE) <= CAST(CAST(NULL AS STRING) AS DOUBLE)):boolean> +-- !query 155 output +NULL + + +-- !query 156 +SELECT cast(1 as double) <> cast(null as string) FROM t +-- !query 156 schema +struct<(NOT (CAST(1 AS DOUBLE) = 
CAST(CAST(NULL AS STRING) AS DOUBLE))):boolean> +-- !query 156 output +NULL + + +-- !query 157 +SELECT '1' = cast(1 as double) FROM t +-- !query 157 schema +struct<(CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 157 output +true + + +-- !query 158 +SELECT '2' > cast(1 as double) FROM t +-- !query 158 schema +struct<(CAST(2 AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> +-- !query 158 output +true + + +-- !query 159 +SELECT '2' >= cast(1 as double) FROM t +-- !query 159 schema +struct<(CAST(2 AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> +-- !query 159 output +true + + +-- !query 160 +SELECT '2' < cast(1 as double) FROM t +-- !query 160 schema +struct<(CAST(2 AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> +-- !query 160 output +false + + +-- !query 161 +SELECT '2' <= cast(1 as double) FROM t +-- !query 161 schema +struct<(CAST(2 AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> +-- !query 161 output +false + + +-- !query 162 +SELECT '2' <> cast(1 as double) FROM t +-- !query 162 schema +struct<(NOT (CAST(2 AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> +-- !query 162 output +true + + +-- !query 163 +SELECT cast(null as string) = cast(1 as double) FROM t +-- !query 163 schema +struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 163 output +NULL + + +-- !query 164 +SELECT cast(null as string) > cast(1 as double) FROM t +-- !query 164 schema +struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> +-- !query 164 output +NULL + + +-- !query 165 +SELECT cast(null as string) >= cast(1 as double) FROM t +-- !query 165 schema +struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> +-- !query 165 output +NULL + + +-- !query 166 +SELECT cast(null as string) < cast(1 as double) FROM t +-- !query 166 schema +struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> +-- !query 166 output +NULL + + +-- !query 167 +SELECT cast(null as string) <= cast(1 as double) FROM t +-- !query 167 schema +struct<(CAST(CAST(NULL AS STRING) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> +-- !query 167 output +NULL + + +-- !query 168 +SELECT cast(null as string) <> cast(1 as double) FROM t +-- !query 168 schema +struct<(NOT (CAST(CAST(NULL AS STRING) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> +-- !query 168 output +NULL + + +-- !query 169 +SELECT cast(1 as float) = '1' FROM t +-- !query 169 schema +struct<(CAST(1 AS FLOAT) = CAST(1 AS FLOAT)):boolean> +-- !query 169 output +true + + +-- !query 170 +SELECT cast(1 as float) > '2' FROM t +-- !query 170 schema +struct<(CAST(1 AS FLOAT) > CAST(2 AS FLOAT)):boolean> +-- !query 170 output +false + + +-- !query 171 +SELECT cast(1 as float) >= '2' FROM t +-- !query 171 schema +struct<(CAST(1 AS FLOAT) >= CAST(2 AS FLOAT)):boolean> +-- !query 171 output +false + + +-- !query 172 +SELECT cast(1 as float) < '2' FROM t +-- !query 172 schema +struct<(CAST(1 AS FLOAT) < CAST(2 AS FLOAT)):boolean> +-- !query 172 output +true + + +-- !query 173 +SELECT cast(1 as float) <= '2' FROM t +-- !query 173 schema +struct<(CAST(1 AS FLOAT) <= CAST(2 AS FLOAT)):boolean> +-- !query 173 output +true + + +-- !query 174 +SELECT cast(1 as float) <> '2' FROM t +-- !query 174 schema +struct<(NOT (CAST(1 AS FLOAT) = CAST(2 AS FLOAT))):boolean> +-- !query 174 output +true + + +-- !query 175 +SELECT cast(1 as float) = cast(null as string) FROM t +-- !query 175 schema +struct<(CAST(1 AS FLOAT) = CAST(CAST(NULL AS STRING) AS FLOAT)):boolean> +-- !query 175 output +NULL + + +-- !query 176 +SELECT cast(1 as float) > cast(null as string) FROM t 
+-- !query 176 schema +struct<(CAST(1 AS FLOAT) > CAST(CAST(NULL AS STRING) AS FLOAT)):boolean> +-- !query 176 output +NULL + + +-- !query 177 +SELECT cast(1 as float) >= cast(null as string) FROM t +-- !query 177 schema +struct<(CAST(1 AS FLOAT) >= CAST(CAST(NULL AS STRING) AS FLOAT)):boolean> +-- !query 177 output +NULL + + +-- !query 178 +SELECT cast(1 as float) < cast(null as string) FROM t +-- !query 178 schema +struct<(CAST(1 AS FLOAT) < CAST(CAST(NULL AS STRING) AS FLOAT)):boolean> +-- !query 178 output +NULL + + +-- !query 179 +SELECT cast(1 as float) <= cast(null as string) FROM t +-- !query 179 schema +struct<(CAST(1 AS FLOAT) <= CAST(CAST(NULL AS STRING) AS FLOAT)):boolean> +-- !query 179 output +NULL + + +-- !query 180 +SELECT cast(1 as float) <> cast(null as string) FROM t +-- !query 180 schema +struct<(NOT (CAST(1 AS FLOAT) = CAST(CAST(NULL AS STRING) AS FLOAT))):boolean> +-- !query 180 output +NULL + + +-- !query 181 +SELECT '1' = cast(1 as float) FROM t +-- !query 181 schema +struct<(CAST(1 AS FLOAT) = CAST(1 AS FLOAT)):boolean> +-- !query 181 output +true + + +-- !query 182 +SELECT '2' > cast(1 as float) FROM t +-- !query 182 schema +struct<(CAST(2 AS FLOAT) > CAST(1 AS FLOAT)):boolean> +-- !query 182 output +true + + +-- !query 183 +SELECT '2' >= cast(1 as float) FROM t +-- !query 183 schema +struct<(CAST(2 AS FLOAT) >= CAST(1 AS FLOAT)):boolean> +-- !query 183 output +true + + +-- !query 184 +SELECT '2' < cast(1 as float) FROM t +-- !query 184 schema +struct<(CAST(2 AS FLOAT) < CAST(1 AS FLOAT)):boolean> +-- !query 184 output +false + + +-- !query 185 +SELECT '2' <= cast(1 as float) FROM t +-- !query 185 schema +struct<(CAST(2 AS FLOAT) <= CAST(1 AS FLOAT)):boolean> +-- !query 185 output +false + + +-- !query 186 +SELECT '2' <> cast(1 as float) FROM t +-- !query 186 schema +struct<(NOT (CAST(2 AS FLOAT) = CAST(1 AS FLOAT))):boolean> +-- !query 186 output +true + + +-- !query 187 +SELECT cast(null as string) = cast(1 as float) FROM t +-- !query 187 schema +struct<(CAST(CAST(NULL AS STRING) AS FLOAT) = CAST(1 AS FLOAT)):boolean> +-- !query 187 output +NULL + + +-- !query 188 +SELECT cast(null as string) > cast(1 as float) FROM t +-- !query 188 schema +struct<(CAST(CAST(NULL AS STRING) AS FLOAT) > CAST(1 AS FLOAT)):boolean> +-- !query 188 output +NULL + + +-- !query 189 +SELECT cast(null as string) >= cast(1 as float) FROM t +-- !query 189 schema +struct<(CAST(CAST(NULL AS STRING) AS FLOAT) >= CAST(1 AS FLOAT)):boolean> +-- !query 189 output +NULL + + +-- !query 190 +SELECT cast(null as string) < cast(1 as float) FROM t +-- !query 190 schema +struct<(CAST(CAST(NULL AS STRING) AS FLOAT) < CAST(1 AS FLOAT)):boolean> +-- !query 190 output +NULL + + +-- !query 191 +SELECT cast(null as string) <= cast(1 as float) FROM t +-- !query 191 schema +struct<(CAST(CAST(NULL AS STRING) AS FLOAT) <= CAST(1 AS FLOAT)):boolean> +-- !query 191 output +NULL + + +-- !query 192 +SELECT cast(null as string) <> cast(1 as float) FROM t +-- !query 192 schema +struct<(NOT (CAST(CAST(NULL AS STRING) AS FLOAT) = CAST(1 AS FLOAT))):boolean> +-- !query 192 output +NULL + + +-- !query 193 +SELECT '1996-09-09' = date('1996-09-09') FROM t +-- !query 193 schema +struct<(1996-09-09 = CAST(CAST(1996-09-09 AS DATE) AS STRING)):boolean> +-- !query 193 output +true + + +-- !query 194 +SELECT '1996-9-10' > date('1996-09-09') FROM t +-- !query 194 schema +struct<(1996-9-10 > CAST(CAST(1996-09-09 AS DATE) AS STRING)):boolean> +-- !query 194 output +true + + +-- !query 195 +SELECT '1996-9-10' >= date('1996-09-09') 
FROM t +-- !query 195 schema +struct<(1996-9-10 >= CAST(CAST(1996-09-09 AS DATE) AS STRING)):boolean> +-- !query 195 output +true + + +-- !query 196 +SELECT '1996-9-10' < date('1996-09-09') FROM t +-- !query 196 schema +struct<(1996-9-10 < CAST(CAST(1996-09-09 AS DATE) AS STRING)):boolean> +-- !query 196 output +false + + +-- !query 197 +SELECT '1996-9-10' <= date('1996-09-09') FROM t +-- !query 197 schema +struct<(1996-9-10 <= CAST(CAST(1996-09-09 AS DATE) AS STRING)):boolean> +-- !query 197 output +false + + +-- !query 198 +SELECT '1996-9-10' <> date('1996-09-09') FROM t +-- !query 198 schema +struct<(NOT (1996-9-10 = CAST(CAST(1996-09-09 AS DATE) AS STRING))):boolean> +-- !query 198 output +true + + +-- !query 199 +SELECT cast(null as string) = date('1996-09-09') FROM t +-- !query 199 schema +struct<(CAST(NULL AS STRING) = CAST(CAST(1996-09-09 AS DATE) AS STRING)):boolean> +-- !query 199 output +NULL + + +-- !query 200 +SELECT cast(null as string)> date('1996-09-09') FROM t +-- !query 200 schema +struct<(CAST(NULL AS STRING) > CAST(CAST(1996-09-09 AS DATE) AS STRING)):boolean> +-- !query 200 output +NULL + + +-- !query 201 +SELECT cast(null as string)>= date('1996-09-09') FROM t +-- !query 201 schema +struct<(CAST(NULL AS STRING) >= CAST(CAST(1996-09-09 AS DATE) AS STRING)):boolean> +-- !query 201 output +NULL + + +-- !query 202 +SELECT cast(null as string)< date('1996-09-09') FROM t +-- !query 202 schema +struct<(CAST(NULL AS STRING) < CAST(CAST(1996-09-09 AS DATE) AS STRING)):boolean> +-- !query 202 output +NULL + + +-- !query 203 +SELECT cast(null as string)<= date('1996-09-09') FROM t +-- !query 203 schema +struct<(CAST(NULL AS STRING) <= CAST(CAST(1996-09-09 AS DATE) AS STRING)):boolean> +-- !query 203 output +NULL + + +-- !query 204 +SELECT cast(null as string)<> date('1996-09-09') FROM t +-- !query 204 schema +struct<(NOT (CAST(NULL AS STRING) = CAST(CAST(1996-09-09 AS DATE) AS STRING))):boolean> +-- !query 204 output +NULL + + +-- !query 205 +SELECT date('1996-09-09') = '1996-09-09' FROM t +-- !query 205 schema +struct<(CAST(CAST(1996-09-09 AS DATE) AS STRING) = 1996-09-09):boolean> +-- !query 205 output +true + + +-- !query 206 +SELECT date('1996-9-10') > '1996-09-09' FROM t +-- !query 206 schema +struct<(CAST(CAST(1996-9-10 AS DATE) AS STRING) > 1996-09-09):boolean> +-- !query 206 output +true + + +-- !query 207 +SELECT date('1996-9-10') >= '1996-09-09' FROM t +-- !query 207 schema +struct<(CAST(CAST(1996-9-10 AS DATE) AS STRING) >= 1996-09-09):boolean> +-- !query 207 output +true + + +-- !query 208 +SELECT date('1996-9-10') < '1996-09-09' FROM t +-- !query 208 schema +struct<(CAST(CAST(1996-9-10 AS DATE) AS STRING) < 1996-09-09):boolean> +-- !query 208 output +false + + +-- !query 209 +SELECT date('1996-9-10') <= '1996-09-09' FROM t +-- !query 209 schema +struct<(CAST(CAST(1996-9-10 AS DATE) AS STRING) <= 1996-09-09):boolean> +-- !query 209 output +false + + +-- !query 210 +SELECT date('1996-9-10') <> '1996-09-09' FROM t +-- !query 210 schema +struct<(NOT (CAST(CAST(1996-9-10 AS DATE) AS STRING) = 1996-09-09)):boolean> +-- !query 210 output +true + + +-- !query 211 +SELECT date('1996-09-09') = cast(null as string) FROM t +-- !query 211 schema +struct<(CAST(CAST(1996-09-09 AS DATE) AS STRING) = CAST(NULL AS STRING)):boolean> +-- !query 211 output +NULL + + +-- !query 212 +SELECT date('1996-9-10') > cast(null as string) FROM t +-- !query 212 schema +struct<(CAST(CAST(1996-9-10 AS DATE) AS STRING) > CAST(NULL AS STRING)):boolean> +-- !query 212 output +NULL + + +-- !query 213 
+SELECT date('1996-9-10') >= cast(null as string) FROM t +-- !query 213 schema +struct<(CAST(CAST(1996-9-10 AS DATE) AS STRING) >= CAST(NULL AS STRING)):boolean> +-- !query 213 output +NULL + + +-- !query 214 +SELECT date('1996-9-10') < cast(null as string) FROM t +-- !query 214 schema +struct<(CAST(CAST(1996-9-10 AS DATE) AS STRING) < CAST(NULL AS STRING)):boolean> +-- !query 214 output +NULL + + +-- !query 215 +SELECT date('1996-9-10') <= cast(null as string) FROM t +-- !query 215 schema +struct<(CAST(CAST(1996-9-10 AS DATE) AS STRING) <= CAST(NULL AS STRING)):boolean> +-- !query 215 output +NULL + + +-- !query 216 +SELECT date('1996-9-10') <> cast(null as string) FROM t +-- !query 216 schema +struct<(NOT (CAST(CAST(1996-9-10 AS DATE) AS STRING) = CAST(NULL AS STRING))):boolean> +-- !query 216 output +NULL + + +-- !query 217 +SELECT '1996-09-09 12:12:12.4' = timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 217 schema +struct<(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> +-- !query 217 output +true + + +-- !query 218 +SELECT '1996-09-09 12:12:12.5' > timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 218 schema +struct<(1996-09-09 12:12:12.5 > CAST(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) AS STRING)):boolean> +-- !query 218 output +true + + +-- !query 219 +SELECT '1996-09-09 12:12:12.5' >= timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 219 schema +struct<(1996-09-09 12:12:12.5 >= CAST(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) AS STRING)):boolean> +-- !query 219 output +true + + +-- !query 220 +SELECT '1996-09-09 12:12:12.5' < timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 220 schema +struct<(1996-09-09 12:12:12.5 < CAST(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) AS STRING)):boolean> +-- !query 220 output +false + + +-- !query 221 +SELECT '1996-09-09 12:12:12.5' <= timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 221 schema +struct<(1996-09-09 12:12:12.5 <= CAST(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) AS STRING)):boolean> +-- !query 221 output +false + + +-- !query 222 +SELECT '1996-09-09 12:12:12.5' <> timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 222 schema +struct<(NOT (CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP))):boolean> +-- !query 222 output +true + + +-- !query 223 +SELECT cast(null as string) = timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 223 schema +struct<(CAST(CAST(NULL AS STRING) AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> +-- !query 223 output +NULL + + +-- !query 224 +SELECT cast(null as string) > timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 224 schema +struct<(CAST(NULL AS STRING) > CAST(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) AS STRING)):boolean> +-- !query 224 output +NULL + + +-- !query 225 +SELECT cast(null as string) >= timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 225 schema +struct<(CAST(NULL AS STRING) >= CAST(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) AS STRING)):boolean> +-- !query 225 output +NULL + + +-- !query 226 +SELECT cast(null as string) < timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 226 schema +struct<(CAST(NULL AS STRING) < CAST(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) AS STRING)):boolean> +-- !query 226 output +NULL + + +-- !query 227 +SELECT cast(null as string) <= timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 227 schema +struct<(CAST(NULL AS STRING) <= CAST(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) AS STRING)):boolean> +-- !query 227 output +NULL + + +-- !query 228 +SELECT 
cast(null as string) <> timestamp('1996-09-09 12:12:12.4') FROM t +-- !query 228 schema +struct<(NOT (CAST(CAST(NULL AS STRING) AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP))):boolean> +-- !query 228 output +NULL + + +-- !query 229 +SELECT timestamp('1996-09-09 12:12:12.4' )= '1996-09-09 12:12:12.4' FROM t +-- !query 229 schema +struct<(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP)):boolean> +-- !query 229 output +true + + +-- !query 230 +SELECT timestamp('1996-09-09 12:12:12.5' )> '1996-09-09 12:12:12.4' FROM t +-- !query 230 schema +struct<(CAST(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) AS STRING) > 1996-09-09 12:12:12.4):boolean> +-- !query 230 output +true + + +-- !query 231 +SELECT timestamp('1996-09-09 12:12:12.5' )>= '1996-09-09 12:12:12.4' FROM t +-- !query 231 schema +struct<(CAST(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) AS STRING) >= 1996-09-09 12:12:12.4):boolean> +-- !query 231 output +true + + +-- !query 232 +SELECT timestamp('1996-09-09 12:12:12.5' )< '1996-09-09 12:12:12.4' FROM t +-- !query 232 schema +struct<(CAST(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) AS STRING) < 1996-09-09 12:12:12.4):boolean> +-- !query 232 output +false + + +-- !query 233 +SELECT timestamp('1996-09-09 12:12:12.5' )<= '1996-09-09 12:12:12.4' FROM t +-- !query 233 schema +struct<(CAST(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) AS STRING) <= 1996-09-09 12:12:12.4):boolean> +-- !query 233 output +false + + +-- !query 234 +SELECT timestamp('1996-09-09 12:12:12.5' )<> '1996-09-09 12:12:12.4' FROM t +-- !query 234 schema +struct<(NOT (CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) = CAST(1996-09-09 12:12:12.4 AS TIMESTAMP))):boolean> +-- !query 234 output +true + + +-- !query 235 +SELECT timestamp('1996-09-09 12:12:12.4' )= cast(null as string) FROM t +-- !query 235 schema +struct<(CAST(1996-09-09 12:12:12.4 AS TIMESTAMP) = CAST(CAST(NULL AS STRING) AS TIMESTAMP)):boolean> +-- !query 235 output +NULL + + +-- !query 236 +SELECT timestamp('1996-09-09 12:12:12.5' )> cast(null as string) FROM t +-- !query 236 schema +struct<(CAST(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) AS STRING) > CAST(NULL AS STRING)):boolean> +-- !query 236 output +NULL + + +-- !query 237 +SELECT timestamp('1996-09-09 12:12:12.5' )>= cast(null as string) FROM t +-- !query 237 schema +struct<(CAST(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) AS STRING) >= CAST(NULL AS STRING)):boolean> +-- !query 237 output +NULL + + +-- !query 238 +SELECT timestamp('1996-09-09 12:12:12.5' )< cast(null as string) FROM t +-- !query 238 schema +struct<(CAST(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) AS STRING) < CAST(NULL AS STRING)):boolean> +-- !query 238 output +NULL + + +-- !query 239 +SELECT timestamp('1996-09-09 12:12:12.5' )<= cast(null as string) FROM t +-- !query 239 schema +struct<(CAST(CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) AS STRING) <= CAST(NULL AS STRING)):boolean> +-- !query 239 output +NULL + + +-- !query 240 +SELECT timestamp('1996-09-09 12:12:12.5' )<> cast(null as string) FROM t +-- !query 240 schema +struct<(NOT (CAST(1996-09-09 12:12:12.5 AS TIMESTAMP) = CAST(CAST(NULL AS STRING) AS TIMESTAMP))):boolean> +-- !query 240 output +NULL + + +-- !query 241 +SELECT ' ' = X'0020' FROM t +-- !query 241 schema +struct<(CAST( AS BINARY) = X'0020'):boolean> +-- !query 241 output +false + + +-- !query 242 +SELECT ' ' > X'001F' FROM t +-- !query 242 schema +struct<(CAST( AS BINARY) > X'001F'):boolean> +-- !query 242 output +true + + +-- !query 243 +SELECT ' ' >= X'001F' FROM t +-- !query 243 schema +struct<(CAST( AS 
BINARY) >= X'001F'):boolean> +-- !query 243 output +true + + +-- !query 244 +SELECT ' ' < X'001F' FROM t +-- !query 244 schema +struct<(CAST( AS BINARY) < X'001F'):boolean> +-- !query 244 output +false + + +-- !query 245 +SELECT ' ' <= X'001F' FROM t +-- !query 245 schema +struct<(CAST( AS BINARY) <= X'001F'):boolean> +-- !query 245 output +false + + +-- !query 246 +SELECT ' ' <> X'001F' FROM t +-- !query 246 schema +struct<(NOT (CAST( AS BINARY) = X'001F')):boolean> +-- !query 246 output +true + + +-- !query 247 +SELECT cast(null as string) = X'0020' FROM t +-- !query 247 schema +struct<(CAST(CAST(NULL AS STRING) AS BINARY) = X'0020'):boolean> +-- !query 247 output +NULL + + +-- !query 248 +SELECT cast(null as string) > X'001F' FROM t +-- !query 248 schema +struct<(CAST(CAST(NULL AS STRING) AS BINARY) > X'001F'):boolean> +-- !query 248 output +NULL + + +-- !query 249 +SELECT cast(null as string) >= X'001F' FROM t +-- !query 249 schema +struct<(CAST(CAST(NULL AS STRING) AS BINARY) >= X'001F'):boolean> +-- !query 249 output +NULL + + +-- !query 250 +SELECT cast(null as string) < X'001F' FROM t +-- !query 250 schema +struct<(CAST(CAST(NULL AS STRING) AS BINARY) < X'001F'):boolean> +-- !query 250 output +NULL + + +-- !query 251 +SELECT cast(null as string) <= X'001F' FROM t +-- !query 251 schema +struct<(CAST(CAST(NULL AS STRING) AS BINARY) <= X'001F'):boolean> +-- !query 251 output +NULL + + +-- !query 252 +SELECT cast(null as string) <> X'001F' FROM t +-- !query 252 schema +struct<(NOT (CAST(CAST(NULL AS STRING) AS BINARY) = X'001F')):boolean> +-- !query 252 output +NULL + + +-- !query 253 +SELECT X'0020' = ' ' FROM t +-- !query 253 schema +struct<(X'0020' = CAST( AS BINARY)):boolean> +-- !query 253 output +false + + +-- !query 254 +SELECT X'001F' > ' ' FROM t +-- !query 254 schema +struct<(X'001F' > CAST( AS BINARY)):boolean> +-- !query 254 output +false + + +-- !query 255 +SELECT X'001F' >= ' ' FROM t +-- !query 255 schema +struct<(X'001F' >= CAST( AS BINARY)):boolean> +-- !query 255 output +false + + +-- !query 256 +SELECT X'001F' < ' ' FROM t +-- !query 256 schema +struct<(X'001F' < CAST( AS BINARY)):boolean> +-- !query 256 output +true + + +-- !query 257 +SELECT X'001F' <= ' ' FROM t +-- !query 257 schema +struct<(X'001F' <= CAST( AS BINARY)):boolean> +-- !query 257 output +true + + +-- !query 258 +SELECT X'001F' <> ' ' FROM t +-- !query 258 schema +struct<(NOT (X'001F' = CAST( AS BINARY))):boolean> +-- !query 258 output +true + + +-- !query 259 +SELECT X'0020' = cast(null as string) FROM t +-- !query 259 schema +struct<(X'0020' = CAST(CAST(NULL AS STRING) AS BINARY)):boolean> +-- !query 259 output +NULL + + +-- !query 260 +SELECT X'001F' > cast(null as string) FROM t +-- !query 260 schema +struct<(X'001F' > CAST(CAST(NULL AS STRING) AS BINARY)):boolean> +-- !query 260 output +NULL + + +-- !query 261 +SELECT X'001F' >= cast(null as string) FROM t +-- !query 261 schema +struct<(X'001F' >= CAST(CAST(NULL AS STRING) AS BINARY)):boolean> +-- !query 261 output +NULL + + +-- !query 262 +SELECT X'001F' < cast(null as string) FROM t +-- !query 262 schema +struct<(X'001F' < CAST(CAST(NULL AS STRING) AS BINARY)):boolean> +-- !query 262 output +NULL + + +-- !query 263 +SELECT X'001F' <= cast(null as string) FROM t +-- !query 263 schema +struct<(X'001F' <= CAST(CAST(NULL AS STRING) AS BINARY)):boolean> +-- !query 263 output +NULL + + +-- !query 264 +SELECT X'001F' <> cast(null as string) FROM t +-- !query 264 schema +struct<(NOT (X'001F' = CAST(CAST(NULL AS STRING) AS BINARY))):boolean> +-- !query 
264 output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out new file mode 100644 index 0000000000000..44fa48e2697b3 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/implicitTypeCasts.sql.out @@ -0,0 +1,354 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 44 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT 1 + '2' FROM t +-- !query 1 schema +struct<(CAST(1 AS DOUBLE) + CAST(2 AS DOUBLE)):double> +-- !query 1 output +3.0 + + +-- !query 2 +SELECT 1 - '2' FROM t +-- !query 2 schema +struct<(CAST(1 AS DOUBLE) - CAST(2 AS DOUBLE)):double> +-- !query 2 output +-1.0 + + +-- !query 3 +SELECT 1 * '2' FROM t +-- !query 3 schema +struct<(CAST(1 AS DOUBLE) * CAST(2 AS DOUBLE)):double> +-- !query 3 output +2.0 + + +-- !query 4 +SELECT 4 / '2' FROM t +-- !query 4 schema +struct<(CAST(4 AS DOUBLE) / CAST(CAST(2 AS DOUBLE) AS DOUBLE)):double> +-- !query 4 output +2.0 + + +-- !query 5 +SELECT 1.1 + '2' FROM t +-- !query 5 schema +struct<(CAST(1.1 AS DOUBLE) + CAST(2 AS DOUBLE)):double> +-- !query 5 output +3.1 + + +-- !query 6 +SELECT 1.1 - '2' FROM t +-- !query 6 schema +struct<(CAST(1.1 AS DOUBLE) - CAST(2 AS DOUBLE)):double> +-- !query 6 output +-0.8999999999999999 + + +-- !query 7 +SELECT 1.1 * '2' FROM t +-- !query 7 schema +struct<(CAST(1.1 AS DOUBLE) * CAST(2 AS DOUBLE)):double> +-- !query 7 output +2.2 + + +-- !query 8 +SELECT 4.4 / '2' FROM t +-- !query 8 schema +struct<(CAST(4.4 AS DOUBLE) / CAST(2 AS DOUBLE)):double> +-- !query 8 output +2.2 + + +-- !query 9 +SELECT 1.1 + '2.2' FROM t +-- !query 9 schema +struct<(CAST(1.1 AS DOUBLE) + CAST(2.2 AS DOUBLE)):double> +-- !query 9 output +3.3000000000000003 + + +-- !query 10 +SELECT 1.1 - '2.2' FROM t +-- !query 10 schema +struct<(CAST(1.1 AS DOUBLE) - CAST(2.2 AS DOUBLE)):double> +-- !query 10 output +-1.1 + + +-- !query 11 +SELECT 1.1 * '2.2' FROM t +-- !query 11 schema +struct<(CAST(1.1 AS DOUBLE) * CAST(2.2 AS DOUBLE)):double> +-- !query 11 output +2.4200000000000004 + + +-- !query 12 +SELECT 4.4 / '2.2' FROM t +-- !query 12 schema +struct<(CAST(4.4 AS DOUBLE) / CAST(2.2 AS DOUBLE)):double> +-- !query 12 output +2.0 + + +-- !query 13 +SELECT '$' || cast(1 as smallint) || '$' FROM t +-- !query 13 schema +struct +-- !query 13 output +$1$ + + +-- !query 14 +SELECT '$' || 1 || '$' FROM t +-- !query 14 schema +struct +-- !query 14 output +$1$ + + +-- !query 15 +SELECT '$' || cast(1 as bigint) || '$' FROM t +-- !query 15 schema +struct +-- !query 15 output +$1$ + + +-- !query 16 +SELECT '$' || cast(1.1 as float) || '$' FROM t +-- !query 16 schema +struct +-- !query 16 output +$1.1$ + + +-- !query 17 +SELECT '$' || cast(1.1 as double) || '$' FROM t +-- !query 17 schema +struct +-- !query 17 output +$1.1$ + + +-- !query 18 +SELECT '$' || 1.1 || '$' FROM t +-- !query 18 schema +struct +-- !query 18 output +$1.1$ + + +-- !query 19 +SELECT '$' || cast(1.1 as decimal(8,3)) || '$' FROM t +-- !query 19 schema +struct +-- !query 19 output +$1.100$ + + +-- !query 20 +SELECT '$' || 'abcd' || '$' FROM t +-- !query 20 schema +struct +-- !query 20 output +$abcd$ + + +-- !query 21 +SELECT '$' || date('1996-09-09') || '$' FROM t +-- !query 21 schema +struct +-- !query 21 output +$1996-09-09$ + + +-- !query 22 +SELECT '$' || timestamp('1996-09-09 10:11:12.4' )|| '$' FROM t 
+-- !query 22 schema +struct +-- !query 22 output +$1996-09-09 10:11:12.4$ + + +-- !query 23 +SELECT length(cast(1 as smallint)) FROM t +-- !query 23 schema +struct +-- !query 23 output +1 + + +-- !query 24 +SELECT length(cast(1 as int)) FROM t +-- !query 24 schema +struct +-- !query 24 output +1 + + +-- !query 25 +SELECT length(cast(1 as bigint)) FROM t +-- !query 25 schema +struct +-- !query 25 output +1 + + +-- !query 26 +SELECT length(cast(1.1 as float)) FROM t +-- !query 26 schema +struct +-- !query 26 output +3 + + +-- !query 27 +SELECT length(cast(1.1 as double)) FROM t +-- !query 27 schema +struct +-- !query 27 output +3 + + +-- !query 28 +SELECT length(1.1) FROM t +-- !query 28 schema +struct +-- !query 28 output +3 + + +-- !query 29 +SELECT length(cast(1.1 as decimal(8,3))) FROM t +-- !query 29 schema +struct +-- !query 29 output +5 + + +-- !query 30 +SELECT length('four') FROM t +-- !query 30 schema +struct +-- !query 30 output +4 + + +-- !query 31 +SELECT length(date('1996-09-10')) FROM t +-- !query 31 schema +struct +-- !query 31 output +10 + + +-- !query 32 +SELECT length(timestamp('1996-09-10 10:11:12.4')) FROM t +-- !query 32 schema +struct +-- !query 32 output +21 + + +-- !query 33 +SELECT year( '1996-01-10') FROM t +-- !query 33 schema +struct +-- !query 33 output +1996 + + +-- !query 34 +SELECT month( '1996-01-10') FROM t +-- !query 34 schema +struct +-- !query 34 output +1 + + +-- !query 35 +SELECT day( '1996-01-10') FROM t +-- !query 35 schema +struct +-- !query 35 output +10 + + +-- !query 36 +SELECT hour( '10:11:12') FROM t +-- !query 36 schema +struct +-- !query 36 output +10 + + +-- !query 37 +SELECT minute( '10:11:12') FROM t +-- !query 37 schema +struct +-- !query 37 output +11 + + +-- !query 38 +SELECT second( '10:11:12') FROM t +-- !query 38 schema +struct +-- !query 38 output +12 + + +-- !query 39 +select 1 like '%' FROM t +-- !query 39 schema +struct +-- !query 39 output +true + + +-- !query 40 +select date('1996-09-10') like '19%' FROM t +-- !query 40 schema +struct +-- !query 40 output +true + + +-- !query 41 +select '1' like 1 FROM t +-- !query 41 schema +struct<1 LIKE CAST(1 AS STRING):boolean> +-- !query 41 output +true + + +-- !query 42 +select '1 ' like 1 FROM t +-- !query 42 schema +struct<1 LIKE CAST(1 AS STRING):boolean> +-- !query 42 output +false + + +-- !query 43 +select '1996-09-10' like date('1996-09-10') FROM t +-- !query 43 schema +struct<1996-09-10 LIKE CAST(CAST(1996-09-10 AS DATE) AS STRING):boolean> +-- !query 43 output +true diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index cac3e12dde3e1..e3901af4b9988 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -300,7 +300,7 @@ class SQLQueryTestSuite extends QueryTest with SharedSQLContext { Locale.setDefault(originalLocale) // For debugging dump some statistics about how much time was spent in various optimizer rules - logInfo(RuleExecutor.dumpTimeSpent()) + logWarning(RuleExecutor.dumpTimeSpent()) } finally { super.afterAll() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala index dbea03689785c..a58000da1543d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala @@ -41,7 +41,7 @@ 
class TPCDSQuerySuite extends QueryTest with SharedSQLContext with BeforeAndAfte protected override def afterAll(): Unit = { try { // For debugging dump some statistics about how much time was spent in various optimizer rules - logInfo(RuleExecutor.dumpTimeSpent()) + logWarning(RuleExecutor.dumpTimeSpent()) spark.sessionState.catalog.reset() } finally { super.afterAll() diff --git a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala index def70a516a96b..45791c69b4cb7 100644 --- a/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala +++ b/sql/hive/compatibility/src/test/scala/org/apache/spark/sql/hive/execution/HiveCompatibilitySuite.scala @@ -76,7 +76,7 @@ class HiveCompatibilitySuite extends HiveQueryFileTest with BeforeAndAfter { TestHive.setConf(SQLConf.SESSION_LOCAL_TIMEZONE, originalSessionLocalTimeZone) // For debugging dump some statistics about how much time was spent in various optimizer rules - logInfo(RuleExecutor.dumpTimeSpent()) + logWarning(RuleExecutor.dumpTimeSpent()) } finally { super.afterAll() } From a4002651a3ea673cf3eff7927531c1659663d194 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 11 Dec 2017 16:33:06 -0800 Subject: [PATCH 086/356] [SPARK-20557][SQL] Only support TIMESTAMP WITH TIME ZONE for Oracle Dialect ## What changes were proposed in this pull request? In the previous PRs, https://github.com/apache/spark/pull/17832 and https://github.com/apache/spark/pull/17835 , we convert `TIMESTAMP WITH TIME ZONE` and `TIME WITH TIME ZONE` to `TIMESTAMP` for all the JDBC sources. However, this conversion could be risky since it does not respect our SQL configuration `spark.sql.session.timeZone`. In addition, each vendor might have different semantics for these two types. For example, Postgres simply returns `TIMESTAMP` types for `TIMESTAMP WITH TIME ZONE`. For such supports, we should do it case by case. This PR reverts the general support of `TIMESTAMP WITH TIME ZONE` and `TIME WITH TIME ZONE` for JDBC sources, except ORACLE Dialect. When supporting the ORACLE's `TIMESTAMP WITH TIME ZONE`, we only support it when the JVM default timezone is the same as the user-specified configuration `spark.sql.session.timeZone` (whose default is the JVM default timezone). Now, we still treat `TIMESTAMP WITH TIME ZONE` as `TIMESTAMP` when fetching the values via the Oracle JDBC connector, whose client converts the timestamp values with time zone to the timestamp values using the local JVM default timezone (a test case is added to `OracleIntegrationSuite.scala` in this PR for showing the behavior). Thus, to avoid any future behavior change, we will not support it if JVM default timezone is different from `spark.sql.session.timeZone` No regression because the previous two PRs were just merged to be unreleased master branch. ## How was this patch tested? Added the test cases Author: gatorsmile Closes #19939 from gatorsmile/timezoneUpdate. 
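For illustration only (not part of the patch), a minimal read-side sketch of the behavior described above, runnable in spark-shell against a reachable Oracle instance; `jdbcUrl` is a placeholder and the `ts_with_timezone` table is assumed to exist, as in the integration test in this diff:

```scala
// Hypothetical sketch: Oracle TIMESTAMP WITH TIME ZONE is only mapped to Spark's
// TimestampType when spark.sql.session.timeZone matches the JVM default time zone.
import java.util.{Properties, TimeZone}

val jdbcUrl = "jdbc:oracle:thin:system/oracle@//localhost:1521/xe"  // placeholder URL

// Session time zone aligned with the JVM default: the read succeeds and the
// TIMESTAMP WITH TIME ZONE column is exposed as a timestamp.
spark.conf.set("spark.sql.session.timeZone", TimeZone.getDefault.getID)
val ok = spark.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties)
ok.printSchema()

// Session time zone different from the JVM default (assuming the JVM default is not
// Asia/Shanghai): the dialect no longer claims the type, so the read fails fast; the
// new test expects an SQLException mentioning "Unrecognized SQL type -101".
spark.conf.set("spark.sql.session.timeZone", "Asia/Shanghai")
val failing = spark.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties)
// failing.collect()  // throws java.sql.SQLException
```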
--- .../sql/jdbc/OracleIntegrationSuite.scala | 67 ++++++++++++++++++- .../sql/jdbc/PostgresIntegrationSuite.scala | 2 + .../datasources/jdbc/JdbcUtils.scala | 4 +- .../apache/spark/sql/jdbc/OracleDialect.scala | 13 +++- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 4 +- 5 files changed, 82 insertions(+), 8 deletions(-) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala index 90343182712ed..8512496e5fe52 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala @@ -18,11 +18,12 @@ package org.apache.spark.sql.jdbc import java.sql.{Connection, Date, Timestamp} -import java.util.Properties +import java.util.{Properties, TimeZone} import java.math.BigDecimal -import org.apache.spark.sql.{DataFrame, Row, SaveMode} -import org.apache.spark.sql.execution.{WholeStageCodegenExec, RowDataSourceScanExec} +import org.apache.spark.sql.{DataFrame, QueryTest, Row, SaveMode} +import org.apache.spark.sql.execution.{RowDataSourceScanExec, WholeStageCodegenExec} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.tags.DockerTest @@ -77,6 +78,9 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo conn.prepareStatement( "INSERT INTO ts_with_timezone VALUES " + "(1, to_timestamp_tz('1999-12-01 11:00:00 UTC','YYYY-MM-DD HH:MI:SS TZR'))").executeUpdate() + conn.prepareStatement( + "INSERT INTO ts_with_timezone VALUES " + + "(2, to_timestamp_tz('1999-12-01 12:00:00 PST','YYYY-MM-DD HH:MI:SS TZR'))").executeUpdate() conn.commit() conn.prepareStatement( @@ -235,6 +239,63 @@ class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSQLCo assert(types(1).equals("class java.sql.Timestamp")) } + test("Column type TIMESTAMP with SESSION_LOCAL_TIMEZONE is different from default") { + val defaultJVMTimeZone = TimeZone.getDefault + // Pick the timezone different from the current default time zone of JVM + val sofiaTimeZone = TimeZone.getTimeZone("Europe/Sofia") + val shanghaiTimeZone = TimeZone.getTimeZone("Asia/Shanghai") + val localSessionTimeZone = + if (defaultJVMTimeZone == shanghaiTimeZone) sofiaTimeZone else shanghaiTimeZone + + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> localSessionTimeZone.getID) { + val e = intercept[java.sql.SQLException] { + val dfRead = sqlContext.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties) + dfRead.collect() + }.getMessage + assert(e.contains("Unrecognized SQL type -101")) + } + } + + /** + * Change the Time Zone `timeZoneId` of JVM before executing `f`, then switches back to the + * original after `f` returns. + * @param timeZoneId the ID for a TimeZone, either an abbreviation such as "PST", a full name such + * as "America/Los_Angeles", or a custom ID such as "GMT-8:00". 
+ */ + private def withTimeZone(timeZoneId: String)(f: => Unit): Unit = { + val originalLocale = TimeZone.getDefault + try { + // Add Locale setting + TimeZone.setDefault(TimeZone.getTimeZone(timeZoneId)) + f + } finally { + TimeZone.setDefault(originalLocale) + } + } + + test("Column TIMESTAMP with TIME ZONE(JVM timezone)") { + def checkRow(row: Row, ts: String): Unit = { + assert(row.getTimestamp(1).equals(Timestamp.valueOf(ts))) + } + + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> TimeZone.getDefault.getID) { + val dfRead = sqlContext.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties) + withTimeZone("PST") { + assert(dfRead.collect().toSet === + Set( + Row(BigDecimal.valueOf(1), java.sql.Timestamp.valueOf("1999-12-01 03:00:00")), + Row(BigDecimal.valueOf(2), java.sql.Timestamp.valueOf("1999-12-01 12:00:00")))) + } + + withTimeZone("UTC") { + assert(dfRead.collect().toSet === + Set( + Row(BigDecimal.valueOf(1), java.sql.Timestamp.valueOf("1999-12-01 11:00:00")), + Row(BigDecimal.valueOf(2), java.sql.Timestamp.valueOf("1999-12-01 20:00:00")))) + } + } + } + test("SPARK-18004: Make sure date or timestamp related predicate is pushed down correctly") { val props = new Properties() props.put("oracle.jdbc.mapDateToTimestamp", "false") diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala index 48aba90afc787..be32cb89f4886 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala +++ b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala @@ -151,6 +151,8 @@ class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite { test("SPARK-20557: column type TIMESTAMP with TIME ZONE and TIME with TIME ZONE " + "should be recognized") { + // When using JDBC to read the columns of TIMESTAMP with TIME ZONE and TIME with TIME ZONE + // the actual types are java.sql.Types.TIMESTAMP and java.sql.Types.TIME val dfRead = sqlContext.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties) val rows = dfRead.collect() val types = rows(0).toSeq.map(x => x.getClass.toString) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index 75c94fc486493..bbc95df4d9dc4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -226,10 +226,10 @@ object JdbcUtils extends Logging { case java.sql.Types.STRUCT => StringType case java.sql.Types.TIME => TimestampType case java.sql.Types.TIME_WITH_TIMEZONE - => TimestampType + => null case java.sql.Types.TIMESTAMP => TimestampType case java.sql.Types.TIMESTAMP_WITH_TIMEZONE - => TimestampType + => null case java.sql.Types.TINYINT => IntegerType case java.sql.Types.VARBINARY => BinaryType case java.sql.Types.VARCHAR => StringType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala index e3f106c41c7ff..6ef77f24460be 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala @@ -18,7 +18,10 @@ package 
org.apache.spark.sql.jdbc import java.sql.{Date, Timestamp, Types} +import java.util.TimeZone +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -29,6 +32,13 @@ private case object OracleDialect extends JdbcDialect { override def canHandle(url: String): Boolean = url.startsWith("jdbc:oracle") + private def supportTimeZoneTypes: Boolean = { + val timeZone = DateTimeUtils.getTimeZone(SQLConf.get.sessionLocalTimeZone) + // TODO: support timezone types when users are not using the JVM timezone, which + // is the default value of SESSION_LOCAL_TIMEZONE + timeZone == TimeZone.getDefault + } + override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { sqlType match { @@ -49,7 +59,8 @@ private case object OracleDialect extends JdbcDialect { case _ if scale == -127L => Option(DecimalType(DecimalType.MAX_PRECISION, 10)) case _ => None } - case TIMESTAMPTZ => Some(TimestampType) // Value for Timestamp with Time Zone in Oracle + case TIMESTAMPTZ if supportTimeZoneTypes + => Some(TimestampType) // Value for Timestamp with Time Zone in Oracle case BINARY_FLOAT => Some(FloatType) // Value for OracleTypes.BINARY_FLOAT case BINARY_DOUBLE => Some(DoubleType) // Value for OracleTypes.BINARY_DOUBLE case _ => None diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 61571bccdcb51..0767ca1573a7f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -1064,10 +1064,10 @@ class JDBCSuite extends SparkFunSuite } test("unsupported types") { - var e = intercept[SparkException] { + var e = intercept[SQLException] { spark.read.jdbc(urlWithUserAndPass, "TEST.TIMEZONE", new Properties()).collect() }.getMessage - assert(e.contains("java.lang.UnsupportedOperationException: unimplemented")) + assert(e.contains("Unsupported type TIMESTAMP_WITH_TIMEZONE")) e = intercept[SQLException] { spark.read.jdbc(urlWithUserAndPass, "TEST.ARRAY", new Properties()).collect() }.getMessage From ecc179ecaa9b4381d2f6d3356d88a4dff3e19c0f Mon Sep 17 00:00:00 2001 From: Ron Hu Date: Tue, 12 Dec 2017 15:04:49 +0800 Subject: [PATCH 087/356] [SPARK-21322][SQL] support histogram in filter cardinality estimation ## What changes were proposed in this pull request? Histogram is effective in dealing with skewed distribution. After we generate histogram information for column statistics, we need to adjust filter estimation based on histogram data structure. ## How was this patch tested? We revised all the unit test cases by including histogram data structure. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Ron Hu Closes #19783 from ron8hu/supportHistogram. 
--- .../statsEstimation/EstimationUtils.scala | 97 +++++++- .../statsEstimation/FilterEstimation.scala | 181 +++++++++++---- .../FilterEstimationSuite.scala | 217 +++++++++++++++++- 3 files changed, 448 insertions(+), 47 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index 9c34a9b7aa756..2f416f2af91f1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation import scala.math.BigDecimal.RoundingMode import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap} -import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LogicalPlan, Statistics} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types.{DecimalType, _} @@ -114,4 +114,99 @@ object EstimationUtils { } } + /** + * Returns the number of the first bin into which a column value falls for a specified + * numeric equi-height histogram. + * + * @param value a literal value of a column + * @param bins an array of bins for a given numeric equi-height histogram + * @return the id of the first bin into which a column value falls. + */ + def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = { + var i = 0 + while ((i < bins.length) && (value > bins(i).hi)) { + i += 1 + } + i + } + + /** + * Returns the number of the last bin into which a column value falls for a specified + * numeric equi-height histogram. + * + * @param value a literal value of a column + * @param bins an array of bins for a given numeric equi-height histogram + * @return the id of the last bin into which a column value falls. + */ + def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = { + var i = bins.length - 1 + while ((i >= 0) && (value < bins(i).lo)) { + i -= 1 + } + i + } + + /** + * Returns a percentage of a bin holding values for column value in the range of + * [lowerValue, higherValue] + * + * @param higherValue a given upper bound value of a specified column value range + * @param lowerValue a given lower bound value of a specified column value range + * @param bin a single histogram bin + * @return the percentage of a single bin holding values in [lowerValue, higherValue]. + */ + private def getOccupation( + higherValue: Double, + lowerValue: Double, + bin: HistogramBin): Double = { + assert(bin.lo <= lowerValue && lowerValue <= higherValue && higherValue <= bin.hi) + if (bin.hi == bin.lo) { + // the entire bin is covered in the range + 1.0 + } else if (higherValue == lowerValue) { + // set percentage to 1/NDV + 1.0 / bin.ndv.toDouble + } else { + // Use proration since the range falls inside this bin. + math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0) + } + } + + /** + * Returns the number of bins for column values in [lowerValue, higherValue]. + * The column value distribution is saved in an equi-height histogram. The return values is a + * double value is because we may return a portion of a bin. For example, a predicate + * "column = 8" may return the number of bins 0.2 if the holding bin has 5 distinct values. 
+ * + * @param higherId id of the high end bin holding the high end value of a column range + * @param lowerId id of the low end bin holding the low end value of a column range + * @param higherEnd a given upper bound value of a specified column value range + * @param lowerEnd a given lower bound value of a specified column value range + * @param histogram a numeric equi-height histogram + * @return the number of bins for column values in [lowerEnd, higherEnd]. + */ + def getOccupationBins( + higherId: Int, + lowerId: Int, + higherEnd: Double, + lowerEnd: Double, + histogram: Histogram): Double = { + assert(lowerId <= higherId) + + if (lowerId == higherId) { + val curBin = histogram.bins(lowerId) + getOccupation(higherEnd, lowerEnd, curBin) + } else { + // compute how much lowerEnd/higherEnd occupies its bin + val lowerCurBin = histogram.bins(lowerId) + val lowerPart = getOccupation(lowerCurBin.hi, lowerEnd, lowerCurBin) + + val higherCurBin = histogram.bins(higherId) + val higherPart = getOccupation(higherEnd, higherCurBin.lo, higherCurBin) + + // the total length is lowerPart + higherPart + bins between them + lowerPart + higherPart + higherId - lowerId - 1 + } + } + } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index 74820eb97d081..f52a15edbfe4a 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -23,7 +23,7 @@ import scala.collection.mutable import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} -import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Filter, LeafNode, Statistics} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ import org.apache.spark.sql.types._ @@ -265,7 +265,7 @@ case class FilterEstimation(plan: Filter) extends Logging { * @param update a boolean flag to specify if we need to update ColumnStat of a given column * for subsequent conditions * @return an optional double value to show the percentage of rows meeting a given condition - * It returns None if no statistics exists for a given column or wrong value. + * It returns None if no statistics exists for a given column or wrong value. */ def evaluateBinary( op: BinaryComparison, @@ -332,8 +332,44 @@ case class FilterEstimation(plan: Filter) extends Logging { colStatsMap.update(attr, newStats) } - Some(1.0 / BigDecimal(ndv)) - } else { + if (colStat.histogram.isEmpty) { + // returns 1/ndv if there is no histogram + Some(1.0 / BigDecimal(ndv)) + } else { + // We compute filter selectivity using Histogram information. + val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble + val histogram = colStat.histogram.get + val hgmBins = histogram.bins + + // find bins where column's current min and max locate. Note that a column's [min, max] + // range may change due to another condition applied earlier. 
+ val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble + val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble + val minBinId = EstimationUtils.findFirstBinForValue(min, hgmBins) + val maxBinId = EstimationUtils.findLastBinForValue(max, hgmBins) + + // compute how many bins the column's current valid range [min, max] occupies. + // Note that a column's [min, max] range may vary after we apply some filter conditions. + val validRangeBins = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, + min, histogram) + + val lowerBinId = EstimationUtils.findFirstBinForValue(datum, hgmBins) + val higherBinId = EstimationUtils.findLastBinForValue(datum, hgmBins) + assert(lowerBinId <= higherBinId) + val lowerBinNdv = hgmBins(lowerBinId).ndv + val higherBinNdv = hgmBins(higherBinId).ndv + // assume uniform distribution in each bin + val occupiedBins = if (lowerBinId == higherBinId) { + 1.0 / lowerBinNdv + } else { + (1.0 / lowerBinNdv) + // lowest bin + (higherBinId - lowerBinId - 1) + // middle bins + (1.0 / higherBinNdv) // highest bin + } + Some(occupiedBins / validRangeBins) + } + + } else { // not in interval Some(0.0) } @@ -471,37 +507,46 @@ case class FilterEstimation(plan: Filter) extends Logging { percent = 1.0 } else { // This is the partial overlap case: - // Without advanced statistics like histogram, we assume uniform data distribution. - // We just prorate the adjusted range over the initial range to compute filter selectivity. - assert(max > min) - percent = op match { - case _: LessThan => - if (numericLiteral == max) { - // If the literal value is right on the boundary, we can minus the part of the - // boundary value (1/ndv). - 1.0 - 1.0 / ndv - } else { - (numericLiteral - min) / (max - min) - } - case _: LessThanOrEqual => - if (numericLiteral == min) { - // The boundary value is the only satisfying value. - 1.0 / ndv - } else { - (numericLiteral - min) / (max - min) - } - case _: GreaterThan => - if (numericLiteral == min) { - 1.0 - 1.0 / ndv - } else { - (max - numericLiteral) / (max - min) - } - case _: GreaterThanOrEqual => - if (numericLiteral == max) { - 1.0 / ndv - } else { - (max - numericLiteral) / (max - min) - } + + if (colStat.histogram.isEmpty) { + // Without advanced statistics like histogram, we assume uniform data distribution. + // We just prorate the adjusted range over the initial range to compute filter selectivity. + assert(max > min) + percent = op match { + case _: LessThan => + if (numericLiteral == max) { + // If the literal value is right on the boundary, we can minus the part of the + // boundary value (1/ndv). + 1.0 - 1.0 / ndv + } else { + (numericLiteral - min) / (max - min) + } + case _: LessThanOrEqual => + if (numericLiteral == min) { + // The boundary value is the only satisfying value. 
+ 1.0 / ndv + } else { + (numericLiteral - min) / (max - min) + } + case _: GreaterThan => + if (numericLiteral == min) { + 1.0 - 1.0 / ndv + } else { + (max - numericLiteral) / (max - min) + } + case _: GreaterThanOrEqual => + if (numericLiteral == max) { + 1.0 / ndv + } else { + (max - numericLiteral) / (max - min) + } + } + } else { + val numericHistogram = colStat.histogram.get + val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble + val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble + val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble + percent = computePercentByEquiHeightHgm(op, numericHistogram, max, min, datum) } if (update) { @@ -513,10 +558,9 @@ case class FilterEstimation(plan: Filter) extends Logging { op match { case _: GreaterThan | _: GreaterThanOrEqual => - // If new ndv is 1, then new max must be equal to new min. - newMin = if (newNdv == 1) newMax else newValue + newMin = newValue case _: LessThan | _: LessThanOrEqual => - newMax = if (newNdv == 1) newMin else newValue + newMax = newValue } val newStats = @@ -529,6 +573,54 @@ case class FilterEstimation(plan: Filter) extends Logging { Some(percent) } + /** + * Returns the selectivity percentage for binary condition in the column's + * current valid range [min, max] + * + * @param op a binary comparison operator + * @param histogram a numeric equi-height histogram + * @param max the upper bound of the current valid range for a given column + * @param min the lower bound of the current valid range for a given column + * @param datumNumber the numeric value of a literal + * @return the selectivity percentage for a condition in the current range. + */ + + def computePercentByEquiHeightHgm( + op: BinaryComparison, + histogram: Histogram, + max: Double, + min: Double, + datumNumber: Double): Double = { + // find bins where column's current min and max locate. Note that a column's [min, max] + // range may change due to another condition applied earlier. + val minBinId = EstimationUtils.findFirstBinForValue(min, histogram.bins) + val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins) + + // compute how many bins the column's current valid range [min, max] occupies. + // Note that a column's [min, max] range may vary after we apply some filter conditions. + val minToMaxLength = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, min, histogram) + + val datumInBinId = op match { + case LessThan(_, _) | GreaterThanOrEqual(_, _) => + EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins) + case LessThanOrEqual(_, _) | GreaterThan(_, _) => + EstimationUtils.findLastBinForValue(datumNumber, histogram.bins) + } + + op match { + // LessThan and LessThanOrEqual share the same logic, + // but their datumInBinId may be different + case LessThan(_, _) | LessThanOrEqual(_, _) => + EstimationUtils.getOccupationBins(datumInBinId, minBinId, datumNumber, min, + histogram) / minToMaxLength + // GreaterThan and GreaterThanOrEqual share the same logic, + // but their datumInBinId may be different + case GreaterThan(_, _) | GreaterThanOrEqual(_, _) => + EstimationUtils.getOccupationBins(maxBinId, datumInBinId, max, datumNumber, + histogram) / minToMaxLength + } + } + /** * Returns a percentage of rows meeting a binary comparison expression containing two columns. 
* In SQL queries, we also see predicate expressions involving two columns @@ -784,11 +876,16 @@ case class ColumnStatsMap(originalMap: AttributeMap[ColumnStat]) { def outputColumnStats(rowsBeforeFilter: BigInt, rowsAfterFilter: BigInt) : AttributeMap[ColumnStat] = { val newColumnStats = originalMap.map { case (attr, oriColStat) => - // Update ndv based on the overall filter selectivity: scale down ndv if the number of rows - // decreases; otherwise keep it unchanged. - val newNdv = EstimationUtils.updateNdv(oldNumRows = rowsBeforeFilter, - newNumRows = rowsAfterFilter, oldNdv = oriColStat.distinctCount) val colStat = updatedMap.get(attr.exprId).map(_._2).getOrElse(oriColStat) + val newNdv = if (colStat.distinctCount > 1) { + // Update ndv based on the overall filter selectivity: scale down ndv if the number of rows + // decreases; otherwise keep it unchanged. + EstimationUtils.updateNdv(oldNumRows = rowsBeforeFilter, + newNumRows = rowsAfterFilter, oldNdv = oriColStat.distinctCount) + } else { + // no need to scale down since it is already down to 1 (for skewed distribution case) + colStat.distinctCount + } attr -> colStat.copy(distinctCount = newNdv) } AttributeMap(newColumnStats.toSeq) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala index 455037e6c9952..2b1fe987a7960 100755 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/FilterEstimationSuite.scala @@ -22,7 +22,7 @@ import java.sql.Date import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} import org.apache.spark.sql.catalyst.plans.LeftOuter -import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Filter, Join, Statistics} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ @@ -33,7 +33,7 @@ import org.apache.spark.sql.types._ */ class FilterEstimationSuite extends StatsEstimationTestBase { - // Suppose our test table has 10 rows and 6 columns. + // Suppose our test table has 10 rows and 10 columns. // column cint has values: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 // Hence, distinctCount:10, min:1, max:10, nullCount:0, avgLen:4, maxLen:4 val attrInt = AttributeReference("cint", IntegerType)() @@ -91,6 +91,26 @@ class FilterEstimationSuite extends StatsEstimationTestBase { val colStatInt4 = ColumnStat(distinctCount = 10, min = Some(1), max = Some(10), nullCount = 0, avgLen = 4, maxLen = 4) + // column cintHgm has values: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 with histogram. + // Note that cintHgm has an even distribution with histogram information built. 
+ // Hence, distinctCount:10, min:1, max:10, nullCount:0, avgLen:4, maxLen:4 + val attrIntHgm = AttributeReference("cintHgm", IntegerType)() + val hgmInt = Histogram(2.0, Array(HistogramBin(1.0, 2.0, 2), + HistogramBin(2.0, 4.0, 2), HistogramBin(4.0, 6.0, 2), + HistogramBin(6.0, 8.0, 2), HistogramBin(8.0, 10.0, 2))) + val colStatIntHgm = ColumnStat(distinctCount = 10, min = Some(1), max = Some(10), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt)) + + // column cintSkewHgm has values: 1, 4, 4, 5, 5, 5, 5, 6, 6, 10 with histogram. + // Note that cintSkewHgm has a skewed distribution with histogram information built. + // distinctCount:5, min:1, max:10, nullCount:0, avgLen:4, maxLen:4 + val attrIntSkewHgm = AttributeReference("cintSkewHgm", IntegerType)() + val hgmIntSkew = Histogram(2.0, Array(HistogramBin(1.0, 4.0, 2), + HistogramBin(4.0, 5.0, 2), HistogramBin(5.0, 5.0, 1), + HistogramBin(5.0, 6.0, 2), HistogramBin(6.0, 10.0, 2))) + val colStatIntSkewHgm = ColumnStat(distinctCount = 5, min = Some(1), max = Some(10), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmIntSkew)) + val attributeMap = AttributeMap(Seq( attrInt -> colStatInt, attrBool -> colStatBool, @@ -100,7 +120,9 @@ class FilterEstimationSuite extends StatsEstimationTestBase { attrString -> colStatString, attrInt2 -> colStatInt2, attrInt3 -> colStatInt3, - attrInt4 -> colStatInt4 + attrInt4 -> colStatInt4, + attrIntHgm -> colStatIntHgm, + attrIntSkewHgm -> colStatIntSkewHgm )) test("true") { @@ -359,7 +381,7 @@ class FilterEstimationSuite extends StatsEstimationTestBase { test("cbool > false") { validateEstimatedStats( Filter(GreaterThan(attrBool, Literal(false)), childStatsTestPlan(Seq(attrBool), 10L)), - Seq(attrBool -> ColumnStat(distinctCount = 1, min = Some(true), max = Some(true), + Seq(attrBool -> ColumnStat(distinctCount = 1, min = Some(false), max = Some(true), nullCount = 0, avgLen = 1, maxLen = 1)), expectedRowCount = 5) } @@ -578,6 +600,193 @@ class FilterEstimationSuite extends StatsEstimationTestBase { expectedRowCount = 5) } + // The following test cases have histogram information collected for the test column with + // an even distribution + test("Not(cintHgm < 3 AND null)") { + val condition = Not(And(LessThan(attrIntHgm, Literal(3)), Literal(null, IntegerType))) + validateEstimatedStats( + Filter(condition, childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> colStatIntHgm.copy(distinctCount = 7)), + expectedRowCount = 7) + } + + test("cintHgm = 5") { + validateEstimatedStats( + Filter(EqualTo(attrIntHgm, Literal(5)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 1, min = Some(5), max = Some(5), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 1) + } + + test("cintHgm = 0") { + // This is an out-of-range case since 0 is outside the range [min, max] + validateEstimatedStats( + Filter(EqualTo(attrIntHgm, Literal(0)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Nil, + expectedRowCount = 0) + } + + test("cintHgm < 3") { + validateEstimatedStats( + Filter(LessThan(attrIntHgm, Literal(3)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 3, min = Some(1), max = Some(3), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 3) + } + + test("cintHgm < 0") { + // This is a corner case since literal 0 is smaller than min. 
+ validateEstimatedStats( + Filter(LessThan(attrIntHgm, Literal(0)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Nil, + expectedRowCount = 0) + } + + test("cintHgm <= 3") { + validateEstimatedStats( + Filter(LessThanOrEqual(attrIntHgm, Literal(3)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 3, min = Some(1), max = Some(3), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 3) + } + + test("cintHgm > 6") { + validateEstimatedStats( + Filter(GreaterThan(attrIntHgm, Literal(6)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 4, min = Some(6), max = Some(10), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 4) + } + + test("cintHgm > 10") { + // This is a corner case since max value is 10. + validateEstimatedStats( + Filter(GreaterThan(attrIntHgm, Literal(10)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Nil, + expectedRowCount = 0) + } + + test("cintHgm >= 6") { + validateEstimatedStats( + Filter(GreaterThanOrEqual(attrIntHgm, Literal(6)), childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 5, min = Some(6), max = Some(10), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 5) + } + + test("cintHgm > 3 AND cintHgm <= 6") { + val condition = And(GreaterThan(attrIntHgm, + Literal(3)), LessThanOrEqual(attrIntHgm, Literal(6))) + validateEstimatedStats( + Filter(condition, childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> ColumnStat(distinctCount = 4, min = Some(3), max = Some(6), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmInt))), + expectedRowCount = 4) + } + + test("cintHgm = 3 OR cintHgm = 6") { + val condition = Or(EqualTo(attrIntHgm, Literal(3)), EqualTo(attrIntHgm, Literal(6))) + validateEstimatedStats( + Filter(condition, childStatsTestPlan(Seq(attrIntHgm), 10L)), + Seq(attrIntHgm -> colStatIntHgm.copy(distinctCount = 3)), + expectedRowCount = 3) + } + + // The following test cases have histogram information collected for the test column with + // a skewed distribution. + test("Not(cintSkewHgm < 3 AND null)") { + val condition = Not(And(LessThan(attrIntSkewHgm, Literal(3)), Literal(null, IntegerType))) + validateEstimatedStats( + Filter(condition, childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Seq(attrIntSkewHgm -> colStatIntSkewHgm.copy(distinctCount = 5)), + expectedRowCount = 9) + } + + test("cintSkewHgm = 5") { + validateEstimatedStats( + Filter(EqualTo(attrIntSkewHgm, Literal(5)), childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Seq(attrIntSkewHgm -> ColumnStat(distinctCount = 1, min = Some(5), max = Some(5), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmIntSkew))), + expectedRowCount = 4) + } + + test("cintSkewHgm = 0") { + // This is an out-of-range case since 0 is outside the range [min, max] + validateEstimatedStats( + Filter(EqualTo(attrIntSkewHgm, Literal(0)), childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Nil, + expectedRowCount = 0) + } + + test("cintSkewHgm < 3") { + validateEstimatedStats( + Filter(LessThan(attrIntSkewHgm, Literal(3)), childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Seq(attrIntSkewHgm -> ColumnStat(distinctCount = 1, min = Some(1), max = Some(3), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmIntSkew))), + expectedRowCount = 2) + } + + test("cintSkewHgm < 0") { + // This is a corner case since literal 0 is smaller than min. 
+ validateEstimatedStats( + Filter(LessThan(attrIntSkewHgm, Literal(0)), childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Nil, + expectedRowCount = 0) + } + + test("cintSkewHgm <= 3") { + validateEstimatedStats( + Filter(LessThanOrEqual(attrIntSkewHgm, Literal(3)), + childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Seq(attrIntSkewHgm -> ColumnStat(distinctCount = 1, min = Some(1), max = Some(3), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmIntSkew))), + expectedRowCount = 2) + } + + test("cintSkewHgm > 6") { + validateEstimatedStats( + Filter(GreaterThan(attrIntSkewHgm, Literal(6)), childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Seq(attrIntSkewHgm -> ColumnStat(distinctCount = 1, min = Some(6), max = Some(10), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmIntSkew))), + expectedRowCount = 2) + } + + test("cintSkewHgm > 10") { + // This is a corner case since max value is 10. + validateEstimatedStats( + Filter(GreaterThan(attrIntSkewHgm, Literal(10)), + childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Nil, + expectedRowCount = 0) + } + + test("cintSkewHgm >= 6") { + validateEstimatedStats( + Filter(GreaterThanOrEqual(attrIntSkewHgm, Literal(6)), + childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Seq(attrIntSkewHgm -> ColumnStat(distinctCount = 2, min = Some(6), max = Some(10), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmIntSkew))), + expectedRowCount = 3) + } + + test("cintSkewHgm > 3 AND cintSkewHgm <= 6") { + val condition = And(GreaterThan(attrIntSkewHgm, + Literal(3)), LessThanOrEqual(attrIntSkewHgm, Literal(6))) + validateEstimatedStats( + Filter(condition, childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Seq(attrIntSkewHgm -> ColumnStat(distinctCount = 4, min = Some(3), max = Some(6), + nullCount = 0, avgLen = 4, maxLen = 4, histogram = Some(hgmIntSkew))), + expectedRowCount = 8) + } + + test("cintSkewHgm = 3 OR cintSkewHgm = 6") { + val condition = Or(EqualTo(attrIntSkewHgm, Literal(3)), EqualTo(attrIntSkewHgm, Literal(6))) + validateEstimatedStats( + Filter(condition, childStatsTestPlan(Seq(attrIntSkewHgm), 10L)), + Seq(attrIntSkewHgm -> colStatIntSkewHgm.copy(distinctCount = 2)), + expectedRowCount = 3) + } + private def childStatsTestPlan(outList: Seq[Attribute], tableRowCount: BigInt): StatsTestPlan = { StatsTestPlan( outputList = outList, From bc8933faf238bcf14e7976bd1ac1465dc32b8e2b Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 12 Dec 2017 17:02:04 +0900 Subject: [PATCH 088/356] [SPARK-3685][CORE] Prints explicit warnings when configured local directories are set to URIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? This PR proposes to print warnings before creating local by `java.io.File`. I think we can't just simply disallow and throw an exception for such cases of `hdfs:/tmp/foo` case because it might break compatibility. Note that `hdfs:/tmp/foo` creates a directory called `hdfs:/`. There were many discussion here about whether we should support this in other file systems or not; however, since the JIRA targets "Spark's local dir should accept only local paths", here I tried to resolve it by simply printing warnings. I think we could open another JIRA and design doc if this is something we should support, separately. Another note, for your information, [SPARK-1529](https://issues.apache.org/jira/browse/SPARK-1529) is resolved as `Won't Fix`. 
**Before** ``` ./bin/spark-shell --conf spark.local.dir=file:/a/b/c ``` This creates a local directory as below: ``` file:/ └── a └── b └── c ... ``` **After** ```bash ./bin/spark-shell --conf spark.local.dir=file:/a/b/c ``` Now, it prints a warning as below: ``` ... 17/12/09 21:58:49 WARN Utils: The configured local directories are not expected to be URIs; however, got suspicious values [file:/a/b/c]. Please check your configured local directories. ... ``` ```bash ./bin/spark-shell --conf spark.local.dir=file:/a/b/c,/tmp/a/b/c,hdfs:/a/b/c ``` It also works with comma-separated ones: ``` ... 17/12/09 22:05:01 WARN Utils: The configured local directories are not expected to be URIs; however, got suspicious values [file:/a/b/c, hdfs:/a/b/c]. Please check your configured local directories. ... ``` ## How was this patch tested? Manually tested: ``` ./bin/spark-shell --conf spark.local.dir=C:\\a\\b\\c ./bin/spark-shell --conf spark.local.dir=/tmp/a/b/c ./bin/spark-shell --conf spark.local.dir=a/b/c ./bin/spark-shell --conf spark.local.dir=a/b/c,/tmp/a/b/c,C:\\a\\b\\c ``` Author: hyukjinkwon Closes #19934 from HyukjinKwon/SPARK-3685. --- .../main/scala/org/apache/spark/util/Utils.scala | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 1ed09dc489440..fe5b4ea24440b 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -829,7 +829,18 @@ private[spark] object Utils extends Logging { } private def getOrCreateLocalRootDirsImpl(conf: SparkConf): Array[String] = { - getConfiguredLocalDirs(conf).flatMap { root => + val configuredLocalDirs = getConfiguredLocalDirs(conf) + val uris = configuredLocalDirs.filter { root => + // Here, we guess if the given value is a URI at its best - check if scheme is set. + Try(new URI(root).getScheme != null).getOrElse(false) + } + if (uris.nonEmpty) { + logWarning( + "The configured local directories are not expected to be URIs; however, got suspicious " + + s"values [${uris.mkString(", ")}]. Please check your configured local directories.") + } + + configuredLocalDirs.flatMap { root => try { val rootDir = new File(root) if (rootDir.exists || rootDir.mkdirs()) { From d5007734b22fbee5eccb1682eef13479780423ab Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 12 Dec 2017 10:07:18 -0800 Subject: [PATCH 089/356] [SPARK-16986][WEB-UI] Converter Started, Completed and Last Updated to client time zone in history page ## What changes were proposed in this pull request? This PR is converted the ` Started`, `Completed` and `Last Updated` to client local time in the history page. ## How was this patch tested? Manual tests for Chrome, Firefox and Safari #### Before modifying: before-webui #### After modifying: after Author: Yuming Wang Closes #19640 from wangyum/SPARK-16986. 
--- .../spark/ui/static/historypage-common.js | 11 ++++---- .../org/apache/spark/ui/static/historypage.js | 11 ++------ .../org/apache/spark/ui/static/utils.js | 28 +++++++++++++++++++ .../spark/deploy/history/HistoryPage.scala | 4 +++ .../deploy/history/HistoryServerSuite.scala | 2 +- 5 files changed, 42 insertions(+), 14 deletions(-) diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage-common.js b/core/src/main/resources/org/apache/spark/ui/static/historypage-common.js index 55d540d8317a0..4cfe46ec914ae 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage-common.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage-common.js @@ -16,9 +16,10 @@ */ $(document).ready(function() { - if ($('#last-updated').length) { - var lastUpdatedMillis = Number($('#last-updated').text()); - var updatedDate = new Date(lastUpdatedMillis); - $('#last-updated').text(updatedDate.toLocaleDateString()+", "+updatedDate.toLocaleTimeString()) - } + if ($('#last-updated').length) { + var lastUpdatedMillis = Number($('#last-updated').text()); + $('#last-updated').text(formatTimeMillis(lastUpdatedMillis)); + } + + $('#time-zone').text(getTimeZone()); }); diff --git a/core/src/main/resources/org/apache/spark/ui/static/historypage.js b/core/src/main/resources/org/apache/spark/ui/static/historypage.js index aa7e860372553..2cde66b081a1c 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/historypage.js +++ b/core/src/main/resources/org/apache/spark/ui/static/historypage.js @@ -37,11 +37,6 @@ function makeIdNumeric(id) { return resl; } -function formatDate(date) { - if (date <= 0) return "-"; - else return date.split(".")[0].replace("T", " "); -} - function getParameterByName(name, searchString) { var regex = new RegExp("[\\?&]" + name + "=([^&#]*)"), results = regex.exec(searchString); @@ -129,9 +124,9 @@ $(document).ready(function() { var num = app["attempts"].length; for (j in app["attempts"]) { var attempt = app["attempts"][j]; - attempt["startTime"] = formatDate(attempt["startTime"]); - attempt["endTime"] = formatDate(attempt["endTime"]); - attempt["lastUpdated"] = formatDate(attempt["lastUpdated"]); + attempt["startTime"] = formatTimeMillis(attempt["startTimeEpoch"]); + attempt["endTime"] = formatTimeMillis(attempt["endTimeEpoch"]); + attempt["lastUpdated"] = formatTimeMillis(attempt["lastUpdatedEpoch"]); attempt["log"] = uiRoot + "/api/v1/applications/" + id + "/" + (attempt.hasOwnProperty("attemptId") ? 
attempt["attemptId"] + "/" : "") + "logs"; attempt["durationMillisec"] = attempt["duration"]; diff --git a/core/src/main/resources/org/apache/spark/ui/static/utils.js b/core/src/main/resources/org/apache/spark/ui/static/utils.js index edc0ee2ce181d..4f63f6413d6de 100644 --- a/core/src/main/resources/org/apache/spark/ui/static/utils.js +++ b/core/src/main/resources/org/apache/spark/ui/static/utils.js @@ -46,3 +46,31 @@ function formatBytes(bytes, type) { var i = Math.floor(Math.log(bytes) / Math.log(k)); return parseFloat((bytes / Math.pow(k, i)).toFixed(dm)) + ' ' + sizes[i]; } + +function padZeroes(num) { + return ("0" + num).slice(-2); +} + +function formatTimeMillis(timeMillis) { + if (timeMillis <= 0) { + return "-"; + } else { + var dt = new Date(timeMillis); + return dt.getFullYear() + "-" + + padZeroes(dt.getMonth() + 1) + "-" + + padZeroes(dt.getDate()) + " " + + padZeroes(dt.getHours()) + ":" + + padZeroes(dt.getMinutes()) + ":" + + padZeroes(dt.getSeconds()); + } +} + +function getTimeZone() { + try { + return Intl.DateTimeFormat().resolvedOptions().timeZone; + } catch(ex) { + // Get time zone from a string representing the date, + // eg. "Thu Nov 16 2017 01:13:32 GMT+0800 (CST)" -> "CST" + return new Date().toString().match(/\((.*)\)/)[1]; + } +} diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala index d3dd58996a0bb..5d62a7d8bebb4 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryPage.scala @@ -57,6 +57,10 @@ private[history] class HistoryPage(parent: HistoryServer) extends WebUIPage("") } } + { +

+            Client local time zone: <span id="time-zone"></span>

+ } + { if (allAppsSize > 0) { ++ diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 4e4395d0fb959..3a9790cd57270 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -345,7 +345,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers .map(_.get) .filter(_.startsWith(url)).toList - // there are atleast some URL links that were generated via javascript, + // there are at least some URL links that were generated via javascript, // and they all contain the spark.ui.proxyBase (uiRoot) links.length should be > 4 all(links) should startWith(url + uiRoot) From e6dc5f280786bdb068abb7348b787324f76c0e4a Mon Sep 17 00:00:00 2001 From: Daniel van der Ende Date: Tue, 12 Dec 2017 10:41:37 -0800 Subject: [PATCH 090/356] [SPARK-22729][SQL] Add getTruncateQuery to JdbcDialect In order to enable truncate for PostgreSQL databases in Spark JDBC, a change is needed to the query used for truncating a PostgreSQL table. By default, PostgreSQL will automatically truncate any descendant tables if a TRUNCATE query is executed. As this may result in (unwanted) side-effects, the query used for the truncate should be specified separately for PostgreSQL, specifying only to TRUNCATE a single table. ## What changes were proposed in this pull request? Add `getTruncateQuery` function to `JdbcDialect.scala`, with default query. Overridden this function for PostgreSQL to only truncate a single table. Also sets `isCascadingTruncateTable` to false, as this will allow truncates for PostgreSQL. ## How was this patch tested? Existing tests all pass. Added test for `getTruncateQuery` Author: Daniel van der Ende Closes #19911 from danielvdende/SPARK-22717. --- .../datasources/jdbc/JdbcRelationProvider.scala | 2 +- .../execution/datasources/jdbc/JdbcUtils.scala | 7 ++++--- .../spark/sql/jdbc/AggregatedDialect.scala | 6 +++++- .../org/apache/spark/sql/jdbc/JdbcDialects.scala | 12 ++++++++++++ .../apache/spark/sql/jdbc/PostgresDialect.scala | 14 ++++++++++++-- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 16 ++++++++++++++++ 6 files changed, 50 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcRelationProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcRelationProvider.scala index 37e7bb0a59bb6..cc506e51bd0c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcRelationProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcRelationProvider.scala @@ -68,7 +68,7 @@ class JdbcRelationProvider extends CreatableRelationProvider case SaveMode.Overwrite => if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) { // In this case, we should truncate table and then load. 
- truncateTable(conn, options.table) + truncateTable(conn, options) val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala index bbc95df4d9dc4..e6dc2fda4eb1b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala @@ -96,12 +96,13 @@ object JdbcUtils extends Logging { } /** - * Truncates a table from the JDBC database. + * Truncates a table from the JDBC database without side effects. */ - def truncateTable(conn: Connection, table: String): Unit = { + def truncateTable(conn: Connection, options: JDBCOptions): Unit = { + val dialect = JdbcDialects.get(options.url) val statement = conn.createStatement try { - statement.executeUpdate(s"TRUNCATE TABLE $table") + statement.executeUpdate(dialect.getTruncateQuery(options.table)) } finally { statement.close() } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/AggregatedDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/AggregatedDialect.scala index f3bfea5f6bfc8..8b92c8b4f56b5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/AggregatedDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/AggregatedDialect.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.types.{DataType, MetadataBuilder} /** * AggregatedDialect can unify multiple dialects into one virtual Dialect. * Dialects are tried in order, and the first dialect that does not return a - * neutral element will will. + * neutral element will win. * * @param dialects List of dialects. */ @@ -63,4 +63,8 @@ private class AggregatedDialect(dialects: List[JdbcDialect]) extends JdbcDialect case _ => Some(false) } } + + override def getTruncateQuery(table: String): String = { + dialects.head.getTruncateQuery(table) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index 7c38ed68c0413..83d87a11810c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -116,6 +116,18 @@ abstract class JdbcDialect extends Serializable { s"SELECT * FROM $table WHERE 1=0" } + /** + * The SQL query that should be used to truncate a table. Dialects can override this method to + * return a query that is suitable for a particular database. For PostgreSQL, for instance, + * a different query is used to prevent "TRUNCATE" affecting other tables. + * @param table The name of the table. + * @return The SQL query to use for truncating a table + */ + @Since("2.3.0") + def getTruncateQuery(table: String): String = { + s"TRUNCATE TABLE $table" + } + /** * Override connection specific properties to run before a select is made. This is in place to * allow dialects that need special treatment to optimize behavior. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala index 4f61a328f47ca..13a2035f4d0c4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala @@ -85,6 +85,17 @@ private object PostgresDialect extends JdbcDialect { s"SELECT 1 FROM $table LIMIT 1" } + /** + * The SQL query used to truncate a table. For Postgres, the default behaviour is to + * also truncate any descendant tables. As this is a (possibly unwanted) side-effect, + * the Postgres dialect adds 'ONLY' to truncate only the table in question + * @param table The name of the table. + * @return The SQL query to use for truncating a table + */ + override def getTruncateQuery(table: String): String = { + s"TRUNCATE TABLE ONLY $table" + } + override def beforeFetch(connection: Connection, properties: Map[String, String]): Unit = { super.beforeFetch(connection, properties) @@ -97,8 +108,7 @@ private object PostgresDialect extends JdbcDialect { if (properties.getOrElse(JDBCOptions.JDBC_BATCH_FETCH_SIZE, "0").toInt > 0) { connection.setAutoCommit(false) } - } - override def isCascadingTruncateTable(): Option[Boolean] = Some(true) + override def isCascadingTruncateTable(): Option[Boolean] = Some(false) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index 0767ca1573a7f..cb2df0ac54f4c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -854,6 +854,22 @@ class JDBCSuite extends SparkFunSuite assert(derby.getTableExistsQuery(table) == defaultQuery) } + test("truncate table query by jdbc dialect") { + val MySQL = JdbcDialects.get("jdbc:mysql://127.0.0.1/db") + val Postgres = JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") + val db2 = JdbcDialects.get("jdbc:db2://127.0.0.1/db") + val h2 = JdbcDialects.get(url) + val derby = JdbcDialects.get("jdbc:derby:db") + val table = "weblogs" + val defaultQuery = s"TRUNCATE TABLE $table" + val postgresQuery = s"TRUNCATE TABLE ONLY $table" + assert(MySQL.getTruncateQuery(table) == defaultQuery) + assert(Postgres.getTruncateQuery(table) == postgresQuery) + assert(db2.getTruncateQuery(table) == defaultQuery) + assert(h2.getTruncateQuery(table) == defaultQuery) + assert(derby.getTruncateQuery(table) == defaultQuery) + } + test("Test DataFrame.where for Date and Timestamp") { // Regression test for bug SPARK-11788 val timestamp = java.sql.Timestamp.valueOf("2001-02-20 11:22:33.543543"); From 10c27a6559803797e89c28ced11c1087127b82eb Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Tue, 12 Dec 2017 11:27:01 -0800 Subject: [PATCH 091/356] [SPARK-22289][ML] Add JSON support for Matrix parameters (LR with coefficients bound) ## What changes were proposed in this pull request? jira: https://issues.apache.org/jira/browse/SPARK-22289 add JSON encoding/decoding for Param[Matrix]. The issue was reported by Nic Eggert during saving LR model with LowerBoundsOnCoefficients. There're two ways to resolve this as I see: 1. Support save/load on LogisticRegressionParams, and also adjust the save/load in LogisticRegression and LogisticRegressionModel. 2. Directly support Matrix in Param.jsonEncode, similar to what we have done for Vector. 
After some discussion in jira, we prefer the fix to support Matrix as a valid Param type, for simplicity and convenience for other classes. Note that in the implementation, I added a "class" field in the JSON object to match different JSON converters when loading, which is for preciseness and future extension. ## How was this patch tested? new unit test to cover the LR case and JsonMatrixConverter Author: Yuhao Yang Closes #19525 from hhbyyh/lrsave. --- .../org/apache/spark/ml/linalg/Matrices.scala | 7 ++ .../spark/ml/linalg/JsonMatrixConverter.scala | 79 +++++++++++++++++++ .../org/apache/spark/ml/param/params.scala | 36 +++++++-- .../LogisticRegressionSuite.scala | 11 +++ .../ml/linalg/JsonMatrixConverterSuite.scala | 45 +++++++++++ 5 files changed, 170 insertions(+), 8 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/linalg/JsonMatrixConverter.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/linalg/JsonMatrixConverterSuite.scala diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index 66c53624419d9..14428c6f45cce 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -476,6 +476,9 @@ class DenseMatrix @Since("2.0.0") ( @Since("2.0.0") object DenseMatrix { + private[ml] def unapply(dm: DenseMatrix): Option[(Int, Int, Array[Double], Boolean)] = + Some((dm.numRows, dm.numCols, dm.values, dm.isTransposed)) + /** * Generate a `DenseMatrix` consisting of zeros. * @param numRows number of rows of the matrix @@ -827,6 +830,10 @@ class SparseMatrix @Since("2.0.0") ( @Since("2.0.0") object SparseMatrix { + private[ml] def unapply( + sm: SparseMatrix): Option[(Int, Int, Array[Int], Array[Int], Array[Double], Boolean)] = + Some((sm.numRows, sm.numCols, sm.colPtrs, sm.rowIndices, sm.values, sm.isTransposed)) + /** * Generate a `SparseMatrix` from Coordinate List (COO) format. Input must be an array of * (i, j, value) tuples. Entries that have duplicate values of i and j are diff --git a/mllib/src/main/scala/org/apache/spark/ml/linalg/JsonMatrixConverter.scala b/mllib/src/main/scala/org/apache/spark/ml/linalg/JsonMatrixConverter.scala new file mode 100644 index 0000000000000..0bee643412b3f --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/linalg/JsonMatrixConverter.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.ml.linalg + +import org.json4s.DefaultFormats +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.{compact, parse => parseJson, render} + +private[ml] object JsonMatrixConverter { + + /** Unique class name for identifying JSON object encoded by this class. */ + val className = "matrix" + + /** + * Parses the JSON representation of a Matrix into a [[Matrix]]. + */ + def fromJson(json: String): Matrix = { + implicit val formats = DefaultFormats + val jValue = parseJson(json) + (jValue \ "type").extract[Int] match { + case 0 => // sparse + val numRows = (jValue \ "numRows").extract[Int] + val numCols = (jValue \ "numCols").extract[Int] + val colPtrs = (jValue \ "colPtrs").extract[Seq[Int]].toArray + val rowIndices = (jValue \ "rowIndices").extract[Seq[Int]].toArray + val values = (jValue \ "values").extract[Seq[Double]].toArray + val isTransposed = (jValue \ "isTransposed").extract[Boolean] + new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed) + case 1 => // dense + val numRows = (jValue \ "numRows").extract[Int] + val numCols = (jValue \ "numCols").extract[Int] + val values = (jValue \ "values").extract[Seq[Double]].toArray + val isTransposed = (jValue \ "isTransposed").extract[Boolean] + new DenseMatrix(numRows, numCols, values, isTransposed) + case _ => + throw new IllegalArgumentException(s"Cannot parse $json into a Matrix.") + } + } + + /** + * Coverts the Matrix to a JSON string. + */ + def toJson(m: Matrix): String = { + m match { + case SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed) => + val jValue = ("class" -> className) ~ + ("type" -> 0) ~ + ("numRows" -> numRows) ~ + ("numCols" -> numCols) ~ + ("colPtrs" -> colPtrs.toSeq) ~ + ("rowIndices" -> rowIndices.toSeq) ~ + ("values" -> values.toSeq) ~ + ("isTransposed" -> isTransposed) + compact(render(jValue)) + case DenseMatrix(numRows, numCols, values, isTransposed) => + val jValue = ("class" -> className) ~ + ("type" -> 1) ~ + ("numRows" -> numRows) ~ + ("numCols" -> numCols) ~ + ("values" -> values.toSeq) ~ + ("isTransposed" -> isTransposed) + compact(render(jValue)) + } + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index 8985f2af90a9a..1b4b401ac4aa0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -28,9 +28,9 @@ import scala.collection.mutable import org.json4s._ import org.json4s.jackson.JsonMethods._ +import org.apache.spark.SparkException import org.apache.spark.annotation.{DeveloperApi, Since} -import org.apache.spark.ml.linalg.JsonVectorConverter -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{JsonMatrixConverter, JsonVectorConverter, Matrix, Vector} import org.apache.spark.ml.util.Identifiable /** @@ -94,9 +94,11 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali compact(render(JString(x))) case v: Vector => JsonVectorConverter.toJson(v) + case m: Matrix => + JsonMatrixConverter.toJson(m) case _ => throw new NotImplementedError( - "The default jsonEncode only supports string and vector. " + + "The default jsonEncode only supports string, vector and matrix. " + s"${this.getClass.getName} must override jsonEncode for ${value.getClass.getName}.") } } @@ -122,17 +124,35 @@ private[ml] object Param { /** Decodes a param value from JSON. 
*/ def jsonDecode[T](json: String): T = { - parse(json) match { + val jValue = parse(json) + jValue match { case JString(x) => x.asInstanceOf[T] case JObject(v) => val keys = v.map(_._1) - assert(keys.contains("type") && keys.contains("values"), - s"Expect a JSON serialized vector but cannot find fields 'type' and 'values' in $json.") - JsonVectorConverter.fromJson(json).asInstanceOf[T] + if (keys.contains("class")) { + implicit val formats = DefaultFormats + val className = (jValue \ "class").extract[String] + className match { + case JsonMatrixConverter.className => + val checkFields = Array("numRows", "numCols", "values", "isTransposed", "type") + require(checkFields.forall(keys.contains), s"Expect a JSON serialized Matrix" + + s" but cannot find fields ${checkFields.mkString(", ")} in $json.") + JsonMatrixConverter.fromJson(json).asInstanceOf[T] + + case s => throw new SparkException(s"unrecognized class $s in $json") + } + } else { + // "class" info in JSON was added in Spark 2.3(SPARK-22289). JSON support for Vector was + // implemented before that and does not have "class" attribute. + require(keys.contains("type") && keys.contains("values"), s"Expect a JSON serialized" + + s" vector/matrix but cannot find fields 'type' and 'values' in $json.") + JsonVectorConverter.fromJson(json).asInstanceOf[T] + } + case _ => throw new NotImplementedError( - "The default jsonDecode only supports string and vector. " + + "The default jsonDecode only supports string, vector and matrix. " + s"${this.getClass.getName} must override jsonDecode to support its value type.") } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 14f550890d238..a5f81a38face9 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -2767,6 +2767,17 @@ class LogisticRegressionSuite val lr = new LogisticRegression() testEstimatorAndModelReadWrite(lr, smallBinaryDataset, LogisticRegressionSuite.allParamSettings, LogisticRegressionSuite.allParamSettings, checkModelData) + + // test lr with bounds on coefficients, need to set elasticNetParam to 0. + val numFeatures = smallBinaryDataset.select("features").head().getAs[Vector](0).size + val lowerBounds = new DenseMatrix(1, numFeatures, (1 to numFeatures).map(_ / 1000.0).toArray) + val upperBounds = new DenseMatrix(1, numFeatures, (1 to numFeatures).map(_ * 1000.0).toArray) + val paramSettings = Map("lowerBoundsOnCoefficients" -> lowerBounds, + "upperBoundsOnCoefficients" -> upperBounds, + "elasticNetParam" -> 0.0 + ) + testEstimatorAndModelReadWrite(lr, smallBinaryDataset, paramSettings, + paramSettings, checkModelData) } test("should support all NumericType labels and weights, and not support other types") { diff --git a/mllib/src/test/scala/org/apache/spark/ml/linalg/JsonMatrixConverterSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/linalg/JsonMatrixConverterSuite.scala new file mode 100644 index 0000000000000..4d83f945b4ba8 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/linalg/JsonMatrixConverterSuite.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.linalg + +import org.json4s.jackson.JsonMethods.parse + +import org.apache.spark.SparkFunSuite + +class JsonMatrixConverterSuite extends SparkFunSuite { + + test("toJson/fromJson") { + val denseMatrices = Seq( + Matrices.dense(0, 0, Array.empty[Double]), + Matrices.dense(1, 1, Array(0.1)), + new DenseMatrix(3, 2, Array(0.0, 1.21, 2.3, 9.8, 9.0, 0.0), true) + ) + + val sparseMatrices = denseMatrices.map(_.toSparse) ++ Seq( + Matrices.sparse(3, 2, Array(0, 1, 2), Array(1, 2), Array(3.1, 3.5)) + ) + + for (m <- sparseMatrices ++ denseMatrices) { + val json = JsonMatrixConverter.toJson(m) + parse(json) // `json` should be a valid JSON string + val u = JsonMatrixConverter.fromJson(json) + assert(u.getClass === m.getClass, "toJson/fromJson should preserve Matrix types.") + assert(u === m, "toJson/fromJson should preserve Matrix values.") + } + } +} From 7a51e71355485bb176a1387d99ec430c5986cbec Mon Sep 17 00:00:00 2001 From: German Schiavon Date: Tue, 12 Dec 2017 11:46:57 -0800 Subject: [PATCH 092/356] [SPARK-22574][MESOS][SUBMIT] Check submission request parameters ## What changes were proposed in this pull request? It solves the problem when submitting a wrong CreateSubmissionRequest to Spark Dispatcher was causing a bad state of Dispatcher and making it inactive as a Mesos framework. https://issues.apache.org/jira/browse/SPARK-22574 ## How was this patch tested? All spark test passed successfully. It was tested sending a wrong request (without appArgs) before and after the change. The point is easy, check if the value is null before being accessed. 
This was before the change, leaving the dispatcher inactive: ``` Exception in thread "Thread-22" java.lang.NullPointerException at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler.getDriverCommandValue(MesosClusterScheduler.scala:444) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler.buildDriverCommand(MesosClusterScheduler.scala:451) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler.org$apache$spark$scheduler$cluster$mesos$MesosClusterScheduler$$createTaskInfo(MesosClusterScheduler.scala:538) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler$$anonfun$scheduleTasks$1.apply(MesosClusterScheduler.scala:570) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler$$anonfun$scheduleTasks$1.apply(MesosClusterScheduler.scala:555) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler.scheduleTasks(MesosClusterScheduler.scala:555) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler.resourceOffers(MesosClusterScheduler.scala:621) ``` And after: ``` "message" : "Malformed request: org.apache.spark.deploy.rest.SubmitRestProtocolException: Validation of message CreateSubmissionRequest failed!\n\torg.apache.spark.deploy.rest.SubmitRestProtocolMessage.validate(SubmitRestProtocolMessage.scala:70)\n\torg.apache.spark.deploy.rest.SubmitRequestServlet.doPost(RestSubmissionServer.scala:272)\n\tjavax.servlet.http.HttpServlet.service(HttpServlet.java:707)\n\tjavax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\torg.spark_project.jetty.servlet.ServletHolder.handle(ServletHolder.java:845)\n\torg.spark_project.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:583)\n\torg.spark_project.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1180)\n\torg.spark_project.jetty.servlet.ServletHandler.doScope(ServletHandler.java:511)\n\torg.spark_project.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1112)\n\torg.spark_project.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)\n\torg.spark_project.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:134)\n\torg.spark_project.jetty.server.Server.handle(Server.java:524)\n\torg.spark_project.jetty.server.HttpChannel.handle(HttpChannel.java:319)\n\torg.spark_project.jetty.server.HttpConnection.onFillable(HttpConnection.java:253)\n\torg.spark_project.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:273)\n\torg.spark_project.jetty.io.FillInterest.fillable(FillInterest.java:95)\n\torg.spark_project.jetty.io.SelectChannelEndPoint$2.run(SelectChannelEndPoint.java:93)\n\torg.spark_project.jetty.util.thread.strategy.ExecuteProduceConsume.executeProduceConsume(ExecuteProduceConsume.java:303)\n\torg.spark_project.jetty.util.thread.strategy.ExecuteProduceConsume.produceConsume(ExecuteProduceConsume.java:148)\n\torg.spark_project.jetty.util.thread.strategy.ExecuteProduceConsume.run(ExecuteProduceConsume.java:136)\n\torg.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:671)\n\torg.spark_project.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:589)\n\tjava.lang.Thread.run(Thread.java:745)" ``` Author: German Schiavon Closes #19793 from Gschiavon/fix-submission-request. 
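The validation pattern in the diff below amounts to wrapping each nullable request field in an `Option` and failing fast with a descriptive message before the dispatcher ever schedules the driver. A minimal, self-contained sketch of that idea (the `Request` type, exception class and helper names here are illustrative stand-ins, not the actual REST protocol classes):

```scala
// Illustrative stand-ins only; the real classes in the diff below are
// CreateSubmissionRequest and SubmitRestMissingFieldException.
case class Request(
    appResource: String,
    appArgs: Array[String],
    environmentVariables: Map[String, String])

class MissingFieldException(msg: String) extends Exception(msg)

object RequestValidator {
  // Fail fast with a clear message instead of letting a null propagate into
  // the scheduler and leave the dispatcher in a bad state.
  private def required[T](value: T, name: String): T =
    Option(value).getOrElse(throw new MissingFieldException(s"Field '$name' is missing."))

  def validate(req: Request): Unit = {
    required(req.appResource, "appResource")
    required(req.appArgs, "appArgs")
    required(req.environmentVariables, "environmentVariables")
  }
}
```

The actual change below applies the same shape in `CreateSubmissionRequest.doValidate()` and `MesosSubmitRequestServlet.buildDriverDescription()`.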
--- .../deploy/rest/SubmitRestProtocolRequest.scala | 2 ++ .../spark/deploy/rest/SubmitRestProtocolSuite.scala | 2 ++ .../spark/deploy/rest/mesos/MesosRestServer.scala | 12 ++++++++---- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala index 0d50a768942ed..86ddf954ca128 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala @@ -46,6 +46,8 @@ private[rest] class CreateSubmissionRequest extends SubmitRestProtocolRequest { super.doValidate() assert(sparkProperties != null, "No Spark properties set!") assertFieldIsSet(appResource, "appResource") + assertFieldIsSet(appArgs, "appArgs") + assertFieldIsSet(environmentVariables, "environmentVariables") assertPropertyIsSet("spark.app.name") assertPropertyIsBoolean("spark.driver.supervise") assertPropertyIsNumeric("spark.driver.cores") diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala index 725b8848bc052..75c50af23c66a 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala @@ -86,6 +86,8 @@ class SubmitRestProtocolSuite extends SparkFunSuite { message.clientSparkVersion = "1.2.3" message.appResource = "honey-walnut-cherry.jar" message.mainClass = "org.apache.spark.examples.SparkPie" + message.appArgs = Array("two slices") + message.environmentVariables = Map("PATH" -> "/dev/null") val conf = new SparkConf(false) conf.set("spark.app.name", "SparkPie") message.sparkProperties = conf.getAll.toMap diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala index ff60b88c6d533..bb8dfee165183 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala @@ -77,10 +77,16 @@ private[mesos] class MesosSubmitRequestServlet( private def buildDriverDescription(request: CreateSubmissionRequest): MesosDriverDescription = { // Required fields, including the main class because python is not yet supported val appResource = Option(request.appResource).getOrElse { - throw new SubmitRestMissingFieldException("Application jar is missing.") + throw new SubmitRestMissingFieldException("Application jar 'appResource' is missing.") } val mainClass = Option(request.mainClass).getOrElse { - throw new SubmitRestMissingFieldException("Main class is missing.") + throw new SubmitRestMissingFieldException("Main class 'mainClass' is missing.") + } + val appArgs = Option(request.appArgs).getOrElse { + throw new SubmitRestMissingFieldException("Application arguments 'appArgs' are missing.") + } + val environmentVariables = Option(request.environmentVariables).getOrElse { + throw new SubmitRestMissingFieldException("Environment variables 'environmentVariables' are missing.") } // Optional fields @@ -91,8 +97,6 @@ private[mesos] class MesosSubmitRequestServlet( val superviseDriver = sparkProperties.get("spark.driver.supervise") val driverMemory = 
sparkProperties.get("spark.driver.memory") val driverCores = sparkProperties.get("spark.driver.cores") - val appArgs = request.appArgs - val environmentVariables = request.environmentVariables val name = request.sparkProperties.getOrElse("spark.app.name", mainClass) // Construct driver description From 704af4bd67d981a2efb5eb00110e19c5f7e7c924 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 12 Dec 2017 13:40:01 -0800 Subject: [PATCH 093/356] Revert "[SPARK-22574][MESOS][SUBMIT] Check submission request parameters" This reverts commit 7a51e71355485bb176a1387d99ec430c5986cbec. --- .../deploy/rest/SubmitRestProtocolRequest.scala | 2 -- .../spark/deploy/rest/SubmitRestProtocolSuite.scala | 2 -- .../spark/deploy/rest/mesos/MesosRestServer.scala | 12 ++++-------- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala index 86ddf954ca128..0d50a768942ed 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala @@ -46,8 +46,6 @@ private[rest] class CreateSubmissionRequest extends SubmitRestProtocolRequest { super.doValidate() assert(sparkProperties != null, "No Spark properties set!") assertFieldIsSet(appResource, "appResource") - assertFieldIsSet(appArgs, "appArgs") - assertFieldIsSet(environmentVariables, "environmentVariables") assertPropertyIsSet("spark.app.name") assertPropertyIsBoolean("spark.driver.supervise") assertPropertyIsNumeric("spark.driver.cores") diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala index 75c50af23c66a..725b8848bc052 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala @@ -86,8 +86,6 @@ class SubmitRestProtocolSuite extends SparkFunSuite { message.clientSparkVersion = "1.2.3" message.appResource = "honey-walnut-cherry.jar" message.mainClass = "org.apache.spark.examples.SparkPie" - message.appArgs = Array("two slices") - message.environmentVariables = Map("PATH" -> "/dev/null") val conf = new SparkConf(false) conf.set("spark.app.name", "SparkPie") message.sparkProperties = conf.getAll.toMap diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala index bb8dfee165183..ff60b88c6d533 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala @@ -77,16 +77,10 @@ private[mesos] class MesosSubmitRequestServlet( private def buildDriverDescription(request: CreateSubmissionRequest): MesosDriverDescription = { // Required fields, including the main class because python is not yet supported val appResource = Option(request.appResource).getOrElse { - throw new SubmitRestMissingFieldException("Application jar 'appResource' is missing.") + throw new SubmitRestMissingFieldException("Application jar is missing.") } val mainClass = Option(request.mainClass).getOrElse { - throw new SubmitRestMissingFieldException("Main class 'mainClass' is missing.") - } - 
val appArgs = Option(request.appArgs).getOrElse { - throw new SubmitRestMissingFieldException("Application arguments 'appArgs' are missing.") - } - val environmentVariables = Option(request.environmentVariables).getOrElse { - throw new SubmitRestMissingFieldException("Environment variables 'environmentVariables' are missing.") + throw new SubmitRestMissingFieldException("Main class is missing.") } // Optional fields @@ -97,6 +91,8 @@ private[mesos] class MesosSubmitRequestServlet( val superviseDriver = sparkProperties.get("spark.driver.supervise") val driverMemory = sparkProperties.get("spark.driver.memory") val driverCores = sparkProperties.get("spark.driver.cores") + val appArgs = request.appArgs + val environmentVariables = request.environmentVariables val name = request.sparkProperties.getOrElse("spark.app.name", mainClass) // Construct driver description From 17cdabb88761e67ca555299109f89afdf02a4280 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 13 Dec 2017 07:42:24 +0900 Subject: [PATCH 094/356] [SPARK-19809][SQL][TEST] NullPointerException on zero-size ORC file ## What changes were proposed in this pull request? Until 2.2.1, Spark raises `NullPointerException` on zero-size ORC files. Usually, these zero-size ORC files are generated by 3rd-party apps like Flume. ```scala scala> sql("create table empty_orc(a int) stored as orc location '/tmp/empty_orc'") $ touch /tmp/empty_orc/zero.orc scala> sql("select * from empty_orc").show java.lang.RuntimeException: serious problem at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.generateSplitsInfo(OrcInputFormat.java:1021) ... Caused by: java.lang.NullPointerException at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$BISplitStrategy.getSplits(OrcInputFormat.java:560) ``` After [SPARK-22279](https://github.com/apache/spark/pull/19499), Apache Spark with the default configuration doesn't have this bug. Although Hive 1.2.1 library code path still has the problem, we had better have a test coverage on what we have now in order to prevent future regression on it. ## How was this patch tested? Pass a newly added test case. Author: Dongjoon Hyun Closes #19948 from dongjoon-hyun/SPARK-19809-EMPTY-FILE. 
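For readers who want to reproduce the scenario outside the test suite, a rough spark-shell sketch based on the reproduction above (the path and table name are the ones from the description; `spark.sql.orc.impl` selects between the native and Hive ORC readers):

```scala
// Rough reproduction sketch (assumes a Hive-enabled SparkSession, e.g. a
// spark-shell built with -Phive); with the default configuration the query
// should simply return an empty result instead of throwing NullPointerException.
import java.io.File

spark.sql("SET spark.sql.orc.impl=native")
spark.sql("CREATE TABLE empty_orc(a INT) STORED AS ORC LOCATION '/tmp/empty_orc'")
new File("/tmp/empty_orc/zero.orc").createNewFile()   // zero-size file, e.g. left behind by Flume
spark.sql("SELECT * FROM empty_orc").show()           // expected: empty result, no NPE
```

The new test below does the same thing programmatically for both the `native` and `hive` values of `spark.sql.orc.impl`.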
--- .../sql/hive/execution/SQLQuerySuite.scala | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index f2562c33e2a6e..93c91d3fcb727 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2172,4 +2172,21 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } } + + test("SPARK-19809 NullPointerException on zero-size ORC file") { + Seq("native", "hive").foreach { orcImpl => + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> orcImpl) { + withTempPath { dir => + withTable("spark_19809") { + sql(s"CREATE TABLE spark_19809(a int) STORED AS ORC LOCATION '$dir'") + Files.touch(new File(s"${dir.getCanonicalPath}", "zero.orc")) + + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { // default since 2.3.0 + checkAnswer(sql("SELECT * FROM spark_19809"), Seq.empty) + } + } + } + } + } + } } From b03af8b582b9b71b09eaf3a1c01d1b3ef5f072e8 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Tue, 12 Dec 2017 17:37:01 -0800 Subject: [PATCH 095/356] [SPARK-21087][ML][FOLLOWUP] Sync SharedParamsCodeGen and sharedParams. ## What changes were proposed in this pull request? #19208 modified ```sharedParams.scala```, but didn't generated by ```SharedParamsCodeGen.scala```. This involves mismatch between them. ## How was this patch tested? Existing test. Author: Yanbo Liang Closes #19958 from yanboliang/spark-21087. --- .../spark/ml/param/shared/SharedParamsCodeGen.scala | 8 ++++---- .../apache/spark/ml/param/shared/sharedParams.scala | 10 ++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index c54062921fce6..a267bbc874322 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -84,10 +84,10 @@ private[shared] object SharedParamsCodeGen { ParamDesc[String]("solver", "the solver algorithm for optimization", finalFields = false), ParamDesc[Int]("aggregationDepth", "suggested depth for treeAggregate (>= 2)", Some("2"), isValid = "ParamValidators.gtEq(2)", isExpertParam = true), - ParamDesc[Boolean]("collectSubModels", "If set to false, then only the single best " + - "sub-model will be available after fitting. If set to true, then all sub-models will be " + - "available. Warning: For large models, collecting all sub-models can cause OOMs on the " + - "Spark driver.", + ParamDesc[Boolean]("collectSubModels", "whether to collect a list of sub-models trained " + + "during tuning. If set to false, then only the single best sub-model will be available " + + "after fitting. If set to true, then all sub-models will be available. 
Warning: For " + + "large models, collecting all sub-models can cause OOMs on the Spark driver", Some("false"), isExpertParam = true) ) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 34aa38ac751fd..0004f085d10f2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -470,15 +470,17 @@ trait HasAggregationDepth extends Params { } /** - * Trait for shared param collectSubModels (default: false). + * Trait for shared param collectSubModels (default: false). This trait may be changed or + * removed between minor versions. */ -private[ml] trait HasCollectSubModels extends Params { +@DeveloperApi +trait HasCollectSubModels extends Params { /** - * Param for whether to collect a list of sub-models trained during tuning. + * Param for whether to collect a list of sub-models trained during tuning. If set to false, then only the single best sub-model will be available after fitting. If set to true, then all sub-models will be available. Warning: For large models, collecting all sub-models can cause OOMs on the Spark driver. * @group expertParam */ - final val collectSubModels: BooleanParam = new BooleanParam(this, "collectSubModels", "whether to collect a list of sub-models trained during tuning") + final val collectSubModels: BooleanParam = new BooleanParam(this, "collectSubModels", "whether to collect a list of sub-models trained during tuning. If set to false, then only the single best sub-model will be available after fitting. If set to true, then all sub-models will be available. Warning: For large models, collecting all sub-models can cause OOMs on the Spark driver") setDefault(collectSubModels, false) From 4117786a87f9d7631dec58a8e7aef09403b20a27 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 13 Dec 2017 10:29:14 +0800 Subject: [PATCH 096/356] [SPARK-22716][SQL] Avoid the creation of mutable states in addReferenceObj ## What changes were proposed in this pull request? We have two methods to reference an object `addReferenceMinorObj` and `addReferenceObj `. The latter creates a new global variable, which means new entries in the constant pool. The PR unifies the two method in a single `addReferenceObj` which returns the code to access the object in the `references` array and doesn't add new mutable states. ## How was this patch tested? added UTs. Author: Marco Gaido Closes #19916 from mgaido91/SPARK-22716. 
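The mechanics of the change are easy to see in isolation: objects go into a shared `references` array and callers get back a Java expression that indexes into it, so no per-object class field (and no extra constant-pool entry) is generated. A stripped-down model of the idea follows; `MiniCodegenContext` is a stand-in for illustration, not the real `CodegenContext`:

```scala
import scala.collection.mutable.ArrayBuffer

// Stand-in for the codegen context, for illustration only.
class MiniCodegenContext {
  val references = new ArrayBuffer[Any]()

  // Register `obj` and return the Java source snippet that reads it back
  // from the references array in the generated code.
  def addReferenceObj(objName: String, obj: Any, className: String = null): String = {
    val idx = references.length
    references += obj
    val clsName = Option(className).getOrElse(obj.getClass.getName)
    s"(($clsName) references[$idx] /* $objName */)"
  }
}

object MiniCodegenContextDemo extends App {
  val ctx = new MiniCodegenContext
  println(ctx.addReferenceObj("timeZone", java.util.TimeZone.getDefault))
  // prints something like: ((sun.util.calendar.ZoneInfo) references[0] /* timeZone */)
}
```

Because the accessor is just an indexed lookup, the new test below only has to assert that `ctx.mutableStates` stays empty after `addReferenceObj` is called.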
--- .../spark/sql/catalyst/expressions/Cast.scala | 8 +++---- .../sql/catalyst/expressions/ScalaUDF.scala | 2 +- .../expressions/codegen/CodeGenerator.scala | 22 ++++--------------- .../expressions/datetimeExpressions.scala | 20 ++++++++--------- .../sql/catalyst/expressions/literals.scala | 9 ++++---- .../spark/sql/catalyst/expressions/misc.scala | 2 +- .../expressions/objects/objects.scala | 8 +++---- .../expressions/CodeGenerationSuite.scala | 7 ++++++ .../expressions/MiscExpressionsSuite.scala | 1 - .../catalyst/expressions/ScalaUDFSuite.scala | 3 ++- .../aggregate/RowBasedHashMapGenerator.scala | 4 ++-- .../VectorizedHashMapGenerator.scala | 4 ++-- 12 files changed, 42 insertions(+), 48 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index b8d3661a00abc..5279d41278967 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -605,7 +605,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String (c, evPrim, evNull) => s"""$evPrim = UTF8String.fromString( org.apache.spark.sql.catalyst.util.DateTimeUtils.dateToString($c));""" case TimestampType => - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) (c, evPrim, evNull) => s"""$evPrim = UTF8String.fromString( org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampToString($c, $tz));""" case _ => @@ -633,7 +633,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String } """ case TimestampType => - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) (c, evPrim, evNull) => s"$evPrim = org.apache.spark.sql.catalyst.util.DateTimeUtils.millisToDays($c / 1000L, $tz);" case _ => @@ -713,7 +713,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String from: DataType, ctx: CodegenContext): CastFunction = from match { case StringType => - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) val longOpt = ctx.freshName("longOpt") (c, evPrim, evNull) => s""" @@ -730,7 +730,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String case _: IntegralType => (c, evPrim, evNull) => s"$evPrim = ${longToTimeStampCode(c)};" case DateType => - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) (c, evPrim, evNull) => s"$evPrim = org.apache.spark.sql.catalyst.util.DateTimeUtils.daysToMillis($c, $tz) * 1000;" case DecimalType() => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala index 4d26d9819321b..a3cf7612016b5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala @@ -1000,7 +1000,7 @@ case class ScalaUDF( ctx: CodegenContext, ev: ExprCode): ExprCode = { val scalaUDF = ctx.freshName("scalaUDF") - val scalaUDFRef = ctx.addReferenceMinorObj(this, scalaUDFClassName) + val scalaUDFRef = ctx.addReferenceObj("scalaUDFRef", this, scalaUDFClassName) // Object to convert the returned value of user-defined functions to Catalyst type val 
catalystConverterTerm = ctx.freshName("catalystConverter") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 5c9e604a8d293..4b8b16ff71ecb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -109,28 +109,14 @@ class CodegenContext { * * Returns the code to access it. * - * This is for minor objects not to store the object into field but refer it from the references - * field at the time of use because number of fields in class is limited so we should reduce it. + * This does not to store the object into field but refer it from the references field at the + * time of use because number of fields in class is limited so we should reduce it. */ - def addReferenceMinorObj(obj: Any, className: String = null): String = { + def addReferenceObj(objName: String, obj: Any, className: String = null): String = { val idx = references.length references += obj val clsName = Option(className).getOrElse(obj.getClass.getName) - s"(($clsName) references[$idx])" - } - - /** - * Add an object to `references`, create a class member to access it. - * - * Returns the name of class member. - */ - def addReferenceObj(name: String, obj: Any, className: String = null): String = { - val term = freshName(name) - val idx = references.length - references += obj - val clsName = Option(className).getOrElse(obj.getClass.getName) - addMutableState(clsName, term, s"$term = ($clsName) references[$idx];") - term + s"(($clsName) references[$idx] /* $objName */)" } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index eaf8788888211..44d54a20844a3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -226,7 +226,7 @@ case class Hour(child: Expression, timeZoneId: Option[String] = None) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") defineCodeGen(ctx, ev, c => s"$dtu.getHours($c, $tz)") } @@ -257,7 +257,7 @@ case class Minute(child: Expression, timeZoneId: Option[String] = None) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") defineCodeGen(ctx, ev, c => s"$dtu.getMinutes($c, $tz)") } @@ -288,7 +288,7 @@ case class Second(child: Expression, timeZoneId: Option[String] = None) } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") defineCodeGen(ctx, ev, c => s"$dtu.getSeconds($c, $tz)") } @@ -529,7 +529,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti override def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode = { val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) defineCodeGen(ctx, ev, (timestamp, format) => { s"""UTF8String.fromString($dtu.newDateFormat($format.toString(), $tz) .format(new java.util.Date($timestamp / 1000)))""" @@ -691,7 +691,7 @@ abstract class UnixTime }""") } case StringType => - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") nullSafeCodeGen(ctx, ev, (string, format) => { s""" @@ -715,7 +715,7 @@ abstract class UnixTime ${ev.value} = ${eval1.value} / 1000000L; }""") case DateType => - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") val eval1 = left.genCode(ctx) ev.copy(code = s""" @@ -827,7 +827,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[ }""") } } else { - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") nullSafeCodeGen(ctx, ev, (seconds, f) => { s""" @@ -969,7 +969,7 @@ case class TimeAdd(start: Expression, interval: Expression, timeZoneId: Option[S } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") defineCodeGen(ctx, ev, (sd, i) => { s"""$dtu.timestampAddInterval($sd, $i.months, $i.microseconds, $tz)""" @@ -1065,7 +1065,7 @@ case class TimeSub(start: Expression, interval: Expression, timeZoneId: Option[S } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") defineCodeGen(ctx, ev, (sd, i) => { s"""$dtu.timestampAddInterval($sd, 0 - $i.months, 0 - $i.microseconds, $tz)""" @@ -1143,7 +1143,7 @@ case class MonthsBetween(date1: Expression, date2: Expression, timeZoneId: Optio } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tz = ctx.addReferenceMinorObj(timeZone) + val tz = ctx.addReferenceObj("timeZone", timeZone) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") defineCodeGen(ctx, ev, (l, r) => { s"""$dtu.monthsBetween($l, $r, $tz)""" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index eaeaf08c37b4e..383203a209833 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -290,7 +290,7 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { case FloatType => val v = value.asInstanceOf[Float] if (v.isNaN || v.isInfinite) { - val boxedValue = ctx.addReferenceMinorObj(v) + val boxedValue = ctx.addReferenceObj("boxedValue", v) val code = s"final $javaType ${ev.value} = ($javaType) $boxedValue;" ev.copy(code = code) } else { @@ -299,7 +299,7 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { case DoubleType => val v = value.asInstanceOf[Double] if (v.isNaN || v.isInfinite) { - val boxedValue = 
ctx.addReferenceMinorObj(v) + val boxedValue = ctx.addReferenceObj("boxedValue", v) val code = s"final $javaType ${ev.value} = ($javaType) $boxedValue;" ev.copy(code = code) } else { @@ -309,8 +309,9 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression { ev.copy(code = "", value = s"($javaType)$value") case TimestampType | LongType => ev.copy(code = "", value = s"${value}L") - case other => - ev.copy(code = "", value = ctx.addReferenceMinorObj(value, ctx.javaType(dataType))) + case _ => + ev.copy(code = "", value = ctx.addReferenceObj("literal", value, + ctx.javaType(dataType))) } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala index b86e271fe2958..4b9006ab5b423 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/misc.scala @@ -81,7 +81,7 @@ case class AssertTrue(child: Expression) extends UnaryExpression with ImplicitCa // Use unnamed reference that doesn't create a local field here to reduce the number of fields // because errMsgField is used only when the value is null or false. - val errMsgField = ctx.addReferenceMinorObj(errMsg) + val errMsgField = ctx.addReferenceObj("errMsg", errMsg) ExprCode(code = s"""${eval.code} |if (${eval.isNull} || !${eval.value}) { | throw new RuntimeException($errMsgField); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 349afece84d5c..4bd395eadcf19 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -1123,7 +1123,7 @@ case class CreateExternalRow(children: Seq[Expression], schema: StructType) expressions = childrenCodes, funcName = "createExternalRow", extraArguments = "Object[]" -> values :: Nil) - val schemaField = ctx.addReferenceMinorObj(schema) + val schemaField = ctx.addReferenceObj("schema", schema) val code = s""" @@ -1310,7 +1310,7 @@ case class AssertNotNull(child: Expression, walkedTypePath: Seq[String] = Nil) // Use unnamed reference that doesn't create a local field here to reduce the number of fields // because errMsgField is used only when the value is null. - val errMsgField = ctx.addReferenceMinorObj(errMsg) + val errMsgField = ctx.addReferenceObj("errMsg", errMsg) val code = s""" ${childGen.code} @@ -1347,7 +1347,7 @@ case class GetExternalRowField( override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // Use unnamed reference that doesn't create a local field here to reduce the number of fields // because errMsgField is used only when the field is null. - val errMsgField = ctx.addReferenceMinorObj(errMsg) + val errMsgField = ctx.addReferenceObj("errMsg", errMsg) val row = child.genCode(ctx) val code = s""" ${row.code} @@ -1387,7 +1387,7 @@ case class ValidateExternalType(child: Expression, expected: DataType) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // Use unnamed reference that doesn't create a local field here to reduce the number of fields // because errMsgField is used only when the type doesn't match. 
- val errMsgField = ctx.addReferenceMinorObj(errMsg) + val errMsgField = ctx.addReferenceObj("errMsg", errMsg) val input = child.genCode(ctx) val obj = input.value diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 40bf29bb3b573..a969811019161 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -394,4 +394,11 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { Map("add" -> Literal(1))).genCode(ctx) assert(ctx.mutableStates.isEmpty) } + + test("SPARK-22716: addReferenceObj should not add mutable states") { + val ctx = new CodegenContext + val foo = new Object() + ctx.addReferenceObj("foo", foo) + assert(ctx.mutableStates.isEmpty) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala index 4fe7b436982b1..facc863081303 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MiscExpressionsSuite.scala @@ -43,5 +43,4 @@ class MiscExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Length(Uuid()), 36) assert(evaluate(Uuid()) !== evaluate(Uuid())) } - } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala index 70dea4b39d55d..b0687fe900d3d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala @@ -51,6 +51,7 @@ class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-22695: ScalaUDF should not use global variables") { val ctx = new CodegenContext ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil).genCode(ctx) - assert(ctx.mutableStates.isEmpty) + // ScalaUDF can be very verbose and trigger reduceCodeSize + assert(ctx.mutableStates.forall(_._2.startsWith("globalIsNull"))) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala index 3718424931b40..fd25707dd4ca6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/RowBasedHashMapGenerator.scala @@ -47,7 +47,7 @@ class RowBasedHashMapGenerator( val generatedKeySchema: String = s"new org.apache.spark.sql.types.StructType()" + groupingKeySchema.map { key => - val keyName = ctx.addReferenceMinorObj(key.name) + val keyName = ctx.addReferenceObj("keyName", key.name) key.dataType match { case d: DecimalType => s""".add($keyName, org.apache.spark.sql.types.DataTypes.createDecimalType( @@ -60,7 +60,7 @@ class RowBasedHashMapGenerator( val generatedValueSchema: String = s"new org.apache.spark.sql.types.StructType()" + bufferSchema.map { key => - val keyName = ctx.addReferenceMinorObj(key.name) + val 
keyName = ctx.addReferenceObj("keyName", key.name) key.dataType match { case d: DecimalType => s""".add($keyName, org.apache.spark.sql.types.DataTypes.createDecimalType( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala index f04cd48072f17..0380ee8b09d63 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala @@ -54,7 +54,7 @@ class VectorizedHashMapGenerator( val generatedSchema: String = s"new org.apache.spark.sql.types.StructType()" + (groupingKeySchema ++ bufferSchema).map { key => - val keyName = ctx.addReferenceMinorObj(key.name) + val keyName = ctx.addReferenceObj("keyName", key.name) key.dataType match { case d: DecimalType => s""".add($keyName, org.apache.spark.sql.types.DataTypes.createDecimalType( @@ -67,7 +67,7 @@ class VectorizedHashMapGenerator( val generatedAggBufferSchema: String = s"new org.apache.spark.sql.types.StructType()" + bufferSchema.map { key => - val keyName = ctx.addReferenceMinorObj(key.name) + val keyName = ctx.addReferenceObj("keyName", key.name) key.dataType match { case d: DecimalType => s""".add($keyName, org.apache.spark.sql.types.DataTypes.createDecimalType( From c7d0148615c921dca782ee3785b5d0cd59e42262 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 13 Dec 2017 10:40:05 +0800 Subject: [PATCH 097/356] [SPARK-22600][SQL] Fix 64kb limit for deeply nested expressions under wholestage codegen ## What changes were proposed in this pull request? SPARK-22543 fixes the 64kb compile error for deeply nested expressions for non-wholestage codegen. This PR extends it to support wholestage codegen. This patch adds some util methods to extract the necessary parameters for an expression when it is split into a function. The util methods are put in object `ExpressionCodegen` under `codegen`. The main entry point is `getExpressionInputParams`, which returns all necessary parameters to evaluate the given expression in a split function. These util methods can also be used to split expressions themselves; that is left as a TODO item. ## How was this patch tested? Added test. Author: Liang-Chi Hsieh Closes #19813 from viirya/reduce-expr-code-for-wholestage. 
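As an aside on the parameter check introduced below: the JVM limits a method descriptor to 255 parameter slots, where `this` contributes one slot and a parameter of type `long` or `double` contributes two; splits whose parameters would exceed this limit are rejected (`getExpressionInputParams` returns `None`) and the expression is left unsplit. The following standalone Scala sketch (illustrative only, not part of this patch; all names are made up) shows that counting rule:

```scala
// Illustrative sketch only (not part of this patch): the JVM method descriptor rule that
// getParamLength implements. `this` takes one slot; long/double parameters take two; others one.
object ParamSlotSketch {
  sealed trait JvmType
  case object JvmLong extends JvmType
  case object JvmDouble extends JvmType
  case object JvmOther extends JvmType

  // Total descriptor length for an instance method with the given parameter types.
  def paramLength(params: Seq[JvmType]): Int =
    1 + params.map {
      case JvmLong | JvmDouble => 2
      case JvmOther => 1
    }.sum

  def main(args: Array[String]): Unit = {
    val small = paramLength(Seq.fill(120)(JvmOther))   // 121 slots: under the 255 limit
    val large = paramLength(Seq.fill(130)(JvmDouble))  // 261 slots: exceeds the 255 limit
    println(s"small fits: ${small <= 255}, large fits: ${large <= 255}")
  }
}
```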
--- .../sql/catalyst/expressions/Expression.scala | 37 ++- .../expressions/codegen/CodeGenerator.scala | 44 ++- .../codegen/ExpressionCodegen.scala | 269 ++++++++++++++++++ .../codegen/ExpressionCodegenSuite.scala | 220 ++++++++++++++ .../sql/execution/ColumnarBatchScan.scala | 5 +- .../execution/WholeStageCodegenSuite.scala | 23 +- 6 files changed, 585 insertions(+), 13 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegenSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 743782a6453e9..329ea5d421509 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -105,6 +105,12 @@ abstract class Expression extends TreeNode[Expression] { val isNull = ctx.freshName("isNull") val value = ctx.freshName("value") val eval = doGenCode(ctx, ExprCode("", isNull, value)) + eval.isNull = if (this.nullable) eval.isNull else "false" + + // Records current input row and variables of this expression. + eval.inputRow = ctx.INPUT_ROW + eval.inputVars = findInputVars(ctx, eval) + reduceCodeSize(ctx, eval) if (eval.code.nonEmpty) { // Add `this` in the comment. @@ -115,9 +121,29 @@ abstract class Expression extends TreeNode[Expression] { } } + /** + * Returns the input variables to this expression. + */ + private def findInputVars(ctx: CodegenContext, eval: ExprCode): Seq[ExprInputVar] = { + if (ctx.currentVars != null) { + this.collect { + case b @ BoundReference(ordinal, _, _) if ctx.currentVars(ordinal) != null => + ExprInputVar(exprCode = ctx.currentVars(ordinal), + dataType = b.dataType, nullable = b.nullable) + } + } else { + Seq.empty + } + } + + /** + * In order to prevent 64kb compile error, reducing the size of generated codes by + * separating it into a function if the size exceeds a threshold. 
+ */ private def reduceCodeSize(ctx: CodegenContext, eval: ExprCode): Unit = { - // TODO: support whole stage codegen too - if (eval.code.trim.length > 1024 && ctx.INPUT_ROW != null && ctx.currentVars == null) { + lazy val funcParams = ExpressionCodegen.getExpressionInputParams(ctx, this) + + if (eval.code.trim.length > 1024 && funcParams.isDefined) { val setIsNull = if (eval.isNull != "false" && eval.isNull != "true") { val globalIsNull = ctx.freshName("globalIsNull") ctx.addMutableState(ctx.JAVA_BOOLEAN, globalIsNull) @@ -132,9 +158,12 @@ abstract class Expression extends TreeNode[Expression] { val newValue = ctx.freshName("value") val funcName = ctx.freshName(nodeName) + val callParams = funcParams.map(_._1.mkString(", ")).get + val declParams = funcParams.map(_._2.mkString(", ")).get + val funcFullName = ctx.addNewFunction(funcName, s""" - |private $javaType $funcName(InternalRow ${ctx.INPUT_ROW}) { + |private $javaType $funcName($declParams) { | ${eval.code.trim} | $setIsNull | return ${eval.value}; @@ -142,7 +171,7 @@ abstract class Expression extends TreeNode[Expression] { """.stripMargin) eval.value = newValue - eval.code = s"$javaType $newValue = $funcFullName(${ctx.INPUT_ROW});" + eval.code = s"$javaType $newValue = $funcFullName($callParams);" } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 4b8b16ff71ecb..257c3f10fa08b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -55,8 +55,24 @@ import org.apache.spark.util.{ParentClassLoader, Utils} * to null. * @param value A term for a (possibly primitive) value of the result of the evaluation. Not * valid if `isNull` is set to `true`. + * @param inputRow A term that holds the input row name when generating this code. + * @param inputVars A list of [[ExprInputVar]] that holds input variables when generating this code. */ -case class ExprCode(var code: String, var isNull: String, var value: String) +case class ExprCode( + var code: String, + var isNull: String, + var value: String, + var inputRow: String = null, + var inputVars: Seq[ExprInputVar] = Seq.empty) + +/** + * Represents an input variable [[ExprCode]] to an evaluation of an [[Expression]]. + * + * @param exprCode The [[ExprCode]] that represents the evaluation result for the input variable. + * @param dataType The data type of the input variable. + * @param nullable Whether the input variable can be null or not. + */ +case class ExprInputVar(exprCode: ExprCode, dataType: DataType, nullable: Boolean) /** * State used for subexpression elimination. @@ -1012,16 +1028,25 @@ class CodegenContext { commonExprs.foreach { e => val expr = e.head val fnName = freshName("evalExpr") - val isNull = s"${fnName}IsNull" + val isNull = if (expr.nullable) { + s"${fnName}IsNull" + } else { + "" + } val value = s"${fnName}Value" // Generate the code for this expression tree and wrap it in a function. val eval = expr.genCode(this) + val assignIsNull = if (expr.nullable) { + s"$isNull = ${eval.isNull};" + } else { + "" + } val fn = s""" |private void $fnName(InternalRow $INPUT_ROW) { | ${eval.code.trim} - | $isNull = ${eval.isNull}; + | $assignIsNull | $value = ${eval.value}; |} """.stripMargin @@ -1039,12 +1064,17 @@ class CodegenContext { // 2. Less code. 
// Currently, we will do this for all non-leaf only expression trees (i.e. expr trees with // at least two nodes) as the cost of doing it is expected to be low. - addMutableState(JAVA_BOOLEAN, isNull, s"$isNull = false;") - addMutableState(javaType(expr.dataType), value, - s"$value = ${defaultValue(expr.dataType)};") + if (expr.nullable) { + addMutableState(JAVA_BOOLEAN, isNull) + } + addMutableState(javaType(expr.dataType), value) subexprFunctions += s"${addNewFunction(fnName, fn)}($INPUT_ROW);" - val state = SubExprEliminationState(isNull, value) + val state = if (expr.nullable) { + SubExprEliminationState(isNull, value) + } else { + SubExprEliminationState("false", value) + } e.foreach(subExprEliminationExprs.put(_, state)) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala new file mode 100644 index 0000000000000..a2dda48e951d1 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala @@ -0,0 +1,269 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions.codegen + +import scala.collection.mutable + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.DataType + +/** + * Defines util methods used in expression code generation. + */ +object ExpressionCodegen { + + /** + * Given an expression, returns all the parameters necessary to evaluate it, so the generated + * code of this expression can be split into a function. + * The first element of the returned tuple holds the parameter strings used to call the function. + * The second element holds the parameter strings used to declare the function. + * + * Returns `None` if it can't produce valid parameters. + * + * Params to include: + * 1. Evaluated columns referred by this, children or deferred expressions. + * 2. Rows referred by this, children or deferred expressions. + * 3. Eliminated subexpressions referred by children expressions. 
+ */ + def getExpressionInputParams( + ctx: CodegenContext, + expr: Expression): Option[(Seq[String], Seq[String])] = { + val subExprs = getSubExprInChildren(ctx, expr) + val subExprCodes = getSubExprCodes(ctx, subExprs) + val subVars = subExprs.zip(subExprCodes).map { case (subExpr, subExprCode) => + ExprInputVar(subExprCode, subExpr.dataType, subExpr.nullable) + } + val paramsFromSubExprs = prepareFunctionParams(ctx, subVars) + + val inputVars = getInputVarsForChildren(ctx, expr) + val paramsFromColumns = prepareFunctionParams(ctx, inputVars) + + val inputRows = ctx.INPUT_ROW +: getInputRowsForChildren(ctx, expr) + val paramsFromRows = inputRows.distinct.filter(_ != null).map { row => + (row, s"InternalRow $row") + } + + val paramsLength = getParamLength(ctx, inputVars ++ subVars) + paramsFromRows.length + // Maximum allowed parameter number for Java's method descriptor. + if (paramsLength > 255) { + None + } else { + val allParams = (paramsFromRows ++ paramsFromColumns ++ paramsFromSubExprs).unzip + val callParams = allParams._1.distinct + val declParams = allParams._2.distinct + Some((callParams, declParams)) + } + } + + /** + * Returns the eliminated subexpressions in the children expressions. + */ + def getSubExprInChildren(ctx: CodegenContext, expr: Expression): Seq[Expression] = { + expr.children.flatMap { child => + child.collect { + case e if ctx.subExprEliminationExprs.contains(e) => e + } + }.distinct + } + + /** + * A small helper function to return `ExprCode`s that represent subexpressions. + */ + def getSubExprCodes(ctx: CodegenContext, subExprs: Seq[Expression]): Seq[ExprCode] = { + subExprs.map { subExpr => + val state = ctx.subExprEliminationExprs(subExpr) + ExprCode(code = "", value = state.value, isNull = state.isNull) + } + } + + /** + * Retrieves previous input rows referred by children and deferred expressions. + */ + def getInputRowsForChildren(ctx: CodegenContext, expr: Expression): Seq[String] = { + expr.children.flatMap(getInputRows(ctx, _)).distinct + } + + /** + * Given a child expression, retrieves previous input rows referred by it or deferred expressions + * which are needed to evaluate it. + */ + def getInputRows(ctx: CodegenContext, child: Expression): Seq[String] = { + child.flatMap { + // An expression directly evaluates on current input row. + case BoundReference(ordinal, _, _) if ctx.currentVars == null || + ctx.currentVars(ordinal) == null => + Seq(ctx.INPUT_ROW) + + // An expression which is not evaluated yet. Tracks down to find input rows. + case BoundReference(ordinal, _, _) if !isEvaluated(ctx.currentVars(ordinal)) => + trackDownRow(ctx, ctx.currentVars(ordinal)) + + case _ => Seq.empty + }.distinct + } + + /** + * Tracks down input rows referred by the generated code snippet. + */ + def trackDownRow(ctx: CodegenContext, exprCode: ExprCode): Seq[String] = { + val exprCodes = mutable.Queue[ExprCode](exprCode) + val inputRows = mutable.ArrayBuffer.empty[String] + + while (exprCodes.nonEmpty) { + val curExprCode = exprCodes.dequeue() + if (curExprCode.inputRow != null) { + inputRows += curExprCode.inputRow + } + curExprCode.inputVars.foreach { inputVar => + if (!isEvaluated(inputVar.exprCode)) { + exprCodes.enqueue(inputVar.exprCode) + } + } + } + inputRows + } + + /** + * Retrieves previously evaluated columns referred by children and deferred expressions. + * Returned tuple contains the list of expressions and the list of generated codes. 
+ */ + def getInputVarsForChildren( + ctx: CodegenContext, + expr: Expression): Seq[ExprInputVar] = { + expr.children.flatMap(getInputVars(ctx, _)).distinct + } + + /** + * Given a child expression, retrieves previously evaluated columns referred by it or + * deferred expressions which are needed to evaluate it. + */ + def getInputVars(ctx: CodegenContext, child: Expression): Seq[ExprInputVar] = { + if (ctx.currentVars == null) { + return Seq.empty + } + + child.flatMap { + // An evaluated variable. + case b @ BoundReference(ordinal, _, _) if ctx.currentVars(ordinal) != null && + isEvaluated(ctx.currentVars(ordinal)) => + Seq(ExprInputVar(ctx.currentVars(ordinal), b.dataType, b.nullable)) + + // An input variable which is not evaluated yet. Tracks down to find any evaluated variables + // in the expression path. + // E.g., if this expression is "d = c + 1" and "c" is not evaluated. We need to track to + // "c = a + b" and see if "a" and "b" are evaluated. If they are, we need to return them so + // to include them into parameters, if not, we track down further. + case BoundReference(ordinal, _, _) if ctx.currentVars(ordinal) != null => + trackDownVar(ctx, ctx.currentVars(ordinal)) + + case _ => Seq.empty + }.distinct + } + + /** + * Tracks down previously evaluated columns referred by the generated code snippet. + */ + def trackDownVar(ctx: CodegenContext, exprCode: ExprCode): Seq[ExprInputVar] = { + val exprCodes = mutable.Queue[ExprCode](exprCode) + val inputVars = mutable.ArrayBuffer.empty[ExprInputVar] + + while (exprCodes.nonEmpty) { + exprCodes.dequeue().inputVars.foreach { inputVar => + if (isEvaluated(inputVar.exprCode)) { + inputVars += inputVar + } else { + exprCodes.enqueue(inputVar.exprCode) + } + } + } + inputVars + } + + /** + * Helper function to calculate the size of an expression as function parameter. + */ + def calculateParamLength(ctx: CodegenContext, input: ExprInputVar): Int = { + (if (input.nullable) 1 else 0) + ctx.javaType(input.dataType) match { + case ctx.JAVA_LONG | ctx.JAVA_DOUBLE => 2 + case _ => 1 + } + } + + /** + * In Java, a method descriptor is valid only if it represents method parameters with a total + * length of 255 or less. `this` contributes one unit and a parameter of type long or double + * contributes two units. + */ + def getParamLength(ctx: CodegenContext, inputs: Seq[ExprInputVar]): Int = { + // Initial value is 1 for `this`. + 1 + inputs.map(calculateParamLength(ctx, _)).sum + } + + /** + * Given the lists of input attributes and variables to this expression, returns the strings of + * funtion parameters. The first is the variable names used to call the function, the second is + * the parameters used to declare the function in generated code. + */ + def prepareFunctionParams( + ctx: CodegenContext, + inputVars: Seq[ExprInputVar]): Seq[(String, String)] = { + inputVars.flatMap { inputVar => + val params = mutable.ArrayBuffer.empty[(String, String)] + val ev = inputVar.exprCode + + // Only include the expression value if it is not a literal. + if (!isLiteral(ev)) { + val argType = ctx.javaType(inputVar.dataType) + params += ((ev.value, s"$argType ${ev.value}")) + } + + // If it is a nullable expression and `isNull` is not a literal. + if (inputVar.nullable && ev.isNull != "true" && ev.isNull != "false") { + params += ((ev.isNull, s"boolean ${ev.isNull}")) + } + + params + }.distinct + } + + /** + * Only applied to the `ExprCode` in `ctx.currentVars`. + * Returns true if this value is a literal. 
+ */ + def isLiteral(exprCode: ExprCode): Boolean = { + assert(exprCode.value.nonEmpty, "ExprCode.value can't be empty string.") + + if (exprCode.value == "true" || exprCode.value == "false" || exprCode.value == "null") { + true + } else { + // The valid characters for the first character of a Java variable is [a-zA-Z_$]. + exprCode.value.head match { + case v if v >= 'a' && v <= 'z' => false + case v if v >= 'A' && v <= 'Z' => false + case '_' | '$' => false + case _ => true + } + } + } + + /** + * Only applied to the `ExprCode` in `ctx.currentVars`. + * The code is emptied after evaluation. + */ + def isEvaluated(exprCode: ExprCode): Boolean = exprCode.code == "" +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegenSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegenSuite.scala new file mode 100644 index 0000000000000..39d58cabff228 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegenSuite.scala @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.expressions.codegen + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.IntegerType + +class ExpressionCodegenSuite extends SparkFunSuite { + + test("Returns eliminated subexpressions for expression") { + val ctx = new CodegenContext() + val subExpr = Add(Literal(1), Literal(2)) + val exprs = Seq(Add(subExpr, Literal(3)), Add(subExpr, Literal(4))) + + ctx.generateExpressions(exprs, doSubexpressionElimination = true) + val subexpressions = ExpressionCodegen.getSubExprInChildren(ctx, exprs(0)) + assert(subexpressions.length == 1 && subexpressions(0) == subExpr) + } + + test("Gets parameters for subexpressions") { + val ctx = new CodegenContext() + val subExprs = Seq( + Add(Literal(1), AttributeReference("a", IntegerType, nullable = false)()), // non-nullable + Add(Literal(2), AttributeReference("b", IntegerType, nullable = true)())) // nullable + + ctx.subExprEliminationExprs.put(subExprs(0), SubExprEliminationState("false", "value1")) + ctx.subExprEliminationExprs.put(subExprs(1), SubExprEliminationState("isNull2", "value2")) + + val subExprCodes = ExpressionCodegen.getSubExprCodes(ctx, subExprs) + val subVars = subExprs.zip(subExprCodes).map { case (expr, exprCode) => + ExprInputVar(exprCode, expr.dataType, expr.nullable) + } + val params = ExpressionCodegen.prepareFunctionParams(ctx, subVars) + assert(params.length == 3) + assert(params(0) == Tuple2("value1", "int value1")) + assert(params(1) == Tuple2("value2", "int value2")) + assert(params(2) == Tuple2("isNull2", "boolean isNull2")) + } + + test("Returns input variables for expression: current variables") { + val ctx = new CodegenContext() + val currentVars = Seq( + ExprCode("", isNull = "false", value = "value1"), // evaluated + ExprCode("", isNull = "isNull2", value = "value2"), // evaluated + ExprCode("fake code;", isNull = "isNull3", value = "value3")) // not evaluated + ctx.currentVars = currentVars + ctx.INPUT_ROW = null + + val expr = If(Literal(false), + Add(BoundReference(0, IntegerType, nullable = false), + BoundReference(1, IntegerType, nullable = true)), + BoundReference(2, IntegerType, nullable = true)) + + val inputVars = ExpressionCodegen.getInputVarsForChildren(ctx, expr) + // Only two evaluated variables included. + assert(inputVars.length == 2) + assert(inputVars(0).dataType == IntegerType && inputVars(0).nullable == false) + assert(inputVars(1).dataType == IntegerType && inputVars(1).nullable == true) + assert(inputVars(0).exprCode == currentVars(0)) + assert(inputVars(1).exprCode == currentVars(1)) + + val params = ExpressionCodegen.prepareFunctionParams(ctx, inputVars) + assert(params.length == 3) + assert(params(0) == Tuple2("value1", "int value1")) + assert(params(1) == Tuple2("value2", "int value2")) + assert(params(2) == Tuple2("isNull2", "boolean isNull2")) + } + + test("Returns input variables for expression: deferred variables") { + val ctx = new CodegenContext() + + // The referred column is not evaluated yet. But it depends on an evaluated column from + // other operator. + val currentVars = Seq(ExprCode("fake code;", isNull = "isNull1", value = "value1")) + + // currentVars(0) depends on this evaluated column. 
+ currentVars(0).inputVars = Seq(ExprInputVar(ExprCode("", isNull = "isNull2", value = "value2"), + dataType = IntegerType, nullable = true)) + ctx.currentVars = currentVars + ctx.INPUT_ROW = null + + val expr = Add(Literal(1), BoundReference(0, IntegerType, nullable = false)) + val inputVars = ExpressionCodegen.getInputVarsForChildren(ctx, expr) + assert(inputVars.length == 1) + assert(inputVars(0).dataType == IntegerType && inputVars(0).nullable == true) + + val params = ExpressionCodegen.prepareFunctionParams(ctx, inputVars) + assert(params.length == 2) + assert(params(0) == Tuple2("value2", "int value2")) + assert(params(1) == Tuple2("isNull2", "boolean isNull2")) + } + + test("Returns input rows for expression") { + val ctx = new CodegenContext() + ctx.currentVars = null + ctx.INPUT_ROW = "i" + + val expr = Add(BoundReference(0, IntegerType, nullable = false), + BoundReference(1, IntegerType, nullable = true)) + val inputRows = ExpressionCodegen.getInputRowsForChildren(ctx, expr) + assert(inputRows.length == 1) + assert(inputRows(0) == "i") + } + + test("Returns input rows for expression: deferred expression") { + val ctx = new CodegenContext() + + // The referred column is not evaluated yet. But it depends on an input row from + // other operator. + val currentVars = Seq(ExprCode("fake code;", isNull = "isNull1", value = "value1")) + currentVars(0).inputRow = "inputadaptor_row1" + ctx.currentVars = currentVars + ctx.INPUT_ROW = null + + val expr = Add(Literal(1), BoundReference(0, IntegerType, nullable = false)) + val inputRows = ExpressionCodegen.getInputRowsForChildren(ctx, expr) + assert(inputRows.length == 1) + assert(inputRows(0) == "inputadaptor_row1") + } + + test("Returns both input rows and variables for expression") { + val ctx = new CodegenContext() + // 5 input variables in currentVars: + // 1 evaluated variable (value1). + // 3 not evaluated variables. + // value2 depends on an evaluated column from other operator. + // value3 depends on an input row from other operator. + // value4 depends on a not evaluated yet column from other operator. + // 1 null indicating to use input row "i". + val currentVars = Seq( + ExprCode("", isNull = "false", value = "value1"), + ExprCode("fake code;", isNull = "isNull2", value = "value2"), + ExprCode("fake code;", isNull = "isNull3", value = "value3"), + ExprCode("fake code;", isNull = "isNull4", value = "value4"), + null) + // value2 depends on this evaluated column. + currentVars(1).inputVars = Seq(ExprInputVar(ExprCode("", isNull = "isNull5", value = "value5"), + dataType = IntegerType, nullable = true)) + // value3 depends on an input row "inputadaptor_row1". + currentVars(2).inputRow = "inputadaptor_row1" + // value4 depends on another not evaluated yet column. + currentVars(3).inputVars = Seq(ExprInputVar(ExprCode("fake code;", + isNull = "isNull6", value = "value6"), dataType = IntegerType, nullable = true)) + ctx.currentVars = currentVars + ctx.INPUT_ROW = "i" + + // expr: if (false) { value1 + value2 } else { (value3 + value4) + i[5] } + val expr = If(Literal(false), + Add(BoundReference(0, IntegerType, nullable = false), + BoundReference(1, IntegerType, nullable = true)), + Add(Add(BoundReference(2, IntegerType, nullable = true), + BoundReference(3, IntegerType, nullable = true)), + BoundReference(4, IntegerType, nullable = true))) // this is based on input row "i". + + // input rows: "i", "inputadaptor_row1". 
+ val inputRows = ExpressionCodegen.getInputRowsForChildren(ctx, expr) + assert(inputRows.length == 2) + assert(inputRows(0) == "inputadaptor_row1") + assert(inputRows(1) == "i") + + // input variables: value1 and value5 + val inputVars = ExpressionCodegen.getInputVarsForChildren(ctx, expr) + assert(inputVars.length == 2) + + // value1 has inlined isNull "false", so don't need to include it in the params. + val inputVarParams = ExpressionCodegen.prepareFunctionParams(ctx, inputVars) + assert(inputVarParams.length == 3) + assert(inputVarParams(0) == Tuple2("value1", "int value1")) + assert(inputVarParams(1) == Tuple2("value5", "int value5")) + assert(inputVarParams(2) == Tuple2("isNull5", "boolean isNull5")) + } + + test("isLiteral: literals") { + val literals = Seq( + ExprCode("", "", "true"), + ExprCode("", "", "false"), + ExprCode("", "", "1"), + ExprCode("", "", "-1"), + ExprCode("", "", "1L"), + ExprCode("", "", "-1L"), + ExprCode("", "", "1.0f"), + ExprCode("", "", "-1.0f"), + ExprCode("", "", "0.1f"), + ExprCode("", "", "-0.1f"), + ExprCode("", "", """"string""""), + ExprCode("", "", "(byte)-1"), + ExprCode("", "", "(short)-1"), + ExprCode("", "", "null")) + + literals.foreach(l => assert(ExpressionCodegen.isLiteral(l) == true)) + } + + test("isLiteral: non literals") { + val variables = Seq( + ExprCode("", "", "var1"), + ExprCode("", "", "_var2"), + ExprCode("", "", "$var3"), + ExprCode("", "", "v1a2r3"), + ExprCode("", "", "_1v2a3r"), + ExprCode("", "", "$1v2a3r")) + + variables.foreach(v => assert(ExpressionCodegen.isLiteral(v) == false)) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index a9bfb634fbdea..05186c4472566 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -108,7 +108,10 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { |}""".stripMargin) ctx.currentVars = null + // `rowIdx` isn't in `ctx.currentVars`. If the expressions are split later, we can't track it. + // So making it as global variable. 
val rowidx = ctx.freshName("rowIdx") + ctx.addMutableState(ctx.JAVA_INT, rowidx) val columnsBatchInput = (output zip colVars).map { case (attr, colVar) => genCodeColumnVector(ctx, colVar, rowidx, attr.dataType, attr.nullable) } @@ -128,7 +131,7 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { | int $numRows = $batch.numRows(); | int $localEnd = $numRows - $idx; | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) { - | int $rowidx = $idx + $localIdx; + | $rowidx = $idx + $localIdx; | ${consume(ctx, columnsBatchInput).trim} | $shouldStop | } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index bc05dca578c47..1281169b607c9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -17,7 +17,8 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.{QueryTest, Row, SaveMode} +import org.apache.spark.sql.{Column, QueryTest, Row, SaveMode} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.{CodeAndComment, CodeGenerator} import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec @@ -236,4 +237,24 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-22551: Fix 64kb limit for deeply nested expressions under wholestage codegen") { + import testImplicits._ + withTempPath { dir => + val path = dir.getCanonicalPath + val df = Seq(("abc", 1)).toDF("key", "int") + df.write.parquet(path) + + var strExpr: Expression = col("key").expr + for (_ <- 1 to 150) { + strExpr = Decode(Encode(strExpr, Literal("utf-8")), Literal("utf-8")) + } + val expressions = Seq(If(EqualTo(strExpr, strExpr), strExpr, strExpr)) + + val df2 = spark.read.parquet(path).select(expressions.map(Column(_)): _*) + val plan = df2.queryExecution.executedPlan + assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined) + df2.collect() + } + } } From 0e36ba6212bc24b3185e385914fbf2d62cbfb6da Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Tue, 12 Dec 2017 21:28:24 -0800 Subject: [PATCH 098/356] [SPARK-22644][ML][TEST] Make ML testsuite support StructuredStreaming test ## What changes were proposed in this pull request? We need to add some helper code to make testing ML transformers & models easier with streaming data. These tests might help us catch any remaining issues and we could encourage future PRs to use these tests to prevent new Models & Transformers from having issues. I add an `MLTest` trait which extends the `StreamTest` trait and overrides `createSparkSession`. An ML test suite then only needs to extend `MLTest` to use both the ML and streaming test util functions. For this first-pass review, I only modify one test case in `LinearRegressionSuite`. Link to #19746 ## How was this patch tested? `MLTestSuite` added. Author: WeichenXu Closes #19843 from WeichenXu123/ml_stream_test_helper. 
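A minimal sketch of how a future ML suite could use the new helper (illustrative only, not part of this patch; it assumes the `MLTest` trait and the `testTransformer` signature added here, and the `Binarizer`-based test below is made up):

```scala
// Illustrative sketch only (not part of this patch): a hypothetical suite built on the new
// MLTest trait; testTransformer checks the same assertion on streaming and batch output.
import org.apache.spark.ml.feature.Binarizer
import org.apache.spark.ml.util.MLTest
import org.apache.spark.sql.Row

class BinarizerStreamingCheck extends MLTest {

  import testImplicits._

  test("binarizer gives the same result on batch and streaming data") {
    val data = Seq(0.1, 0.6, 0.9).map(Tuple1.apply).toDF("feature")
    val binarizer = new Binarizer()
      .setInputCol("feature")
      .setOutputCol("binarized")
      .setThreshold(0.5)
    // The check function runs on every output row, once for the MemoryStream-backed
    // streaming run and once for the plain batch transform.
    testTransformer[Tuple1[Double]](data, binarizer, "feature", "binarized") {
      case Row(feature: Double, binarized: Double) =>
        assert(binarized === (if (feature > 0.5) 1.0 else 0.0))
    }
  }
}
```

Because the same check function is applied to both the streaming and the batch output, a single test case covers both code paths.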
--- mllib/pom.xml | 14 +++ .../ml/regression/LinearRegressionSuite.scala | 8 +- .../org/apache/spark/ml/util/MLTest.scala | 91 +++++++++++++++++++ .../apache/spark/ml/util/MLTestSuite.scala | 47 ++++++++++ .../spark/sql/streaming/StreamTest.scala | 67 +++++++++----- .../spark/sql/test/TestSQLContext.scala | 2 +- 6 files changed, 203 insertions(+), 26 deletions(-) create mode 100644 mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/util/MLTestSuite.scala diff --git a/mllib/pom.xml b/mllib/pom.xml index 925b5422a54cc..a906c9e02cd4c 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -60,6 +60,20 @@ spark-sql_${scala.binary.version} ${project.version} + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + test-jar + test + org.apache.spark spark-graphx_${scala.binary.version} diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index 0e0be58dbf022..aec5ac0e75896 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -24,13 +24,12 @@ import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.ml.param.{ParamMap, ParamsSuite} -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext} import org.apache.spark.sql.{DataFrame, Row} -class LinearRegressionSuite - extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class LinearRegressionSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @@ -233,7 +232,8 @@ class LinearRegressionSuite assert(model2.intercept ~== interceptR relTol 1E-3) assert(model2.coefficients ~= coefficientsR relTol 1E-3) - model1.transform(datasetWithDenseFeature).select("features", "prediction").collect().foreach { + testTransformer[(Double, Vector)](datasetWithDenseFeature, model1, + "features", "prediction") { case Row(features: DenseVector, prediction1: Double) => val prediction2 = features(0) * model1.coefficients(0) + features(1) * model1.coefficients(1) + diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala new file mode 100644 index 0000000000000..7a5426ebadaa5 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTest.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.util + +import java.io.File + +import org.scalatest.Suite + +import org.apache.spark.SparkContext +import org.apache.spark.ml.{PipelineModel, Transformer} +import org.apache.spark.sql.{DataFrame, Encoder, Row} +import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.streaming.StreamTest +import org.apache.spark.sql.test.TestSparkSession +import org.apache.spark.util.Utils + +trait MLTest extends StreamTest with TempDirectory { self: Suite => + + @transient var sc: SparkContext = _ + @transient var checkpointDir: String = _ + + protected override def createSparkSession: TestSparkSession = { + new TestSparkSession(new SparkContext("local[2]", "MLlibUnitTest", sparkConf)) + } + + override def beforeAll(): Unit = { + super.beforeAll() + sc = spark.sparkContext + checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString + sc.setCheckpointDir(checkpointDir) + } + + override def afterAll() { + try { + Utils.deleteRecursively(new File(checkpointDir)) + } finally { + super.afterAll() + } + } + + def testTransformerOnStreamData[A : Encoder]( + dataframe: DataFrame, + transformer: Transformer, + firstResultCol: String, + otherResultCols: String*) + (checkFunction: Row => Unit): Unit = { + + val columnNames = dataframe.schema.fieldNames + val stream = MemoryStream[A] + val streamDF = stream.toDS().toDF(columnNames: _*) + + val data = dataframe.as[A].collect() + + val streamOutput = transformer.transform(streamDF) + .select(firstResultCol, otherResultCols: _*) + testStream(streamOutput) ( + AddData(stream, data: _*), + CheckAnswer(checkFunction) + ) + } + + def testTransformer[A : Encoder]( + dataframe: DataFrame, + transformer: Transformer, + firstResultCol: String, + otherResultCols: String*) + (checkFunction: Row => Unit): Unit = { + testTransformerOnStreamData(dataframe, transformer, firstResultCol, + otherResultCols: _*)(checkFunction) + + val dfOutput = transformer.transform(dataframe) + dfOutput.select(firstResultCol, otherResultCols: _*).collect().foreach { row => + checkFunction(row) + } + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/util/MLTestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestSuite.scala new file mode 100644 index 0000000000000..56217ec4f3b0c --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/util/MLTestSuite.scala @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.util + +import org.apache.spark.ml.{PipelineModel, Transformer} +import org.apache.spark.ml.feature.StringIndexer +import org.apache.spark.sql.Row + +class MLTestSuite extends MLTest { + + import testImplicits._ + + test("test transformer on stream data") { + + val data = Seq((0, "a"), (1, "b"), (2, "c"), (3, "d"), (4, "e"), (5, "f")) + .toDF("id", "label") + val indexer = new StringIndexer().setStringOrderType("alphabetAsc") + .setInputCol("label").setOutputCol("indexed") + val indexerModel = indexer.fit(data) + testTransformerOnStreamData[(Int, String)](data, indexerModel, "id", "indexed") { + case Row(id: Int, indexed: Double) => + assert(id === indexed.toInt) + } + + intercept[Exception] { + testTransformerOnStreamData[(Int, String)](data, indexerModel, "id", "indexed") { + case Row(id: Int, indexed: Double) => + assert(id != indexed.toInt) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index e68fca050571f..dc5b998ad68b5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -133,6 +133,9 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be } def apply(rows: Row*): CheckAnswerRows = CheckAnswerRows(rows, false, false) + + def apply(checkFunction: Row => Unit): CheckAnswerRowsByFunc = + CheckAnswerRowsByFunc(checkFunction, false) } /** @@ -154,6 +157,9 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be } def apply(rows: Row*): CheckAnswerRows = CheckAnswerRows(rows, true, false) + + def apply(checkFunction: Row => Unit): CheckAnswerRowsByFunc = + CheckAnswerRowsByFunc(checkFunction, true) } case class CheckAnswerRows(expectedAnswer: Seq[Row], lastOnly: Boolean, isSorted: Boolean) @@ -162,6 +168,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be private def operatorName = if (lastOnly) "CheckLastBatch" else "CheckAnswer" } + case class CheckAnswerRowsByFunc(checkFunction: Row => Unit, lastOnly: Boolean) + extends StreamAction with StreamMustBeRunning { + override def toString: String = s"$operatorName: ${checkFunction.toString()}" + private def operatorName = if (lastOnly) "CheckLastBatchByFunc" else "CheckAnswerByFunc" + } + /** Stops the stream. It must currently be running. 
*/ case object StopStream extends StreamAction with StreamMustBeRunning @@ -352,6 +364,29 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be """.stripMargin) } + def fetchStreamAnswer(currentStream: StreamExecution, lastOnly: Boolean) = { + verify(currentStream != null, "stream not running") + // Get the map of source index to the current source objects + val indexToSource = currentStream + .logicalPlan + .collect { case StreamingExecutionRelation(s, _) => s } + .zipWithIndex + .map(_.swap) + .toMap + + // Block until all data added has been processed for all the source + awaiting.foreach { case (sourceIndex, offset) => + failAfter(streamingTimeout) { + currentStream.awaitOffset(indexToSource(sourceIndex), offset) + } + } + + try if (lastOnly) sink.latestBatchData else sink.allData catch { + case e: Exception => + failTest("Exception while getting data from sink", e) + } + } + var manualClockExpectedTime = -1L val defaultCheckpointLocation = Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath @@ -552,30 +587,20 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be e.runAction() case CheckAnswerRows(expectedAnswer, lastOnly, isSorted) => - verify(currentStream != null, "stream not running") - // Get the map of source index to the current source objects - val indexToSource = currentStream - .logicalPlan - .collect { case StreamingExecutionRelation(s, _) => s } - .zipWithIndex - .map(_.swap) - .toMap - - // Block until all data added has been processed for all the source - awaiting.foreach { case (sourceIndex, offset) => - failAfter(streamingTimeout) { - currentStream.awaitOffset(indexToSource(sourceIndex), offset) - } - } - - val sparkAnswer = try if (lastOnly) sink.latestBatchData else sink.allData catch { - case e: Exception => - failTest("Exception while getting data from sink", e) - } - + val sparkAnswer = fetchStreamAnswer(currentStream, lastOnly) QueryTest.sameRows(expectedAnswer, sparkAnswer, isSorted).foreach { error => failTest(error) } + + case CheckAnswerRowsByFunc(checkFunction, lastOnly) => + val sparkAnswer = fetchStreamAnswer(currentStream, lastOnly) + sparkAnswer.foreach { row => + try { + checkFunction(row) + } catch { + case e: Throwable => failTest(e.toString) + } + } } pos += 1 } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala index 959edf9a49371..4286e8a6ca2c8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.internal.{SessionState, SessionStateBuilder, SQLConf /** * A special `SparkSession` prepared for testing. */ -private[sql] class TestSparkSession(sc: SparkContext) extends SparkSession(sc) { self => +private[spark] class TestSparkSession(sc: SparkContext) extends SparkSession(sc) { self => def this(sparkConf: SparkConf) { this(new SparkContext("local[2]", "test-sql-context", sparkConf.set("spark.sql.testkey", "true"))) From 6b80ce4fb20da57c9513b94ab02b53a5fd7571d0 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 12 Dec 2017 22:41:38 -0800 Subject: [PATCH 099/356] [SPARK-19809][SQL][TEST][FOLLOWUP] Move the test case to HiveOrcQuerySuite ## What changes were proposed in this pull request? As a follow-up of #19948 , this PR moves the test case and adds comments. ## How was this patch tested? Pass the Jenkins. 
Author: Dongjoon Hyun Closes #19960 from dongjoon-hyun/SPARK-19809-2. --- .../sql/hive/execution/SQLQuerySuite.scala | 36 -------------- .../sql/hive/orc/HiveOrcQuerySuite.scala | 48 ++++++++++++++++++- 2 files changed, 47 insertions(+), 37 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 93c91d3fcb727..c11e37a516646 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -2153,40 +2153,4 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } } - - test("SPARK-22267 Spark SQL incorrectly reads ORC files when column order is different") { - Seq("native", "hive").foreach { orcImpl => - withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> orcImpl) { - withTempPath { f => - val path = f.getCanonicalPath - Seq(1 -> 2).toDF("c1", "c2").write.orc(path) - checkAnswer(spark.read.orc(path), Row(1, 2)) - - withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { // default since 2.3.0 - withTable("t") { - sql(s"CREATE EXTERNAL TABLE t(c2 INT, c1 INT) STORED AS ORC LOCATION '$path'") - checkAnswer(spark.table("t"), Row(2, 1)) - } - } - } - } - } - } - - test("SPARK-19809 NullPointerException on zero-size ORC file") { - Seq("native", "hive").foreach { orcImpl => - withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> orcImpl) { - withTempPath { dir => - withTable("spark_19809") { - sql(s"CREATE TABLE spark_19809(a int) STORED AS ORC LOCATION '$dir'") - Files.touch(new File(s"${dir.getCanonicalPath}", "zero.orc")) - - withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { // default since 2.3.0 - checkAnswer(sql("SELECT * FROM spark_19809"), Seq.empty) - } - } - } - } - } - } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala index 7244c369bd3f4..92b2f069cacd6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcQuerySuite.scala @@ -17,7 +17,11 @@ package org.apache.spark.sql.hive.orc -import org.apache.spark.sql.AnalysisException +import java.io.File + +import com.google.common.io.Files + +import org.apache.spark.sql.{AnalysisException, Row} import org.apache.spark.sql.catalyst.catalog.HiveTableRelation import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.execution.datasources.orc.OrcQueryTest @@ -162,4 +166,46 @@ class HiveOrcQuerySuite extends OrcQueryTest with TestHiveSingleton { } } } + + // Since Hive 1.2.1 library code path still has this problem, users may hit this + // when spark.sql.hive.convertMetastoreOrc=false. However, after SPARK-22279, + // Apache Spark with the default configuration doesn't hit this bug. 
+ test("SPARK-22267 Spark SQL incorrectly reads ORC files when column order is different") { + Seq("native", "hive").foreach { orcImpl => + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> orcImpl) { + withTempPath { f => + val path = f.getCanonicalPath + Seq(1 -> 2).toDF("c1", "c2").write.orc(path) + checkAnswer(spark.read.orc(path), Row(1, 2)) + + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { // default since 2.3.0 + withTable("t") { + sql(s"CREATE EXTERNAL TABLE t(c2 INT, c1 INT) STORED AS ORC LOCATION '$path'") + checkAnswer(spark.table("t"), Row(2, 1)) + } + } + } + } + } + } + + // Since Hive 1.2.1 library code path still has this problem, users may hit this + // when spark.sql.hive.convertMetastoreOrc=false. However, after SPARK-22279, + // Apache Spark with the default configuration doesn't hit this bug. + test("SPARK-19809 NullPointerException on zero-size ORC file") { + Seq("native", "hive").foreach { orcImpl => + withSQLConf(SQLConf.ORC_IMPLEMENTATION.key -> orcImpl) { + withTempPath { dir => + withTable("spark_19809") { + sql(s"CREATE TABLE spark_19809(a int) STORED AS ORC LOCATION '$dir'") + Files.touch(new File(s"${dir.getCanonicalPath}", "zero.orc")) + + withSQLConf(HiveUtils.CONVERT_METASTORE_ORC.key -> "true") { // default since 2.3.0 + checkAnswer(spark.table("spark_19809"), Seq.empty) + } + } + } + } + } + } } From 13e489b6754f4d3569dad99bf5be2d5b0914dd68 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 12 Dec 2017 22:48:31 -0800 Subject: [PATCH 100/356] [SPARK-22759][SQL] Filters can be combined iff both are deterministic ## What changes were proposed in this pull request? The query execution/optimization does not guarantee the expressions are evaluated in order. We only can combine them if and only if both are deterministic. We need to update the optimizer rule: CombineFilters. ## How was this patch tested? Updated the existing tests. Author: gatorsmile Closes #19947 from gatorsmile/combineFilters. --- .../spark/sql/catalyst/optimizer/Optimizer.scala | 4 +++- .../sql/catalyst/optimizer/FilterPushdownSuite.scala | 12 ++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 484cd8c2475f5..577693506ed34 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -695,7 +695,9 @@ object CombineUnions extends Rule[LogicalPlan] { */ object CombineFilters extends Rule[LogicalPlan] with PredicateHelper { def apply(plan: LogicalPlan): LogicalPlan = plan transform { - case Filter(fc, nf @ Filter(nc, grandChild)) => + // The query execution/optimization does not guarantee the expressions are evaluated in order. + // We only can combine them if and only if both are deterministic. 
+ case Filter(fc, nf @ Filter(nc, grandChild)) if fc.deterministic && nc.deterministic => (ExpressionSet(splitConjunctivePredicates(fc)) -- ExpressionSet(splitConjunctivePredicates(nc))).reduceOption(And) match { case Some(ac) => diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index de0e7c7ee49ac..641824e6698f7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -94,19 +94,15 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } - test("combine redundant deterministic filters") { + test("do not combine non-deterministic filters even if they are identical") { val originalQuery = testRelation .where(Rand(0) > 0.1 && 'a === 1) - .where(Rand(0) > 0.1 && 'a === 1) + .where(Rand(0) > 0.1 && 'a === 1).analyze - val optimized = Optimize.execute(originalQuery.analyze) - val correctAnswer = - testRelation - .where(Rand(0) > 0.1 && 'a === 1 && Rand(0) > 0.1) - .analyze + val optimized = Optimize.execute(originalQuery) - comparePlans(optimized, correctAnswer) + comparePlans(optimized, originalQuery) } test("SPARK-16164: Filter pushdown should keep the ordering in the logical plan") { From bdb5e55c2a67d16a36ad6baa22296d714d3525af Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 13 Dec 2017 14:49:15 +0800 Subject: [PATCH 101/356] [SPARK-21322][SQL][FOLLOWUP] support histogram in filter cardinality estimation ## What changes were proposed in this pull request? some code cleanup/refactor and naming improvement. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #19952 from cloud-fan/minor. --- .../statsEstimation/EstimationUtils.scala | 109 +++++++------ .../statsEstimation/FilterEstimation.scala | 152 +++++++++--------- 2 files changed, 134 insertions(+), 127 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index 2f416f2af91f1..6f868cbd072c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -115,14 +115,10 @@ object EstimationUtils { } /** - * Returns the number of the first bin into which a column value falls for a specified + * Returns the index of the first bin into which the given value falls for a specified * numeric equi-height histogram. - * - * @param value a literal value of a column - * @param bins an array of bins for a given numeric equi-height histogram - * @return the id of the first bin into which a column value falls. */ - def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = { + private def findFirstBinForValue(value: Double, bins: Array[HistogramBin]): Int = { var i = 0 while ((i < bins.length) && (value > bins(i).hi)) { i += 1 @@ -131,14 +127,10 @@ object EstimationUtils { } /** - * Returns the number of the last bin into which a column value falls for a specified + * Returns the index of the last bin into which the given value falls for a specified * numeric equi-height histogram. 
- * - * @param value a literal value of a column - * @param bins an array of bins for a given numeric equi-height histogram - * @return the id of the last bin into which a column value falls. */ - def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = { + private def findLastBinForValue(value: Double, bins: Array[HistogramBin]): Int = { var i = bins.length - 1 while ((i >= 0) && (value < bins(i).lo)) { i -= 1 @@ -147,65 +139,76 @@ object EstimationUtils { } /** - * Returns a percentage of a bin holding values for column value in the range of - * [lowerValue, higherValue] - * - * @param higherValue a given upper bound value of a specified column value range - * @param lowerValue a given lower bound value of a specified column value range - * @param bin a single histogram bin - * @return the percentage of a single bin holding values in [lowerValue, higherValue]. + * Returns the possibility of the given histogram bin holding values within the given range + * [lowerBound, upperBound]. */ - private def getOccupation( - higherValue: Double, - lowerValue: Double, + private def binHoldingRangePossibility( + upperBound: Double, + lowerBound: Double, bin: HistogramBin): Double = { - assert(bin.lo <= lowerValue && lowerValue <= higherValue && higherValue <= bin.hi) + assert(bin.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bin.hi) if (bin.hi == bin.lo) { // the entire bin is covered in the range 1.0 - } else if (higherValue == lowerValue) { + } else if (upperBound == lowerBound) { // set percentage to 1/NDV 1.0 / bin.ndv.toDouble } else { // Use proration since the range falls inside this bin. - math.min((higherValue - lowerValue) / (bin.hi - bin.lo), 1.0) + math.min((upperBound - lowerBound) / (bin.hi - bin.lo), 1.0) } } /** - * Returns the number of bins for column values in [lowerValue, higherValue]. - * The column value distribution is saved in an equi-height histogram. The return values is a - * double value is because we may return a portion of a bin. For example, a predicate - * "column = 8" may return the number of bins 0.2 if the holding bin has 5 distinct values. + * Returns the number of histogram bins holding values within the given range + * [lowerBound, upperBound]. + * + * Note that the returned value is double type, because the range boundaries usually occupy a + * portion of a bin. An extreme case is [value, value] which is generated by equal predicate + * `col = value`, we can get higher accuracy by allowing returning portion of histogram bins. * - * @param higherId id of the high end bin holding the high end value of a column range - * @param lowerId id of the low end bin holding the low end value of a column range - * @param higherEnd a given upper bound value of a specified column value range - * @param lowerEnd a given lower bound value of a specified column value range - * @param histogram a numeric equi-height histogram - * @return the number of bins for column values in [lowerEnd, higherEnd]. 
+ * @param upperBound the highest value of the given range + * @param upperBoundInclusive whether the upperBound is included in the range + * @param lowerBound the lowest value of the given range + * @param lowerBoundInclusive whether the lowerBound is included in the range + * @param bins an array of bins for a given numeric equi-height histogram */ - def getOccupationBins( - higherId: Int, - lowerId: Int, - higherEnd: Double, - lowerEnd: Double, - histogram: Histogram): Double = { - assert(lowerId <= higherId) - - if (lowerId == higherId) { - val curBin = histogram.bins(lowerId) - getOccupation(higherEnd, lowerEnd, curBin) + def numBinsHoldingRange( + upperBound: Double, + upperBoundInclusive: Boolean, + lowerBound: Double, + lowerBoundInclusive: Boolean, + bins: Array[HistogramBin]): Double = { + assert(bins.head.lo <= lowerBound && lowerBound <= upperBound && upperBound <= bins.last.hi, + "Given range does not fit in the given histogram.") + assert(upperBound != lowerBound || upperBoundInclusive || lowerBoundInclusive, + s"'$lowerBound < value < $upperBound' is an invalid range.") + + val upperBinIndex = if (upperBoundInclusive) { + findLastBinForValue(upperBound, bins) + } else { + findFirstBinForValue(upperBound, bins) + } + val lowerBinIndex = if (lowerBoundInclusive) { + findFirstBinForValue(lowerBound, bins) + } else { + findLastBinForValue(lowerBound, bins) + } + assert(lowerBinIndex <= upperBinIndex, "Invalid histogram data.") + + + if (lowerBinIndex == upperBinIndex) { + binHoldingRangePossibility(upperBound, lowerBound, bins(lowerBinIndex)) } else { - // compute how much lowerEnd/higherEnd occupies its bin - val lowerCurBin = histogram.bins(lowerId) - val lowerPart = getOccupation(lowerCurBin.hi, lowerEnd, lowerCurBin) + // Computes the occupied portion of bins of the upperBound and lowerBound. + val lowerBin = bins(lowerBinIndex) + val lowerPart = binHoldingRangePossibility(lowerBin.hi, lowerBound, lowerBin) - val higherCurBin = histogram.bins(higherId) - val higherPart = getOccupation(higherEnd, higherCurBin.lo, higherCurBin) + val higherBin = bins(upperBinIndex) + val higherPart = binHoldingRangePossibility(upperBound, higherBin.lo, higherBin) - // the total length is lowerPart + higherPart + bins between them - lowerPart + higherPart + higherId - lowerId - 1 + // The total number of bins is lowerPart + higherPart + bins between them + lowerPart + higherPart + upperBinIndex - lowerBinIndex - 1 } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala index f52a15edbfe4a..850dd1ba724a0 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/FilterEstimation.scala @@ -336,43 +336,12 @@ case class FilterEstimation(plan: Filter) extends Logging { // returns 1/ndv if there is no histogram Some(1.0 / BigDecimal(ndv)) } else { - // We compute filter selectivity using Histogram information. - val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble - val histogram = colStat.histogram.get - val hgmBins = histogram.bins - - // find bins where column's current min and max locate. Note that a column's [min, max] - // range may change due to another condition applied earlier. 
- val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble - val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble - val minBinId = EstimationUtils.findFirstBinForValue(min, hgmBins) - val maxBinId = EstimationUtils.findLastBinForValue(max, hgmBins) - - // compute how many bins the column's current valid range [min, max] occupies. - // Note that a column's [min, max] range may vary after we apply some filter conditions. - val validRangeBins = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, - min, histogram) - - val lowerBinId = EstimationUtils.findFirstBinForValue(datum, hgmBins) - val higherBinId = EstimationUtils.findLastBinForValue(datum, hgmBins) - assert(lowerBinId <= higherBinId) - val lowerBinNdv = hgmBins(lowerBinId).ndv - val higherBinNdv = hgmBins(higherBinId).ndv - // assume uniform distribution in each bin - val occupiedBins = if (lowerBinId == higherBinId) { - 1.0 / lowerBinNdv - } else { - (1.0 / lowerBinNdv) + // lowest bin - (higherBinId - lowerBinId - 1) + // middle bins - (1.0 / higherBinNdv) // highest bin - } - Some(occupiedBins / validRangeBins) + Some(computeEqualityPossibilityByHistogram(literal, colStat)) } } else { // not in interval Some(0.0) } - } /** @@ -542,11 +511,7 @@ case class FilterEstimation(plan: Filter) extends Logging { } } } else { - val numericHistogram = colStat.histogram.get - val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble - val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble - val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble - percent = computePercentByEquiHeightHgm(op, numericHistogram, max, min, datum) + percent = computeComparisonPossibilityByHistogram(op, literal, colStat) } if (update) { @@ -574,51 +539,90 @@ case class FilterEstimation(plan: Filter) extends Logging { } /** - * Returns the selectivity percentage for binary condition in the column's - * current valid range [min, max] - * - * @param op a binary comparison operator - * @param histogram a numeric equi-height histogram - * @param max the upper bound of the current valid range for a given column - * @param min the lower bound of the current valid range for a given column - * @param datumNumber the numeric value of a literal - * @return the selectivity percentage for a condition in the current range. + * Computes the possibility of an equality predicate using histogram. */ + private def computeEqualityPossibilityByHistogram( + literal: Literal, colStat: ColumnStat): Double = { + val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble + val histogram = colStat.histogram.get - def computePercentByEquiHeightHgm( - op: BinaryComparison, - histogram: Histogram, - max: Double, - min: Double, - datumNumber: Double): Double = { // find bins where column's current min and max locate. Note that a column's [min, max] // range may change due to another condition applied earlier. - val minBinId = EstimationUtils.findFirstBinForValue(min, histogram.bins) - val maxBinId = EstimationUtils.findLastBinForValue(max, histogram.bins) + val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble + val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble // compute how many bins the column's current valid range [min, max] occupies. - // Note that a column's [min, max] range may vary after we apply some filter conditions. 
- val minToMaxLength = EstimationUtils.getOccupationBins(maxBinId, minBinId, max, min, histogram) - - val datumInBinId = op match { - case LessThan(_, _) | GreaterThanOrEqual(_, _) => - EstimationUtils.findFirstBinForValue(datumNumber, histogram.bins) - case LessThanOrEqual(_, _) | GreaterThan(_, _) => - EstimationUtils.findLastBinForValue(datumNumber, histogram.bins) - } + val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange( + upperBound = max, + upperBoundInclusive = true, + lowerBound = min, + lowerBoundInclusive = true, + histogram.bins) + + val numBinsHoldingDatum = EstimationUtils.numBinsHoldingRange( + upperBound = datum, + upperBoundInclusive = true, + lowerBound = datum, + lowerBoundInclusive = true, + histogram.bins) + + numBinsHoldingDatum / numBinsHoldingEntireRange + } - op match { - // LessThan and LessThanOrEqual share the same logic, - // but their datumInBinId may be different - case LessThan(_, _) | LessThanOrEqual(_, _) => - EstimationUtils.getOccupationBins(datumInBinId, minBinId, datumNumber, min, - histogram) / minToMaxLength - // GreaterThan and GreaterThanOrEqual share the same logic, - // but their datumInBinId may be different - case GreaterThan(_, _) | GreaterThanOrEqual(_, _) => - EstimationUtils.getOccupationBins(maxBinId, datumInBinId, max, datumNumber, - histogram) / minToMaxLength + /** + * Computes the possibility of a comparison predicate using histogram. + */ + private def computeComparisonPossibilityByHistogram( + op: BinaryComparison, literal: Literal, colStat: ColumnStat): Double = { + val datum = EstimationUtils.toDecimal(literal.value, literal.dataType).toDouble + val histogram = colStat.histogram.get + + // find bins where column's current min and max locate. Note that a column's [min, max] + // range may change due to another condition applied earlier. + val min = EstimationUtils.toDecimal(colStat.min.get, literal.dataType).toDouble + val max = EstimationUtils.toDecimal(colStat.max.get, literal.dataType).toDouble + + // compute how many bins the column's current valid range [min, max] occupies. + val numBinsHoldingEntireRange = EstimationUtils.numBinsHoldingRange( + max, upperBoundInclusive = true, min, lowerBoundInclusive = true, histogram.bins) + + val numBinsHoldingRange = op match { + // LessThan and LessThanOrEqual share the same logic, the only difference is whether to + // include the upperBound in the range. + case _: LessThan => + EstimationUtils.numBinsHoldingRange( + upperBound = datum, + upperBoundInclusive = false, + lowerBound = min, + lowerBoundInclusive = true, + histogram.bins) + case _: LessThanOrEqual => + EstimationUtils.numBinsHoldingRange( + upperBound = datum, + upperBoundInclusive = true, + lowerBound = min, + lowerBoundInclusive = true, + histogram.bins) + + // GreaterThan and GreaterThanOrEqual share the same logic, the only difference is whether to + // include the lowerBound in the range. 
+ case _: GreaterThan => + EstimationUtils.numBinsHoldingRange( + upperBound = max, + upperBoundInclusive = true, + lowerBound = datum, + lowerBoundInclusive = false, + histogram.bins) + case _: GreaterThanOrEqual => + EstimationUtils.numBinsHoldingRange( + upperBound = max, + upperBoundInclusive = true, + lowerBound = datum, + lowerBoundInclusive = true, + histogram.bins) } + + numBinsHoldingRange / numBinsHoldingEntireRange } /** From 874350905ff41ffdd8dc07548aa3c4f5782a3b35 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Wed, 13 Dec 2017 09:10:03 +0200 Subject: [PATCH 102/356] [SPARK-22700][ML] Bucketizer.transform incorrectly drops row containing NaN ## What changes were proposed in this pull request? only drops the rows containing NaN in the input columns ## How was this patch tested? existing tests and added tests Author: Ruifeng Zheng Author: Zheng RuiFeng Closes #19894 from zhengruifeng/bucketizer_nan. --- .../org/apache/spark/ml/feature/Bucketizer.scala | 14 ++++++++------ .../apache/spark/ml/feature/BucketizerSuite.scala | 9 +++++++++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala index e07f2a107badb..8299a3e95d822 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala @@ -155,10 +155,16 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String override def transform(dataset: Dataset[_]): DataFrame = { val transformedSchema = transformSchema(dataset.schema) + val (inputColumns, outputColumns) = if (isBucketizeMultipleColumns()) { + ($(inputCols).toSeq, $(outputCols).toSeq) + } else { + (Seq($(inputCol)), Seq($(outputCol))) + } + val (filteredDataset, keepInvalid) = { if (getHandleInvalid == Bucketizer.SKIP_INVALID) { // "skip" NaN option is set, will filter out NaN values in the dataset - (dataset.na.drop().toDF(), false) + (dataset.na.drop(inputColumns).toDF(), false) } else { (dataset.toDF(), getHandleInvalid == Bucketizer.KEEP_INVALID) } @@ -176,11 +182,7 @@ final class Bucketizer @Since("1.4.0") (@Since("1.4.0") override val uid: String }.withName(s"bucketizer_$idx") } - val (inputColumns, outputColumns) = if (isBucketizeMultipleColumns()) { - ($(inputCols).toSeq, $(outputCols).toSeq) - } else { - (Seq($(inputCol)), Seq($(outputCol))) - } + val newCols = inputColumns.zipWithIndex.map { case (inputCol, idx) => bucketizers(idx)(filteredDataset(inputCol).cast(DoubleType)) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala index 748dbd1b995d3..d9c97ae8067d3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/BucketizerSuite.scala @@ -123,6 +123,15 @@ class BucketizerSuite extends SparkFunSuite with MLlibTestSparkContext with Defa } } + test("Bucketizer should only drop NaN in input columns, with handleInvalid=skip") { + val df = spark.createDataFrame(Seq((2.3, 3.0), (Double.NaN, 3.0), (6.7, Double.NaN))) + .toDF("a", "b") + val splits = Array(Double.NegativeInfinity, 3.0, Double.PositiveInfinity) + val bucketizer = new Bucketizer().setInputCol("a").setOutputCol("x").setSplits(splits) + bucketizer.setHandleInvalid("skip") + assert(bucketizer.transform(df).count() == 2) + } + test("Bucket continuous features, with NaN 
splits") { val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity, Double.NaN) withClue("Invalid NaN split was not caught during Bucketizer initialization") { From 682eb4f2ea152ce1043fbe689ea95318926b91b0 Mon Sep 17 00:00:00 2001 From: Tejas Patil Date: Tue, 12 Dec 2017 23:30:06 -0800 Subject: [PATCH 103/356] [SPARK-22042][SQL] ReorderJoinPredicates can break when child's partitioning is not decided ## What changes were proposed in this pull request? See jira description for the bug : https://issues.apache.org/jira/browse/SPARK-22042 Fix done in this PR is: In `EnsureRequirements`, apply `ReorderJoinPredicates` over the input tree before doing its core logic. Since the tree is transformed bottom-up, we can assure that the children are resolved before doing `ReorderJoinPredicates`. Theoretically this will guarantee to cover all such cases while keeping the code simple. My small grudge is for cosmetic reasons. This PR will look weird given that we don't call rules from other rules (not to my knowledge). I could have moved all the logic for `ReorderJoinPredicates` into `EnsureRequirements` but that will make it a but crowded. I am happy to discuss if there are better options. ## How was this patch tested? Added a new test case Author: Tejas Patil Closes #19257 from tejasapatil/SPARK-22042_ReorderJoinPredicates. --- .../spark/sql/execution/QueryExecution.scala | 2 - .../exchange/EnsureRequirements.scala | 76 ++++++++++++++- .../joins/ReorderJoinPredicates.scala | 94 ------------------- .../spark/sql/sources/BucketedReadSuite.scala | 31 ++++++ 4 files changed, 106 insertions(+), 97 deletions(-) delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ReorderJoinPredicates.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index f404621399cea..946475a1e9751 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.command.{DescribeTableCommand, ExecutedCommandExec, ShowTablesCommand} import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange} -import org.apache.spark.sql.execution.joins.ReorderJoinPredicates import org.apache.spark.sql.types.{BinaryType, DateType, DecimalType, TimestampType, _} import org.apache.spark.util.Utils @@ -104,7 +103,6 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { protected def preparations: Seq[Rule[SparkPlan]] = Seq( python.ExtractPythonUDFs, PlanSubqueries(sparkSession), - new ReorderJoinPredicates, EnsureRequirements(sparkSession.sessionState.conf), CollapseCodegenStages(sparkSession.sessionState.conf), ReuseExchange(sparkSession.sessionState.conf), diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index 4e2ca37bc1a59..82f0b9f5cd060 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -17,10 +17,14 @@ package org.apache.spark.sql.execution.exchange +import 
scala.collection.mutable.ArrayBuffer + import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, ShuffledHashJoinExec, + SortMergeJoinExec} import org.apache.spark.sql.internal.SQLConf /** @@ -248,6 +252,75 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { operator.withNewChildren(children) } + /** + * When the physical operators are created for JOIN, the ordering of join keys is based on order + * in which the join keys appear in the user query. That might not match with the output + * partitioning of the join node's children (thus leading to extra sort / shuffle being + * introduced). This rule will change the ordering of the join keys to match with the + * partitioning of the join nodes' children. + */ + def reorderJoinPredicates(plan: SparkPlan): SparkPlan = { + def reorderJoinKeys( + leftKeys: Seq[Expression], + rightKeys: Seq[Expression], + leftPartitioning: Partitioning, + rightPartitioning: Partitioning): (Seq[Expression], Seq[Expression]) = { + + def reorder(expectedOrderOfKeys: Seq[Expression], + currentOrderOfKeys: Seq[Expression]): (Seq[Expression], Seq[Expression]) = { + val leftKeysBuffer = ArrayBuffer[Expression]() + val rightKeysBuffer = ArrayBuffer[Expression]() + + expectedOrderOfKeys.foreach(expression => { + val index = currentOrderOfKeys.indexWhere(e => e.semanticEquals(expression)) + leftKeysBuffer.append(leftKeys(index)) + rightKeysBuffer.append(rightKeys(index)) + }) + (leftKeysBuffer, rightKeysBuffer) + } + + if (leftKeys.forall(_.deterministic) && rightKeys.forall(_.deterministic)) { + leftPartitioning match { + case HashPartitioning(leftExpressions, _) + if leftExpressions.length == leftKeys.length && + leftKeys.forall(x => leftExpressions.exists(_.semanticEquals(x))) => + reorder(leftExpressions, leftKeys) + + case _ => rightPartitioning match { + case HashPartitioning(rightExpressions, _) + if rightExpressions.length == rightKeys.length && + rightKeys.forall(x => rightExpressions.exists(_.semanticEquals(x))) => + reorder(rightExpressions, rightKeys) + + case _ => (leftKeys, rightKeys) + } + } + } else { + (leftKeys, rightKeys) + } + } + + plan.transformUp { + case BroadcastHashJoinExec(leftKeys, rightKeys, joinType, buildSide, condition, left, + right) => + val (reorderedLeftKeys, reorderedRightKeys) = + reorderJoinKeys(leftKeys, rightKeys, left.outputPartitioning, right.outputPartitioning) + BroadcastHashJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, buildSide, condition, + left, right) + + case ShuffledHashJoinExec(leftKeys, rightKeys, joinType, buildSide, condition, left, right) => + val (reorderedLeftKeys, reorderedRightKeys) = + reorderJoinKeys(leftKeys, rightKeys, left.outputPartitioning, right.outputPartitioning) + ShuffledHashJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, buildSide, condition, + left, right) + + case SortMergeJoinExec(leftKeys, rightKeys, joinType, condition, left, right) => + val (reorderedLeftKeys, reorderedRightKeys) = + reorderJoinKeys(leftKeys, rightKeys, left.outputPartitioning, right.outputPartitioning) + SortMergeJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, condition, left, right) + } + } + def apply(plan: SparkPlan): SparkPlan = plan.transformUp { case operator @ ShuffleExchangeExec(partitioning, child, _) => child.children match { @@ -255,6 +328,7 @@ case class 
EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { if (childPartitioning.guarantees(partitioning)) child else operator case _ => operator } - case operator: SparkPlan => ensureDistributionAndOrdering(operator) + case operator: SparkPlan => + ensureDistributionAndOrdering(reorderJoinPredicates(operator)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ReorderJoinPredicates.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ReorderJoinPredicates.scala deleted file mode 100644 index 534d8c5689c27..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ReorderJoinPredicates.scala +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.joins - -import scala.collection.mutable.ArrayBuffer - -import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} -import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.SparkPlan - -/** - * When the physical operators are created for JOIN, the ordering of join keys is based on order - * in which the join keys appear in the user query. That might not match with the output - * partitioning of the join node's children (thus leading to extra sort / shuffle being - * introduced). This rule will change the ordering of the join keys to match with the - * partitioning of the join nodes' children. 
- */ -class ReorderJoinPredicates extends Rule[SparkPlan] { - private def reorderJoinKeys( - leftKeys: Seq[Expression], - rightKeys: Seq[Expression], - leftPartitioning: Partitioning, - rightPartitioning: Partitioning): (Seq[Expression], Seq[Expression]) = { - - def reorder( - expectedOrderOfKeys: Seq[Expression], - currentOrderOfKeys: Seq[Expression]): (Seq[Expression], Seq[Expression]) = { - val leftKeysBuffer = ArrayBuffer[Expression]() - val rightKeysBuffer = ArrayBuffer[Expression]() - - expectedOrderOfKeys.foreach(expression => { - val index = currentOrderOfKeys.indexWhere(e => e.semanticEquals(expression)) - leftKeysBuffer.append(leftKeys(index)) - rightKeysBuffer.append(rightKeys(index)) - }) - (leftKeysBuffer, rightKeysBuffer) - } - - if (leftKeys.forall(_.deterministic) && rightKeys.forall(_.deterministic)) { - leftPartitioning match { - case HashPartitioning(leftExpressions, _) - if leftExpressions.length == leftKeys.length && - leftKeys.forall(x => leftExpressions.exists(_.semanticEquals(x))) => - reorder(leftExpressions, leftKeys) - - case _ => rightPartitioning match { - case HashPartitioning(rightExpressions, _) - if rightExpressions.length == rightKeys.length && - rightKeys.forall(x => rightExpressions.exists(_.semanticEquals(x))) => - reorder(rightExpressions, rightKeys) - - case _ => (leftKeys, rightKeys) - } - } - } else { - (leftKeys, rightKeys) - } - } - - def apply(plan: SparkPlan): SparkPlan = plan.transformUp { - case BroadcastHashJoinExec(leftKeys, rightKeys, joinType, buildSide, condition, left, right) => - val (reorderedLeftKeys, reorderedRightKeys) = - reorderJoinKeys(leftKeys, rightKeys, left.outputPartitioning, right.outputPartitioning) - BroadcastHashJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, buildSide, condition, - left, right) - - case ShuffledHashJoinExec(leftKeys, rightKeys, joinType, buildSide, condition, left, right) => - val (reorderedLeftKeys, reorderedRightKeys) = - reorderJoinKeys(leftKeys, rightKeys, left.outputPartitioning, right.outputPartitioning) - ShuffledHashJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, buildSide, condition, - left, right) - - case SortMergeJoinExec(leftKeys, rightKeys, joinType, condition, left, right) => - val (reorderedLeftKeys, reorderedRightKeys) = - reorderJoinKeys(leftKeys, rightKeys, left.outputPartitioning, right.outputPartitioning) - SortMergeJoinExec(reorderedLeftKeys, reorderedRightKeys, joinType, condition, left, right) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index ab18905e2ddb2..9025859e91066 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -602,6 +602,37 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { ) } + test("SPARK-22042 ReorderJoinPredicates can break when child's partitioning is not decided") { + withTable("bucketed_table", "table1", "table2") { + df.write.format("parquet").saveAsTable("table1") + df.write.format("parquet").saveAsTable("table2") + df.write.format("parquet").bucketBy(8, "j", "k").saveAsTable("bucketed_table") + + withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0") { + checkAnswer( + sql(""" + |SELECT ab.i, ab.j, ab.k, c.i, c.j, c.k + |FROM ( + | SELECT a.i, a.j, a.k + | FROM bucketed_table a + | JOIN table1 b + | ON a.i = b.i + |) ab + |JOIN table2 c + |ON ab.i = c.i + 
|""".stripMargin), + sql(""" + |SELECT a.i, a.j, a.k, c.i, c.j, c.k + |FROM bucketed_table a + |JOIN table1 b + |ON a.i = b.i + |JOIN table2 c + |ON a.i = c.i + |""".stripMargin)) + } + } + } + test("error if there exists any malformed bucket files") { withTable("bucketed_table") { df1.write.format("parquet").bucketBy(8, "i").saveAsTable("bucketed_table") From 7453ab0243dc04db1b586b0e5f588f9cdc9f72dd Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Wed, 13 Dec 2017 16:27:29 +0800 Subject: [PATCH 104/356] [SPARK-22745][SQL] read partition stats from Hive ## What changes were proposed in this pull request? Currently Spark can read table stats (e.g. `totalSize, numRows`) from Hive, we can also support to read partition stats from Hive using the same logic. ## How was this patch tested? Added a new test case and modified an existing test case. Author: Zhenhua Wang Author: Zhenhua Wang Closes #19932 from wzhfy/read_hive_partition_stats. --- .../spark/sql/hive/HiveExternalCatalog.scala | 6 +- .../sql/hive/client/HiveClientImpl.scala | 66 +++++++++++-------- .../spark/sql/hive/StatisticsSuite.scala | 32 ++++++--- 3 files changed, 62 insertions(+), 42 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala index 44e680dbd2f93..632e3e0c4c3f9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala @@ -668,7 +668,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val schema = restoreTableMetadata(rawTable).schema // convert table statistics to properties so that we can persist them through hive client - var statsProperties = + val statsProperties = if (stats.isDefined) { statsToProperties(stats.get, schema) } else { @@ -1098,14 +1098,14 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat val schema = restoreTableMetadata(rawTable).schema // convert partition statistics to properties so that we can persist them through hive api - val withStatsProps = lowerCasedParts.map(p => { + val withStatsProps = lowerCasedParts.map { p => if (p.stats.isDefined) { val statsProperties = statsToProperties(p.stats.get, schema) p.copy(parameters = p.parameters ++ statsProperties) } else { p } - }) + } // Note: Before altering table partitions in Hive, you *must* set the current database // to the one that contains the table of interest. Otherwise you will end up with the diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 08eb5c74f06d7..7233944dc96dd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -414,32 +414,6 @@ private[hive] class HiveClientImpl( } val comment = properties.get("comment") - // Here we are reading statistics from Hive. - // Note that this statistics could be overridden by Spark's statistics if that's available. - val totalSize = properties.get(StatsSetupConst.TOTAL_SIZE).map(BigInt(_)) - val rawDataSize = properties.get(StatsSetupConst.RAW_DATA_SIZE).map(BigInt(_)) - val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_)) - // TODO: check if this estimate is valid for tables after partition pruning. 
- // NOTE: getting `totalSize` directly from params is kind of hacky, but this should be - // relatively cheap if parameters for the table are populated into the metastore. - // Currently, only totalSize, rawDataSize, and rowCount are used to build the field `stats` - // TODO: stats should include all the other two fields (`numFiles` and `numPartitions`). - // (see StatsSetupConst in Hive) - val stats = - // When table is external, `totalSize` is always zero, which will influence join strategy. - // So when `totalSize` is zero, use `rawDataSize` instead. When `rawDataSize` is also zero, - // return None. - // In Hive, when statistics gathering is disabled, `rawDataSize` and `numRows` is always - // zero after INSERT command. So they are used here only if they are larger than zero. - if (totalSize.isDefined && totalSize.get > 0L) { - Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount = rowCount.filter(_ > 0))) - } else if (rawDataSize.isDefined && rawDataSize.get > 0) { - Some(CatalogStatistics(sizeInBytes = rawDataSize.get, rowCount = rowCount.filter(_ > 0))) - } else { - // TODO: still fill the rowCount even if sizeInBytes is empty. Might break anything? - None - } - CatalogTable( identifier = TableIdentifier(h.getTableName, Option(h.getDbName)), tableType = h.getTableType match { @@ -478,7 +452,7 @@ private[hive] class HiveClientImpl( // For EXTERNAL_TABLE, the table properties has a particular field "EXTERNAL". This is added // in the function toHiveTable. properties = filteredProperties, - stats = stats, + stats = readHiveStats(properties), comment = comment, // In older versions of Spark(before 2.2.0), we expand the view original text and store // that into `viewExpandedText`, and that should be used in view resolution. So we get @@ -1011,6 +985,11 @@ private[hive] object HiveClientImpl { */ def fromHivePartition(hp: HivePartition): CatalogTablePartition = { val apiPartition = hp.getTPartition + val properties: Map[String, String] = if (hp.getParameters != null) { + hp.getParameters.asScala.toMap + } else { + Map.empty + } CatalogTablePartition( spec = Option(hp.getSpec).map(_.asScala.toMap).getOrElse(Map.empty), storage = CatalogStorageFormat( @@ -1021,8 +1000,37 @@ private[hive] object HiveClientImpl { compressed = apiPartition.getSd.isCompressed, properties = Option(apiPartition.getSd.getSerdeInfo.getParameters) .map(_.asScala.toMap).orNull), - parameters = - if (hp.getParameters() != null) hp.getParameters().asScala.toMap else Map.empty) + parameters = properties, + stats = readHiveStats(properties)) + } + + /** + * Reads statistics from Hive. + * Note that this statistics could be overridden by Spark's statistics if that's available. + */ + private def readHiveStats(properties: Map[String, String]): Option[CatalogStatistics] = { + val totalSize = properties.get(StatsSetupConst.TOTAL_SIZE).map(BigInt(_)) + val rawDataSize = properties.get(StatsSetupConst.RAW_DATA_SIZE).map(BigInt(_)) + val rowCount = properties.get(StatsSetupConst.ROW_COUNT).map(BigInt(_)) + // NOTE: getting `totalSize` directly from params is kind of hacky, but this should be + // relatively cheap if parameters for the table are populated into the metastore. + // Currently, only totalSize, rawDataSize, and rowCount are used to build the field `stats` + // TODO: stats should include all the other two fields (`numFiles` and `numPartitions`). + // (see StatsSetupConst in Hive) + + // When table is external, `totalSize` is always zero, which will influence join strategy. 
+ // So when `totalSize` is zero, use `rawDataSize` instead. When `rawDataSize` is also zero, + // return None. + // In Hive, when statistics gathering is disabled, `rawDataSize` and `numRows` is always + // zero after INSERT command. So they are used here only if they are larger than zero. + if (totalSize.isDefined && totalSize.get > 0L) { + Some(CatalogStatistics(sizeInBytes = totalSize.get, rowCount = rowCount.filter(_ > 0))) + } else if (rawDataSize.isDefined && rawDataSize.get > 0) { + Some(CatalogStatistics(sizeInBytes = rawDataSize.get, rowCount = rowCount.filter(_ > 0))) + } else { + // TODO: still fill the rowCount even if sizeInBytes is empty. Might break anything? + None + } } // Below is the key of table properties for storing Hive-generated statistics diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 13f06a2503656..3af8af0814bb4 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -213,6 +213,27 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto } } + test("SPARK-22745 - read Hive's statistics for partition") { + val tableName = "hive_stats_part_table" + withTable(tableName) { + sql(s"CREATE TABLE $tableName (key STRING, value STRING) PARTITIONED BY (ds STRING)") + sql(s"INSERT INTO TABLE $tableName PARTITION (ds='2017-01-01') SELECT * FROM src") + var partition = spark.sessionState.catalog + .getPartition(TableIdentifier(tableName), Map("ds" -> "2017-01-01")) + + assert(partition.stats.get.sizeInBytes == 5812) + assert(partition.stats.get.rowCount.isEmpty) + + hiveClient + .runSqlHive(s"ANALYZE TABLE $tableName PARTITION (ds='2017-01-01') COMPUTE STATISTICS") + partition = spark.sessionState.catalog + .getPartition(TableIdentifier(tableName), Map("ds" -> "2017-01-01")) + + assert(partition.stats.get.sizeInBytes == 5812) + assert(partition.stats.get.rowCount == Some(500)) + } + } + test("SPARK-21079 - analyze table with location different than that of individual partitions") { val tableName = "analyzeTable_part" withTable(tableName) { @@ -353,15 +374,6 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto createPartition("2010-01-02", 11, "SELECT '1', 'A' from src UNION ALL SELECT '1', 'A' from src") - sql(s"ANALYZE TABLE $tableName PARTITION (ds='2010-01-01') COMPUTE STATISTICS NOSCAN") - - assertPartitionStats("2010-01-01", "10", rowCount = None, sizeInBytes = 2000) - assertPartitionStats("2010-01-01", "11", rowCount = None, sizeInBytes = 2000) - assert(queryStats("2010-01-02", "10") === None) - assert(queryStats("2010-01-02", "11") === None) - - sql(s"ANALYZE TABLE $tableName PARTITION (ds='2010-01-02') COMPUTE STATISTICS NOSCAN") - assertPartitionStats("2010-01-01", "10", rowCount = None, sizeInBytes = 2000) assertPartitionStats("2010-01-01", "11", rowCount = None, sizeInBytes = 2000) assertPartitionStats("2010-01-02", "10", rowCount = None, sizeInBytes = 2000) @@ -631,7 +643,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto """.stripMargin) sql(s"INSERT INTO TABLE $tabName SELECT * FROM src") if (analyzedBySpark) sql(s"ANALYZE TABLE $tabName COMPUTE STATISTICS") - // This is to mimic the scenario in which Hive genrates statistics before we reading it + // This is to mimic the scenario in which Hive generates statistics before we read it if 
(analyzedByHive) hiveClient.runSqlHive(s"ANALYZE TABLE $tabName COMPUTE STATISTICS") val describeResult1 = hiveClient.runSqlHive(s"DESCRIBE FORMATTED $tabName") From 58f7c825ae9dd2e3f548a9b7b4a9704f970dde5b Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 13 Dec 2017 07:52:21 -0600 Subject: [PATCH 105/356] [SPARK-20849][DOC][FOLLOWUP] Document R DecisionTree - Link Classification Example ## What changes were proposed in this pull request? in https://github.com/apache/spark/pull/18067, only the regression example is linked this pr link decision tree classification example to the doc ping felixcheung ## How was this patch tested? local build of docs ![default](https://user-images.githubusercontent.com/7322292/33922857-9b00fdd0-e008-11e7-92c2-85a3de52ea8f.png) Author: Zheng RuiFeng Closes #19963 from zhengruifeng/r_examples. --- docs/ml-classification-regression.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index 083df2e405d62..bf979f3c73a52 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -223,6 +223,14 @@ More details on parameters can be found in the [Python API documentation](api/py +
+ +Refer to the [R API docs](api/R/spark.decisionTree.html) for more details. + +{% include_example classification r/ml/decisionTree.R %} + +
+ ## Random forest classifier From f6bcd3e53fe55bab38383cce8d0340f6c6191972 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 14 Dec 2017 01:16:44 +0800 Subject: [PATCH 106/356] [SPARK-22767][SQL] use ctx.addReferenceObj in InSet and ScalaUDF ## What changes were proposed in this pull request? We should not operate on `references` directly in `Expression.doGenCode`, instead we should use the high-level API `addReferenceObj`. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #19962 from cloud-fan/codegen. --- .../sql/catalyst/expressions/ScalaUDF.scala | 76 +++++-------------- .../sql/catalyst/expressions/predicates.scala | 20 ++--- .../catalyst/expressions/ScalaUDFSuite.scala | 3 +- .../catalyst/optimizer/OptimizeInSuite.scala | 2 +- 4 files changed, 28 insertions(+), 73 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala index a3cf7612016b5..388ef42883ad3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDF.scala @@ -76,11 +76,6 @@ case class ScalaUDF( }.foreach(println) */ - - // Accessors used in genCode - def userDefinedFunc(): AnyRef = function - def getChildren(): Seq[Expression] = children - private[this] val f = children.size match { case 0 => val func = function.asInstanceOf[() => Any] @@ -981,41 +976,19 @@ case class ScalaUDF( } // scalastyle:on line.size.limit - - private val converterClassName = classOf[Any => Any].getName - private val scalaUDFClassName = classOf[ScalaUDF].getName - private val typeConvertersClassName = CatalystTypeConverters.getClass.getName + ".MODULE$" - - // Generate codes used to convert the arguments to Scala type for user-defined functions - private[this] def genCodeForConverter(ctx: CodegenContext, index: Int): (String, String) = { - val converterTerm = ctx.freshName("converter") - val expressionIdx = ctx.references.size - 1 - (converterTerm, - s"$converterClassName $converterTerm = ($converterClassName)$typeConvertersClassName" + - s".createToScalaConverter(((Expression)((($scalaUDFClassName)" + - s"references[$expressionIdx]).getChildren().apply($index))).dataType());") - } - override def doGenCode( ctx: CodegenContext, ev: ExprCode): ExprCode = { - val scalaUDF = ctx.freshName("scalaUDF") - val scalaUDFRef = ctx.addReferenceObj("scalaUDFRef", this, scalaUDFClassName) - - // Object to convert the returned value of user-defined functions to Catalyst type - val catalystConverterTerm = ctx.freshName("catalystConverter") + val converterClassName = classOf[Any => Any].getName + // The type converters for inputs and the result. 
+ val converters: Array[Any => Any] = children.map { c => + CatalystTypeConverters.createToScalaConverter(c.dataType) + }.toArray :+ CatalystTypeConverters.createToCatalystConverter(dataType) + val convertersTerm = ctx.addReferenceObj("converters", converters, s"$converterClassName[]") + val errorMsgTerm = ctx.addReferenceObj("errMsg", udfErrorMessage) val resultTerm = ctx.freshName("result") - // This must be called before children expressions' codegen - // because ctx.references is used in genCodeForConverter - val converterTerms = children.indices.map(genCodeForConverter(ctx, _)) - - // Initialize user-defined function - val funcClassName = s"scala.Function${children.size}" - - val funcTerm = ctx.freshName("udf") - // codegen for children expressions val evals = children.map(_.genCode(ctx)) @@ -1023,38 +996,31 @@ case class ScalaUDF( // We need to get the boxedType of dataType's javaType here. Because for the dataType // such as IntegerType, its javaType is `int` and the returned type of user-defined // function is Object. Trying to convert an Object to `int` will cause casting exception. - val evalCode = evals.map(_.code).mkString - val (converters, funcArguments) = converterTerms.zipWithIndex.map { - case ((convName, convInit), i) => - val eval = evals(i) - val argTerm = ctx.freshName("arg") - val convert = - s""" - |$convInit - |Object $argTerm = ${eval.isNull} ? null : $convName.apply(${eval.value}); - """.stripMargin - (convert, argTerm) + val evalCode = evals.map(_.code).mkString("\n") + val (funcArgs, initArgs) = evals.zipWithIndex.map { case (eval, i) => + val argTerm = ctx.freshName("arg") + val convert = s"$convertersTerm[$i].apply(${eval.value})" + val initArg = s"Object $argTerm = ${eval.isNull} ? null : $convert;" + (argTerm, initArg) }.unzip - val getFuncResult = s"$funcTerm.apply(${funcArguments.mkString(", ")})" + val udf = ctx.addReferenceObj("udf", function, s"scala.Function${children.length}") + val getFuncResult = s"$udf.apply(${funcArgs.mkString(", ")})" + val resultConverter = s"$convertersTerm[${children.length}]" val callFunc = s""" |${ctx.boxedType(dataType)} $resultTerm = null; - |$scalaUDFClassName $scalaUDF = $scalaUDFRef; |try { - | $funcClassName $funcTerm = ($funcClassName)$scalaUDF.userDefinedFunc(); - | $converterClassName $catalystConverterTerm = ($converterClassName) - | $typeConvertersClassName.createToCatalystConverter($scalaUDF.dataType()); - | $resultTerm = (${ctx.boxedType(dataType)})$catalystConverterTerm.apply($getFuncResult); + | $resultTerm = (${ctx.boxedType(dataType)})$resultConverter.apply($getFuncResult); |} catch (Exception e) { - | throw new org.apache.spark.SparkException($scalaUDF.udfErrorMessage(), e); + | throw new org.apache.spark.SparkException($errorMsgTerm, e); |} """.stripMargin ev.copy(code = s""" |$evalCode - |${converters.mkString("\n")} + |${initArgs.mkString("\n")} |$callFunc | |boolean ${ev.isNull} = $resultTerm == null; @@ -1065,7 +1031,7 @@ case class ScalaUDF( """.stripMargin) } - private[this] val converter = CatalystTypeConverters.createToCatalystConverter(dataType) + private[this] val resultConverter = CatalystTypeConverters.createToCatalystConverter(dataType) lazy val udfErrorMessage = { val funcCls = function.getClass.getSimpleName @@ -1081,6 +1047,6 @@ case class ScalaUDF( throw new SparkException(udfErrorMessage, e) } - converter(result) + resultConverter(result) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index 8eb41addaf689..ac9f56f78eb2e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -328,7 +328,7 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with } } - @transient private[this] lazy val set = child.dataType match { + @transient lazy val set: Set[Any] = child.dataType match { case _: AtomicType => hset case _: NullType => hset case _ => @@ -336,20 +336,11 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ hset } - def getSet(): Set[Any] = set - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val setName = classOf[Set[Any]].getName - val InSetName = classOf[InSet].getName + val setTerm = ctx.addReferenceObj("set", set) val childGen = child.genCode(ctx) - ctx.references += this - val setTerm = ctx.freshName("set") - val setNull = if (hasNull) { - s""" - |if (!${ev.value}) { - | ${ev.isNull} = true; - |} - """.stripMargin + val setIsNull = if (hasNull) { + s"${ev.isNull} = !${ev.value};" } else { "" } @@ -359,9 +350,8 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with |${ctx.JAVA_BOOLEAN} ${ev.isNull} = ${childGen.isNull}; |${ctx.JAVA_BOOLEAN} ${ev.value} = false; |if (!${ev.isNull}) { - | $setName $setTerm = (($InSetName)references[${ctx.references.size - 1}]).getSet(); | ${ev.value} = $setTerm.contains(${childGen.value}); - | $setNull + | $setIsNull |} """.stripMargin) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala index b0687fe900d3d..70dea4b39d55d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala @@ -51,7 +51,6 @@ class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-22695: ScalaUDF should not use global variables") { val ctx = new CodegenContext ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil).genCode(ctx) - // ScalaUDF can be very verbose and trigger reduceCodeSize - assert(ctx.mutableStates.forall(_._2.startsWith("globalIsNull"))) + assert(ctx.mutableStates.isEmpty) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala index d7acd139225cd..478118ed709f7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizeInSuite.scala @@ -169,7 +169,7 @@ class OptimizeInSuite extends PlanTest { val optimizedPlan = OptimizeIn(plan) optimizedPlan match { case Filter(cond, _) - if cond.isInstanceOf[InSet] && cond.asInstanceOf[InSet].getSet().size == 3 => + if cond.isInstanceOf[InSet] && cond.asInstanceOf[InSet].set.size == 3 => // pass case _ => fail("Unexpected result for OptimizedIn") } From 8eb5609d8d961e54aa1ed0632f15f5e570fa627a Mon Sep 17 00:00:00 2001 From: zhoukang Date: Wed, 13 Dec 2017 11:47:33 -0800 Subject: [PATCH 107/356] 
=?UTF-8?q?[SPARK-22754][DEPLOY]=20Check=20whether?= =?UTF-8?q?=20spark.executor.heartbeatInterval=20bigger=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … than spark.network.timeout or not ## What changes were proposed in this pull request? If spark.executor.heartbeatInterval bigger than spark.network.timeout,it will almost always cause exception below. `Job aborted due to stage failure: Task 4763 in stage 3.0 failed 4 times, most recent failure: Lost task 4763.3 in stage 3.0 (TID 22383, executor id: 4761, host: xxx): ExecutorLostFailure (executor 4761 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 154022 ms` Since many users do not get that point.He will set spark.executor.heartbeatInterval incorrectly. This patch check this case when submit applications. ## How was this patch tested? Test in cluster Author: zhoukang Closes #19942 from caneGuy/zhoukang/check-heartbeat. --- core/src/main/scala/org/apache/spark/SparkConf.scala | 8 ++++++++ .../test/scala/org/apache/spark/SparkConfSuite.scala | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 4b1286d91e8f3..d77303e6fdf8b 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -564,6 +564,14 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria val encryptionEnabled = get(NETWORK_ENCRYPTION_ENABLED) || get(SASL_ENCRYPTION_ENABLED) require(!encryptionEnabled || get(NETWORK_AUTH_ENABLED), s"${NETWORK_AUTH_ENABLED.key} must be enabled when enabling encryption.") + + val executorTimeoutThreshold = getTimeAsSeconds("spark.network.timeout", "120s") + val executorHeartbeatInterval = getTimeAsSeconds("spark.executor.heartbeatInterval", "10s") + // If spark.executor.heartbeatInterval bigger than spark.network.timeout, + // it will almost always cause ExecutorLostFailure. See SPARK-22754. + require(executorTimeoutThreshold > executorHeartbeatInterval, "The value of " + + s"spark.network.timeout=${executorTimeoutThreshold}s must be no less than the value of " + + s"spark.executor.heartbeatInterval=${executorHeartbeatInterval}s.") } /** diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala index c771eb4ee3ef5..bff808eb540ac 100644 --- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala @@ -329,6 +329,16 @@ class SparkConfSuite extends SparkFunSuite with LocalSparkContext with ResetSyst conf.validateSettings() } + test("spark.network.timeout should bigger than spark.executor.heartbeatInterval") { + val conf = new SparkConf() + conf.validateSettings() + + conf.set("spark.network.timeout", "5s") + intercept[IllegalArgumentException] { + conf.validateSettings() + } + } + } class Class1 {} From c5a4701acc6972ed7ccb11c506fe718d5503f140 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 13 Dec 2017 11:50:04 -0800 Subject: [PATCH 108/356] Revert "[SPARK-21417][SQL] Infer join conditions using propagated constraints" This reverts commit 6ac57fd0d1c82b834eb4bf0dd57596b92a99d6de. 
--- .../expressions/EquivalentExpressionMap.scala | 66 ----- .../catalyst/expressions/ExpressionSet.scala | 2 - .../sql/catalyst/optimizer/Optimizer.scala | 1 - .../spark/sql/catalyst/optimizer/joins.scala | 60 ----- .../EquivalentExpressionMapSuite.scala | 56 ----- .../optimizer/EliminateCrossJoinSuite.scala | 238 ------------------ 6 files changed, 423 deletions(-) delete mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMap.scala delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMapSuite.scala delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateCrossJoinSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMap.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMap.scala deleted file mode 100644 index cf1614afb1a76..0000000000000 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMap.scala +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.expressions - -import scala.collection.mutable - -import org.apache.spark.sql.catalyst.expressions.EquivalentExpressionMap.SemanticallyEqualExpr - -/** - * A class that allows you to map an expression into a set of equivalent expressions. The keys are - * handled based on their semantic meaning and ignoring cosmetic differences. The values are - * represented as [[ExpressionSet]]s. - * - * The underlying representation of keys depends on the [[Expression.semanticHash]] and - * [[Expression.semanticEquals]] methods. 
- * - * {{{ - * val map = new EquivalentExpressionMap() - * - * map.put(1 + 2, a) - * map.put(rand(), b) - * - * map.get(2 + 1) => Set(a) // 1 + 2 and 2 + 1 are semantically equivalent - * map.get(1 + 2) => Set(a) // 1 + 2 and 2 + 1 are semantically equivalent - * map.get(rand()) => Set() // non-deterministic expressions are not equivalent - * }}} - */ -class EquivalentExpressionMap { - - private val equivalenceMap = mutable.HashMap.empty[SemanticallyEqualExpr, ExpressionSet] - - def put(expression: Expression, equivalentExpression: Expression): Unit = { - val equivalentExpressions = equivalenceMap.getOrElseUpdate(expression, ExpressionSet.empty) - equivalenceMap(expression) = equivalentExpressions + equivalentExpression - } - - def get(expression: Expression): Set[Expression] = - equivalenceMap.getOrElse(expression, ExpressionSet.empty) -} - -object EquivalentExpressionMap { - - private implicit class SemanticallyEqualExpr(val expr: Expression) { - override def equals(obj: Any): Boolean = obj match { - case other: SemanticallyEqualExpr => expr.semanticEquals(other.expr) - case _ => false - } - - override def hashCode: Int = expr.semanticHash() - } -} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala index e9890837af07d..7e8e7b8cd5f18 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSet.scala @@ -27,8 +27,6 @@ object ExpressionSet { expressions.foreach(set.add) set } - - val empty: ExpressionSet = ExpressionSet(Nil) } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 577693506ed34..5acadf8cf330e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -87,7 +87,6 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) PushProjectionThroughUnion, ReorderJoin, EliminateOuterJoin, - EliminateCrossJoin, InferFiltersFromConstraints, BooleanSimplification, PushPredicateThroughJoin, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala index 29a3a7f109b80..edbeaf273fd6f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.catalyst.optimizer import scala.annotation.tailrec -import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.ExtractFiltersAndInnerJoins @@ -153,62 +152,3 @@ object EliminateOuterJoin extends Rule[LogicalPlan] with PredicateHelper { if (j.joinType == newJoinType) f else Filter(condition, j.copy(joinType = newJoinType)) } } - -/** - * A rule that eliminates CROSS joins by inferring join conditions from propagated constraints. - * - * The optimization is applicable only to CROSS joins. 
For other join types, adding inferred join - * conditions would potentially shuffle children as child node's partitioning won't satisfy the JOIN - * node's requirements which otherwise could have. - * - * For instance, given a CROSS join with the constraint 'a = 1' from the left child and the - * constraint 'b = 1' from the right child, this rule infers a new join predicate 'a = b' and - * converts it to an Inner join. - */ -object EliminateCrossJoin extends Rule[LogicalPlan] with PredicateHelper { - - def apply(plan: LogicalPlan): LogicalPlan = { - if (SQLConf.get.constraintPropagationEnabled) { - eliminateCrossJoin(plan) - } else { - plan - } - } - - private def eliminateCrossJoin(plan: LogicalPlan): LogicalPlan = plan transform { - case join @ Join(leftPlan, rightPlan, Cross, None) => - val leftConstraints = join.constraints.filter(_.references.subsetOf(leftPlan.outputSet)) - val rightConstraints = join.constraints.filter(_.references.subsetOf(rightPlan.outputSet)) - val inferredJoinPredicates = inferJoinPredicates(leftConstraints, rightConstraints) - val joinConditionOpt = inferredJoinPredicates.reduceOption(And) - if (joinConditionOpt.isDefined) Join(leftPlan, rightPlan, Inner, joinConditionOpt) else join - } - - private def inferJoinPredicates( - leftConstraints: Set[Expression], - rightConstraints: Set[Expression]): mutable.Set[EqualTo] = { - - val equivalentExpressionMap = new EquivalentExpressionMap() - - leftConstraints.foreach { - case EqualTo(attr: Attribute, expr: Expression) => - equivalentExpressionMap.put(expr, attr) - case EqualTo(expr: Expression, attr: Attribute) => - equivalentExpressionMap.put(expr, attr) - case _ => - } - - val joinConditions = mutable.Set.empty[EqualTo] - - rightConstraints.foreach { - case EqualTo(attr: Attribute, expr: Expression) => - joinConditions ++= equivalentExpressionMap.get(expr).map(EqualTo(attr, _)) - case EqualTo(expr: Expression, attr: Attribute) => - joinConditions ++= equivalentExpressionMap.get(expr).map(EqualTo(attr, _)) - case _ => - } - - joinConditions - } - -} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMapSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMapSuite.scala deleted file mode 100644 index bad7e17bb6cf2..0000000000000 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/EquivalentExpressionMapSuite.scala +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.catalyst.expressions - -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.dsl.expressions._ - -class EquivalentExpressionMapSuite extends SparkFunSuite { - - private val onePlusTwo = Literal(1) + Literal(2) - private val twoPlusOne = Literal(2) + Literal(1) - private val rand = Rand(10) - - test("behaviour of the equivalent expression map") { - val equivalentExpressionMap = new EquivalentExpressionMap() - equivalentExpressionMap.put(onePlusTwo, 'a) - equivalentExpressionMap.put(Literal(1) + Literal(3), 'b) - equivalentExpressionMap.put(rand, 'c) - - // 1 + 2 should be equivalent to 2 + 1 - assertResult(ExpressionSet(Seq('a)))(equivalentExpressionMap.get(twoPlusOne)) - // non-deterministic expressions should not be equivalent - assertResult(ExpressionSet.empty)(equivalentExpressionMap.get(rand)) - - // if the same (key, value) is added several times, the map still returns only one entry - equivalentExpressionMap.put(onePlusTwo, 'a) - equivalentExpressionMap.put(twoPlusOne, 'a) - assertResult(ExpressionSet(Seq('a)))(equivalentExpressionMap.get(twoPlusOne)) - - // get several equivalent attributes - equivalentExpressionMap.put(onePlusTwo, 'e) - assertResult(ExpressionSet(Seq('a, 'e)))(equivalentExpressionMap.get(onePlusTwo)) - assertResult(2)(equivalentExpressionMap.get(onePlusTwo).size) - - // several non-deterministic expressions should not be equivalent - equivalentExpressionMap.put(rand, 'd) - assertResult(ExpressionSet.empty)(equivalentExpressionMap.get(rand)) - assertResult(0)(equivalentExpressionMap.get(rand).size) - } - -} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateCrossJoinSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateCrossJoinSuite.scala deleted file mode 100644 index e04dd28ee36a0..0000000000000 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/EliminateCrossJoinSuite.scala +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.catalyst.optimizer - -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, Literal, Not, Rand} -import org.apache.spark.sql.catalyst.plans.{Cross, Inner, JoinType, PlanTest} -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} -import org.apache.spark.sql.catalyst.rules.RuleExecutor -import org.apache.spark.sql.internal.SQLConf.CONSTRAINT_PROPAGATION_ENABLED -import org.apache.spark.sql.types.IntegerType - -class EliminateCrossJoinSuite extends PlanTest { - - object Optimize extends RuleExecutor[LogicalPlan] { - val batches = - Batch("Eliminate cross joins", FixedPoint(10), - EliminateCrossJoin, - PushPredicateThroughJoin) :: Nil - } - - val testRelation1 = LocalRelation('a.int, 'b.int) - val testRelation2 = LocalRelation('c.int, 'd.int) - - test("successful elimination of cross joins (1)") { - checkJoinOptimization( - originalFilter = 'a === 1 && 'c === 1 && 'd === 1, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = 'a === 1, - expectedRightRelationFilter = 'c === 1 && 'd === 1, - expectedJoinType = Inner, - expectedJoinCondition = Some('a === 'c && 'a === 'd)) - } - - test("successful elimination of cross joins (2)") { - checkJoinOptimization( - originalFilter = 'a === 1 && 'b === 2 && 'd === 1, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = 'a === 1 && 'b === 2, - expectedRightRelationFilter = 'd === 1, - expectedJoinType = Inner, - expectedJoinCondition = Some('a === 'd)) - } - - test("successful elimination of cross joins (3)") { - // PushPredicateThroughJoin will push 'd === 'a into the join condition - // EliminateCrossJoin will NOT apply because the condition will be already present - // therefore, the join type will stay the same (i.e., CROSS) - checkJoinOptimization( - originalFilter = 'a === 1 && Literal(1) === 'd && 'd === 'a, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = 'a === 1, - expectedRightRelationFilter = Literal(1) === 'd, - expectedJoinType = Cross, - expectedJoinCondition = Some('a === 'd)) - } - - test("successful elimination of cross joins (4)") { - // Literal(1) * Literal(2) and Literal(2) * Literal(1) are semantically equal - checkJoinOptimization( - originalFilter = 'a === Literal(1) * Literal(2) && Literal(2) * Literal(1) === 'c, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = 'a === Literal(1) * Literal(2), - expectedRightRelationFilter = Literal(2) * Literal(1) === 'c, - expectedJoinType = Inner, - expectedJoinCondition = Some('a === 'c)) - } - - test("successful elimination of cross joins (5)") { - checkJoinOptimization( - originalFilter = 'a === 1 && Literal(1) === 'a && 'c === 1, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = 'a === 1 && Literal(1) === 'a, - expectedRightRelationFilter = 'c === 1, - expectedJoinType = Inner, - expectedJoinCondition = Some('a === 'c)) - } - - test("successful elimination of cross joins (6)") { - checkJoinOptimization( - originalFilter = 'a === Cast("1", IntegerType) && 'c === Cast("1", IntegerType) && 'd === 1, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - 
expectedLeftRelationFilter = 'a === Cast("1", IntegerType), - expectedRightRelationFilter = 'c === Cast("1", IntegerType) && 'd === 1, - expectedJoinType = Inner, - expectedJoinCondition = Some('a === 'c)) - } - - test("successful elimination of cross joins (7)") { - // The join condition appears due to PushPredicateThroughJoin - checkJoinOptimization( - originalFilter = (('a >= 1 && 'c === 1) || 'd === 10) && 'b === 10 && 'c === 1, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = 'b === 10, - expectedRightRelationFilter = 'c === 1, - expectedJoinType = Cross, - expectedJoinCondition = Some(('a >= 1 && 'c === 1) || 'd === 10)) - } - - test("successful elimination of cross joins (8)") { - checkJoinOptimization( - originalFilter = 'a === 1 && 'c === 1 && Literal(1) === 'a && Literal(1) === 'c, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = 'a === 1 && Literal(1) === 'a, - expectedRightRelationFilter = 'c === 1 && Literal(1) === 'c, - expectedJoinType = Inner, - expectedJoinCondition = Some('a === 'c)) - } - - test("inability to detect join conditions when constant propagation is disabled") { - withSQLConf(CONSTRAINT_PROPAGATION_ENABLED.key -> "false") { - checkJoinOptimization( - originalFilter = 'a === 1 && 'c === 1 && 'd === 1, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = 'a === 1, - expectedRightRelationFilter = 'c === 1 && 'd === 1, - expectedJoinType = Cross, - expectedJoinCondition = None) - } - } - - test("inability to detect join conditions (1)") { - checkJoinOptimization( - originalFilter = 'a >= 1 && 'c === 1 && 'd >= 1, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = 'a >= 1, - expectedRightRelationFilter = 'c === 1 && 'd >= 1, - expectedJoinType = Cross, - expectedJoinCondition = None) - } - - test("inability to detect join conditions (2)") { - checkJoinOptimization( - originalFilter = Literal(1) === 'b && ('c === 1 || 'd === 1), - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = Literal(1) === 'b, - expectedRightRelationFilter = 'c === 1 || 'd === 1, - expectedJoinType = Cross, - expectedJoinCondition = None) - } - - test("inability to detect join conditions (3)") { - checkJoinOptimization( - originalFilter = Literal(1) === 'b && 'c === 1, - originalJoinType = Cross, - originalJoinCondition = Some('c === 'b), - expectedFilter = None, - expectedLeftRelationFilter = Literal(1) === 'b, - expectedRightRelationFilter = 'c === 1, - expectedJoinType = Cross, - expectedJoinCondition = Some('c === 'b)) - } - - test("inability to detect join conditions (4)") { - checkJoinOptimization( - originalFilter = Not('a === 1) && 'd === 1, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = None, - expectedLeftRelationFilter = Not('a === 1), - expectedRightRelationFilter = 'd === 1, - expectedJoinType = Cross, - expectedJoinCondition = None) - } - - test("inability to detect join conditions (5)") { - checkJoinOptimization( - originalFilter = 'a === Rand(10) && 'b === 1 && 'd === Rand(10) && 'c === 3, - originalJoinType = Cross, - originalJoinCondition = None, - expectedFilter = Some('a === Rand(10) && 'd === Rand(10)), - expectedLeftRelationFilter = 'b === 1, - expectedRightRelationFilter = 'c === 3, - expectedJoinType = Cross, - 
expectedJoinCondition = None) - } - - private def checkJoinOptimization( - originalFilter: Expression, - originalJoinType: JoinType, - originalJoinCondition: Option[Expression], - expectedFilter: Option[Expression], - expectedLeftRelationFilter: Expression, - expectedRightRelationFilter: Expression, - expectedJoinType: JoinType, - expectedJoinCondition: Option[Expression]): Unit = { - - val originalQuery = testRelation1 - .join(testRelation2, originalJoinType, originalJoinCondition) - .where(originalFilter) - val optimizedQuery = Optimize.execute(originalQuery.analyze) - - val left = testRelation1.where(expectedLeftRelationFilter) - val right = testRelation2.where(expectedRightRelationFilter) - val join = left.join(right, expectedJoinType, expectedJoinCondition) - val expectedQuery = expectedFilter.fold(join)(join.where(_)).analyze - - comparePlans(optimizedQuery, expectedQuery) - } -} From 1abcbed678c2bc4f05640db2791fd2d84267d740 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Wed, 13 Dec 2017 11:54:22 -0800 Subject: [PATCH 109/356] [SPARK-22763][CORE] SHS: Ignore unknown events and parse through the file ## What changes were proposed in this pull request? While spark code changes, there are new events in event log: #19649 And we used to maintain a whitelist to avoid exceptions: #15663 Currently Spark history server will stop parsing on unknown events or unrecognized properties. We may still see part of the UI data. For better compatibility, we can ignore unknown events and parse through the log file. ## How was this patch tested? Unit test Author: Wang Gengliang Closes #19953 from gengliangwang/ReplayListenerBus. --- .../spark/scheduler/ReplayListenerBus.scala | 37 +++++++++---------- .../spark/scheduler/ReplayListenerSuite.scala | 29 +++++++++++++++ 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala index 26a6a3effc9ac..c9cd662f5709d 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/ReplayListenerBus.scala @@ -69,6 +69,8 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { eventsFilter: ReplayEventsFilter): Unit = { var currentLine: String = null var lineNumber: Int = 0 + val unrecognizedEvents = new scala.collection.mutable.HashSet[String] + val unrecognizedProperties = new scala.collection.mutable.HashSet[String] try { val lineEntries = lines @@ -84,16 +86,22 @@ private[spark] class ReplayListenerBus extends SparkListenerBus with Logging { postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine))) } catch { - case e: ClassNotFoundException if KNOWN_REMOVED_CLASSES.contains(e.getMessage) => - // Ignore events generated by Structured Streaming in Spark 2.0.0 and 2.0.1. - // It's safe since no place uses them. - logWarning(s"Dropped incompatible Structured Streaming log: $currentLine") - case e: UnrecognizedPropertyException if e.getMessage != null && e.getMessage.startsWith( - "Unrecognized field \"queryStatus\" " + - "(class org.apache.spark.sql.streaming.StreamingQueryListener$") => - // Ignore events generated by Structured Streaming in Spark 2.0.2 - // It's safe since no place uses them. - logWarning(s"Dropped incompatible Structured Streaming log: $currentLine") + case e: ClassNotFoundException => + // Ignore unknown events, parse through the event log file. 
+ // To avoid spamming, warnings are only displayed once for each unknown event. + if (!unrecognizedEvents.contains(e.getMessage)) { + logWarning(s"Drop unrecognized event: ${e.getMessage}") + unrecognizedEvents.add(e.getMessage) + } + logDebug(s"Drop incompatible event log: $currentLine") + case e: UnrecognizedPropertyException => + // Ignore unrecognized properties, parse through the event log file. + // To avoid spamming, warnings are only displayed once for each unrecognized property. + if (!unrecognizedProperties.contains(e.getMessage)) { + logWarning(s"Drop unrecognized property: ${e.getMessage}") + unrecognizedProperties.add(e.getMessage) + } + logDebug(s"Drop incompatible event log: $currentLine") case jpe: JsonParseException => // We can only ignore exception from last line of the file that might be truncated // the last entry may not be the very last line in the event log, but we treat it @@ -125,13 +133,4 @@ private[spark] object ReplayListenerBus { // utility filter that selects all event logs during replay val SELECT_ALL_FILTER: ReplayEventsFilter = { (eventString: String) => true } - - /** - * Classes that were removed. Structured Streaming doesn't use them any more. However, parsing - * old json may fail and we can just ignore these failures. - */ - val KNOWN_REMOVED_CLASSES = Set( - "org.apache.spark.sql.streaming.StreamingQueryListener$QueryProgress", - "org.apache.spark.sql.streaming.StreamingQueryListener$QueryTerminated" - ) } diff --git a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala index d17e3864854a8..73e7b3fe8c1de 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/ReplayListenerSuite.scala @@ -128,6 +128,35 @@ class ReplayListenerSuite extends SparkFunSuite with BeforeAndAfter with LocalSp } } + test("Replay incompatible event log") { + val logFilePath = Utils.getFilePath(testDir, "incompatible.txt") + val fstream = fileSystem.create(logFilePath) + val writer = new PrintWriter(fstream) + val applicationStart = SparkListenerApplicationStart("Incompatible App", None, + 125L, "UserUsingIncompatibleVersion", None) + val applicationEnd = SparkListenerApplicationEnd(1000L) + // scalastyle:off println + writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationStart)))) + writer.println("""{"Event":"UnrecognizedEventOnlyForTest","Timestamp":1477593059313}""") + writer.println(compact(render(JsonProtocol.sparkEventToJson(applicationEnd)))) + // scalastyle:on println + writer.close() + + val conf = EventLoggingListenerSuite.getLoggingConf(logFilePath) + val logData = fileSystem.open(logFilePath) + val eventMonster = new EventMonster(conf) + try { + val replayer = new ReplayListenerBus() + replayer.addListener(eventMonster) + replayer.replay(logData, logFilePath.toString) + } finally { + logData.close() + } + assert(eventMonster.loggedEvents.size === 2) + assert(eventMonster.loggedEvents(0) === JsonProtocol.sparkEventToJson(applicationStart)) + assert(eventMonster.loggedEvents(1) === JsonProtocol.sparkEventToJson(applicationEnd)) + } + // This assumes the correctness of EventLoggingListener test("End-to-end replay") { testApplicationReplay() From 0bdb4e516c425ea7bf941106ac6449b5a0a289e3 Mon Sep 17 00:00:00 2001 From: German Schiavon Date: Wed, 13 Dec 2017 13:37:25 -0800 Subject: [PATCH 110/356] [SPARK-22574][MESOS][SUBMIT] Check submission request parameters ## What changes 
were proposed in this pull request? PR closed with all the comments -> https://github.com/apache/spark/pull/19793 It solves the problem when submitting a wrong CreateSubmissionRequest to Spark Dispatcher was causing a bad state of Dispatcher and making it inactive as a Mesos framework. https://issues.apache.org/jira/browse/SPARK-22574 ## How was this patch tested? All spark test passed successfully. It was tested sending a wrong request (without appArgs) before and after the change. The point is easy, check if the value is null before being accessed. This was before the change, leaving the dispatcher inactive: ``` Exception in thread "Thread-22" java.lang.NullPointerException at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler.getDriverCommandValue(MesosClusterScheduler.scala:444) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler.buildDriverCommand(MesosClusterScheduler.scala:451) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler.org$apache$spark$scheduler$cluster$mesos$MesosClusterScheduler$$createTaskInfo(MesosClusterScheduler.scala:538) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler$$anonfun$scheduleTasks$1.apply(MesosClusterScheduler.scala:570) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler$$anonfun$scheduleTasks$1.apply(MesosClusterScheduler.scala:555) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler.scheduleTasks(MesosClusterScheduler.scala:555) at org.apache.spark.scheduler.cluster.mesos.MesosClusterScheduler.resourceOffers(MesosClusterScheduler.scala:621) ``` And after: ``` "message" : "Malformed request: org.apache.spark.deploy.rest.SubmitRestProtocolException: Validation of message CreateSubmissionRequest 
failed!\n\torg.apache.spark.deploy.rest.SubmitRestProtocolMessage.validate(SubmitRestProtocolMessage.scala:70)\n\torg.apache.spark.deploy.rest.SubmitRequestServlet.doPost(RestSubmissionServer.scala:272)\n\tjavax.servlet.http.HttpServlet.service(HttpServlet.java:707)\n\tjavax.servlet.http.HttpServlet.service(HttpServlet.java:790)\n\torg.spark_project.jetty.servlet.ServletHolder.handle(ServletHolder.java:845)\n\torg.spark_project.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:583)\n\torg.spark_project.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1180)\n\torg.spark_project.jetty.servlet.ServletHandler.doScope(ServletHandler.java:511)\n\torg.spark_project.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1112)\n\torg.spark_project.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)\n\torg.spark_project.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:134)\n\torg.spark_project.jetty.server.Server.handle(Server.java:524)\n\torg.spark_project.jetty.server.HttpChannel.handle(HttpChannel.java:319)\n\torg.spark_project.jetty.server.HttpConnection.onFillable(HttpConnection.java:253)\n\torg.spark_project.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:273)\n\torg.spark_project.jetty.io.FillInterest.fillable(FillInterest.java:95)\n\torg.spark_project.jetty.io.SelectChannelEndPoint$2.run(SelectChannelEndPoint.java:93)\n\torg.spark_project.jetty.util.thread.strategy.ExecuteProduceConsume.executeProduceConsume(ExecuteProduceConsume.java:303)\n\torg.spark_project.jetty.util.thread.strategy.ExecuteProduceConsume.produceConsume(ExecuteProduceConsume.java:148)\n\torg.spark_project.jetty.util.thread.strategy.ExecuteProduceConsume.run(ExecuteProduceConsume.java:136)\n\torg.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:671)\n\torg.spark_project.jetty.util.thread.QueuedThreadPool$2.run(QueuedThreadPool.java:589)\n\tjava.lang.Thread.run(Thread.java:745)" ``` Author: German Schiavon Closes #19966 from Gschiavon/fix-submission-request. 
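The idea behind the fix, shown in the diff below, is to fail fast with a descriptive error instead of letting a null request field surface later as a NullPointerException inside the scheduler. A minimal standalone sketch of that guard pattern follows; the class and field names here are hypothetical stand-ins, not the actual Spark REST protocol classes:

```scala
// Sketch only: illustrates the fail-fast null-guard pattern applied in the patch below.
// `Request` and `MissingFieldException` are hypothetical stand-ins.
final class MissingFieldException(msg: String) extends Exception(msg)

// Fields may arrive as null when the JSON request omits them.
final case class Request(appResource: String, appArgs: Array[String])

object RequestValidator {
  // Wrap the possibly-null field in Option and fail fast with a clear message.
  def requireField[T](value: T, name: String): T =
    Option(value).getOrElse(
      throw new MissingFieldException(s"Field '$name' is missing."))

  def validate(req: Request): Unit = {
    requireField(req.appResource, "appResource")
    requireField(req.appArgs, "appArgs")
  }
}
```

The validation rejects a malformed submission at the REST boundary, so the dispatcher never reaches the scheduling code path with a half-populated request.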
--- .../deploy/rest/SubmitRestProtocolRequest.scala | 2 ++ .../spark/deploy/rest/SubmitRestProtocolSuite.scala | 2 ++ .../spark/deploy/rest/mesos/MesosRestServer.scala | 13 +++++++++---- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala index 0d50a768942ed..86ddf954ca128 100644 --- a/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala +++ b/core/src/main/scala/org/apache/spark/deploy/rest/SubmitRestProtocolRequest.scala @@ -46,6 +46,8 @@ private[rest] class CreateSubmissionRequest extends SubmitRestProtocolRequest { super.doValidate() assert(sparkProperties != null, "No Spark properties set!") assertFieldIsSet(appResource, "appResource") + assertFieldIsSet(appArgs, "appArgs") + assertFieldIsSet(environmentVariables, "environmentVariables") assertPropertyIsSet("spark.app.name") assertPropertyIsBoolean("spark.driver.supervise") assertPropertyIsNumeric("spark.driver.cores") diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala index 725b8848bc052..75c50af23c66a 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/SubmitRestProtocolSuite.scala @@ -86,6 +86,8 @@ class SubmitRestProtocolSuite extends SparkFunSuite { message.clientSparkVersion = "1.2.3" message.appResource = "honey-walnut-cherry.jar" message.mainClass = "org.apache.spark.examples.SparkPie" + message.appArgs = Array("two slices") + message.environmentVariables = Map("PATH" -> "/dev/null") val conf = new SparkConf(false) conf.set("spark.app.name", "SparkPie") message.sparkProperties = conf.getAll.toMap diff --git a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala index ff60b88c6d533..68f6921153d89 100644 --- a/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala +++ b/resource-managers/mesos/src/main/scala/org/apache/spark/deploy/rest/mesos/MesosRestServer.scala @@ -77,10 +77,17 @@ private[mesos] class MesosSubmitRequestServlet( private def buildDriverDescription(request: CreateSubmissionRequest): MesosDriverDescription = { // Required fields, including the main class because python is not yet supported val appResource = Option(request.appResource).getOrElse { - throw new SubmitRestMissingFieldException("Application jar is missing.") + throw new SubmitRestMissingFieldException("Application jar 'appResource' is missing.") } val mainClass = Option(request.mainClass).getOrElse { - throw new SubmitRestMissingFieldException("Main class is missing.") + throw new SubmitRestMissingFieldException("Main class 'mainClass' is missing.") + } + val appArgs = Option(request.appArgs).getOrElse { + throw new SubmitRestMissingFieldException("Application arguments 'appArgs' are missing.") + } + val environmentVariables = Option(request.environmentVariables).getOrElse { + throw new SubmitRestMissingFieldException("Environment variables 'environmentVariables' " + + "are missing.") } // Optional fields @@ -91,8 +98,6 @@ private[mesos] class MesosSubmitRequestServlet( val superviseDriver = sparkProperties.get("spark.driver.supervise") val driverMemory = 
sparkProperties.get("spark.driver.memory") val driverCores = sparkProperties.get("spark.driver.cores") - val appArgs = request.appArgs - val environmentVariables = request.environmentVariables val name = request.sparkProperties.getOrElse("spark.app.name", mainClass) // Construct driver description From ba0e79f57caa279773fb014b7883ee5d69dd0a68 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 13 Dec 2017 13:54:16 -0800 Subject: [PATCH 111/356] [SPARK-22772][SQL] Use splitExpressionsWithCurrentInputs to split codes in elt ## What changes were proposed in this pull request? In SPARK-22550 which fixes 64KB JVM bytecode limit problem with elt, `buildCodeBlocks` is used to split codes. However, we should use `splitExpressionsWithCurrentInputs` because it considers both normal and wholestage codgen (it is not supported yet, so it simply doesn't split the codes). ## How was this patch tested? Existing tests. Author: Liang-Chi Hsieh Closes #19964 from viirya/SPARK-22772. --- .../expressions/codegen/CodeGenerator.scala | 2 +- .../expressions/stringExpressions.scala | 81 ++++++++++--------- 2 files changed, 43 insertions(+), 40 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 257c3f10fa08b..b1d931117f99b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -878,7 +878,7 @@ class CodegenContext { * * @param expressions the codes to evaluate expressions. */ - def buildCodeBlocks(expressions: Seq[String]): Seq[String] = { + private def buildCodeBlocks(expressions: Seq[String]): Seq[String] = { val blocks = new ArrayBuffer[String]() val blockBuilder = new StringBuilder() var length = 0 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 47f0b5741f67f..8c4d2fd686be5 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -289,53 +289,56 @@ case class Elt(children: Seq[Expression]) val index = indexExpr.genCode(ctx) val strings = stringExprs.map(_.genCode(ctx)) val indexVal = ctx.freshName("index") + val indexMatched = ctx.freshName("eltIndexMatched") + val stringVal = ctx.freshName("stringVal") + ctx.addMutableState(ctx.javaType(dataType), stringVal) + val assignStringValue = strings.zipWithIndex.map { case (eval, index) => s""" - case ${index + 1}: - ${eval.code} - $stringVal = ${eval.isNull} ? null : ${eval.value}; - break; - """ + |if ($indexVal == ${index + 1}) { + | ${eval.code} + | $stringVal = ${eval.isNull} ? 
null : ${eval.value}; + | $indexMatched = true; + | continue; + |} + """.stripMargin } - val cases = ctx.buildCodeBlocks(assignStringValue) - val codes = if (cases.length == 1) { - s""" - UTF8String $stringVal = null; - switch ($indexVal) { - ${cases.head} - } - """ - } else { - var prevFunc = "null" - for (c <- cases.reverse) { - val funcName = ctx.freshName("eltFunc") - val funcBody = s""" - private UTF8String $funcName(InternalRow ${ctx.INPUT_ROW}, int $indexVal) { - UTF8String $stringVal = null; - switch ($indexVal) { - $c - default: - return $prevFunc; - } - return $stringVal; - } - """ - val fullFuncName = ctx.addNewFunction(funcName, funcBody) - prevFunc = s"$fullFuncName(${ctx.INPUT_ROW}, $indexVal)" - } - s"UTF8String $stringVal = $prevFunc;" - } + val codes = ctx.splitExpressionsWithCurrentInputs( + expressions = assignStringValue, + funcName = "eltFunc", + extraArguments = ("int", indexVal) :: Nil, + returnType = ctx.JAVA_BOOLEAN, + makeSplitFunction = body => + s""" + |${ctx.JAVA_BOOLEAN} $indexMatched = false; + |do { + | $body + |} while (false); + |return $indexMatched; + """.stripMargin, + foldFunctions = _.map { funcCall => + s""" + |$indexMatched = $funcCall; + |if ($indexMatched) { + | continue; + |} + """.stripMargin + }.mkString) ev.copy( s""" - ${index.code} - final int $indexVal = ${index.value}; - $codes - UTF8String ${ev.value} = $stringVal; - final boolean ${ev.isNull} = ${ev.value} == null; - """) + |${index.code} + |final int $indexVal = ${index.value}; + |${ctx.JAVA_BOOLEAN} $indexMatched = false; + |$stringVal = null; + |do { + | $codes + |} while (false); + |final UTF8String ${ev.value} = $stringVal; + |final boolean ${ev.isNull} = ${ev.value} == null; + """.stripMargin) } } From a83e8e6c223df8b819335cbabbfff9956942f2ad Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 13 Dec 2017 16:06:16 -0600 Subject: [PATCH 112/356] [SPARK-22764][CORE] Fix flakiness in SparkContextSuite. Use a semaphore to synchronize the tasks with the listener code that is trying to cancel the job or stage, so that the listener won't try to cancel a job or stage that has already finished. Author: Marcelo Vanzin Closes #19956 from vanzin/SPARK-22764. 
--- .../org/apache/spark/SparkContextSuite.scala | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala index 37fcc93c62fa8..b30bd74812b36 100644 --- a/core/src/test/scala/org/apache/spark/SparkContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkContextSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark import java.io.File import java.net.{MalformedURLException, URI} import java.nio.charset.StandardCharsets -import java.util.concurrent.TimeUnit +import java.util.concurrent.{Semaphore, TimeUnit} import scala.concurrent.duration._ @@ -499,6 +499,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu test("Cancelling stages/jobs with custom reasons.") { sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local")) val REASON = "You shall not pass" + val slices = 10 val listener = new SparkListener { override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { @@ -508,6 +509,7 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } sc.cancelStage(taskStart.stageId, REASON) SparkContextSuite.cancelStage = false + SparkContextSuite.semaphore.release(slices) } } @@ -518,21 +520,25 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventu } sc.cancelJob(jobStart.jobId, REASON) SparkContextSuite.cancelJob = false + SparkContextSuite.semaphore.release(slices) } } } sc.addSparkListener(listener) for (cancelWhat <- Seq("stage", "job")) { + SparkContextSuite.semaphore.drainPermits() SparkContextSuite.isTaskStarted = false SparkContextSuite.cancelStage = (cancelWhat == "stage") SparkContextSuite.cancelJob = (cancelWhat == "job") val ex = intercept[SparkException] { - sc.range(0, 10000L).mapPartitions { x => - org.apache.spark.SparkContextSuite.isTaskStarted = true + sc.range(0, 10000L, numSlices = slices).mapPartitions { x => + SparkContextSuite.isTaskStarted = true + // Block waiting for the listener to cancel the stage or job. + SparkContextSuite.semaphore.acquire() x - }.cartesian(sc.range(0, 10L))count() + }.count() } ex.getCause() match { @@ -636,4 +642,5 @@ object SparkContextSuite { @volatile var isTaskStarted = false @volatile var taskKilled = false @volatile var taskSucceeded = false + val semaphore = new Semaphore(0) } From ef92999653f0e2a47752379a867647445d849aab Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Wed, 13 Dec 2017 15:55:16 -0800 Subject: [PATCH 113/356] [SPARK-22600][SQL][FOLLOW-UP] Fix a compilation error in TPCDS q75/q77 ## What changes were proposed in this pull request? This pr fixed a compilation error of TPCDS `q75`/`q77` caused by #19813; ``` java.util.concurrent.ExecutionException: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 371, Column 16: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 371, Column 16: Expression "bhj_matched" is not an rvalue at com.google.common.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:306) at com.google.common.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:293) at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:116) at com.google.common.util.concurrent.Uninterruptibles.getUninterruptibly(Uninterruptibles.java:135) ``` ## How was this patch tested? 
Manually checked `q75`/`q77` can be properly compiled Author: Takeshi Yamamuro Closes #19969 from maropu/SPARK-22600-FOLLOWUP. --- .../sql/catalyst/expressions/codegen/ExpressionCodegen.scala | 1 - .../spark/sql/execution/joins/BroadcastHashJoinExec.scala | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala index a2dda48e951d1..807cb94734b2a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.catalyst.expressions.codegen import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.DataType /** * Defines util methods used in expression code generation. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala index c96ed6ef41016..634014af16e8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala @@ -192,7 +192,7 @@ case class BroadcastHashJoinExec( | $value = ${ev.value}; |} """.stripMargin - ExprCode(code, isNull, value) + ExprCode(code, isNull, value, inputRow = matched) } } } From bc7e4a90c0c91970a94aa385971daac48db6264e Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 14 Dec 2017 11:21:34 +0800 Subject: [PATCH 114/356] Revert "[SPARK-22600][SQL][FOLLOW-UP] Fix a compilation error in TPCDS q75/q77" This reverts commit ef92999653f0e2a47752379a867647445d849aab. --- .../sql/catalyst/expressions/codegen/ExpressionCodegen.scala | 1 + .../spark/sql/execution/joins/BroadcastHashJoinExec.scala | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala index 807cb94734b2a..a2dda48e951d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions.codegen import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.types.DataType /** * Defines util methods used in expression code generation. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala index 634014af16e8b..c96ed6ef41016 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala @@ -192,7 +192,7 @@ case class BroadcastHashJoinExec( | $value = ${ev.value}; |} """.stripMargin - ExprCode(code, isNull, value, inputRow = matched) + ExprCode(code, isNull, value) } } } From 2a29a60da32a5ccd04cc7c5c22c4075b159389e3 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 14 Dec 2017 11:22:23 +0800 Subject: [PATCH 115/356] Revert "[SPARK-22600][SQL] Fix 64kb limit for deeply nested expressions under wholestage codegen" This reverts commit c7d0148615c921dca782ee3785b5d0cd59e42262. --- .../sql/catalyst/expressions/Expression.scala | 37 +-- .../expressions/codegen/CodeGenerator.scala | 44 +-- .../codegen/ExpressionCodegen.scala | 269 ------------------ .../codegen/ExpressionCodegenSuite.scala | 220 -------------- .../sql/execution/ColumnarBatchScan.scala | 5 +- .../execution/WholeStageCodegenSuite.scala | 23 +- 6 files changed, 13 insertions(+), 585 deletions(-) delete mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegenSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 329ea5d421509..743782a6453e9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -105,12 +105,6 @@ abstract class Expression extends TreeNode[Expression] { val isNull = ctx.freshName("isNull") val value = ctx.freshName("value") val eval = doGenCode(ctx, ExprCode("", isNull, value)) - eval.isNull = if (this.nullable) eval.isNull else "false" - - // Records current input row and variables of this expression. - eval.inputRow = ctx.INPUT_ROW - eval.inputVars = findInputVars(ctx, eval) - reduceCodeSize(ctx, eval) if (eval.code.nonEmpty) { // Add `this` in the comment. @@ -121,29 +115,9 @@ abstract class Expression extends TreeNode[Expression] { } } - /** - * Returns the input variables to this expression. - */ - private def findInputVars(ctx: CodegenContext, eval: ExprCode): Seq[ExprInputVar] = { - if (ctx.currentVars != null) { - this.collect { - case b @ BoundReference(ordinal, _, _) if ctx.currentVars(ordinal) != null => - ExprInputVar(exprCode = ctx.currentVars(ordinal), - dataType = b.dataType, nullable = b.nullable) - } - } else { - Seq.empty - } - } - - /** - * In order to prevent 64kb compile error, reducing the size of generated codes by - * separating it into a function if the size exceeds a threshold. 
- */ private def reduceCodeSize(ctx: CodegenContext, eval: ExprCode): Unit = { - lazy val funcParams = ExpressionCodegen.getExpressionInputParams(ctx, this) - - if (eval.code.trim.length > 1024 && funcParams.isDefined) { + // TODO: support whole stage codegen too + if (eval.code.trim.length > 1024 && ctx.INPUT_ROW != null && ctx.currentVars == null) { val setIsNull = if (eval.isNull != "false" && eval.isNull != "true") { val globalIsNull = ctx.freshName("globalIsNull") ctx.addMutableState(ctx.JAVA_BOOLEAN, globalIsNull) @@ -158,12 +132,9 @@ abstract class Expression extends TreeNode[Expression] { val newValue = ctx.freshName("value") val funcName = ctx.freshName(nodeName) - val callParams = funcParams.map(_._1.mkString(", ")).get - val declParams = funcParams.map(_._2.mkString(", ")).get - val funcFullName = ctx.addNewFunction(funcName, s""" - |private $javaType $funcName($declParams) { + |private $javaType $funcName(InternalRow ${ctx.INPUT_ROW}) { | ${eval.code.trim} | $setIsNull | return ${eval.value}; @@ -171,7 +142,7 @@ abstract class Expression extends TreeNode[Expression] { """.stripMargin) eval.value = newValue - eval.code = s"$javaType $newValue = $funcFullName($callParams);" + eval.code = s"$javaType $newValue = $funcFullName(${ctx.INPUT_ROW});" } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index b1d931117f99b..3a03a65e1af92 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -55,24 +55,8 @@ import org.apache.spark.util.{ParentClassLoader, Utils} * to null. * @param value A term for a (possibly primitive) value of the result of the evaluation. Not * valid if `isNull` is set to `true`. - * @param inputRow A term that holds the input row name when generating this code. - * @param inputVars A list of [[ExprInputVar]] that holds input variables when generating this code. */ -case class ExprCode( - var code: String, - var isNull: String, - var value: String, - var inputRow: String = null, - var inputVars: Seq[ExprInputVar] = Seq.empty) - -/** - * Represents an input variable [[ExprCode]] to an evaluation of an [[Expression]]. - * - * @param exprCode The [[ExprCode]] that represents the evaluation result for the input variable. - * @param dataType The data type of the input variable. - * @param nullable Whether the input variable can be null or not. - */ -case class ExprInputVar(exprCode: ExprCode, dataType: DataType, nullable: Boolean) +case class ExprCode(var code: String, var isNull: String, var value: String) /** * State used for subexpression elimination. @@ -1028,25 +1012,16 @@ class CodegenContext { commonExprs.foreach { e => val expr = e.head val fnName = freshName("evalExpr") - val isNull = if (expr.nullable) { - s"${fnName}IsNull" - } else { - "" - } + val isNull = s"${fnName}IsNull" val value = s"${fnName}Value" // Generate the code for this expression tree and wrap it in a function. val eval = expr.genCode(this) - val assignIsNull = if (expr.nullable) { - s"$isNull = ${eval.isNull};" - } else { - "" - } val fn = s""" |private void $fnName(InternalRow $INPUT_ROW) { | ${eval.code.trim} - | $assignIsNull + | $isNull = ${eval.isNull}; | $value = ${eval.value}; |} """.stripMargin @@ -1064,17 +1039,12 @@ class CodegenContext { // 2. Less code. 
// Currently, we will do this for all non-leaf only expression trees (i.e. expr trees with // at least two nodes) as the cost of doing it is expected to be low. - if (expr.nullable) { - addMutableState(JAVA_BOOLEAN, isNull) - } - addMutableState(javaType(expr.dataType), value) + addMutableState(JAVA_BOOLEAN, isNull, s"$isNull = false;") + addMutableState(javaType(expr.dataType), value, + s"$value = ${defaultValue(expr.dataType)};") subexprFunctions += s"${addNewFunction(fnName, fn)}($INPUT_ROW);" - val state = if (expr.nullable) { - SubExprEliminationState(isNull, value) - } else { - SubExprEliminationState("false", value) - } + val state = SubExprEliminationState(isNull, value) e.foreach(subExprEliminationExprs.put(_, state)) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala deleted file mode 100644 index a2dda48e951d1..0000000000000 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegen.scala +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.expressions.codegen - -import scala.collection.mutable - -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.DataType - -/** - * Defines util methods used in expression code generation. - */ -object ExpressionCodegen { - - /** - * Given an expression, returns the all necessary parameters to evaluate it, so the generated - * code of this expression can be split in a function. - * The 1st string in returned tuple is the parameter strings used to call the function. - * The 2nd string in returned tuple is the parameter strings used to declare the function. - * - * Returns `None` if it can't produce valid parameters. - * - * Params to include: - * 1. Evaluated columns referred by this, children or deferred expressions. - * 2. Rows referred by this, children or deferred expressions. - * 3. Eliminated subexpressions referred by children expressions. 
- */ - def getExpressionInputParams( - ctx: CodegenContext, - expr: Expression): Option[(Seq[String], Seq[String])] = { - val subExprs = getSubExprInChildren(ctx, expr) - val subExprCodes = getSubExprCodes(ctx, subExprs) - val subVars = subExprs.zip(subExprCodes).map { case (subExpr, subExprCode) => - ExprInputVar(subExprCode, subExpr.dataType, subExpr.nullable) - } - val paramsFromSubExprs = prepareFunctionParams(ctx, subVars) - - val inputVars = getInputVarsForChildren(ctx, expr) - val paramsFromColumns = prepareFunctionParams(ctx, inputVars) - - val inputRows = ctx.INPUT_ROW +: getInputRowsForChildren(ctx, expr) - val paramsFromRows = inputRows.distinct.filter(_ != null).map { row => - (row, s"InternalRow $row") - } - - val paramsLength = getParamLength(ctx, inputVars ++ subVars) + paramsFromRows.length - // Maximum allowed parameter number for Java's method descriptor. - if (paramsLength > 255) { - None - } else { - val allParams = (paramsFromRows ++ paramsFromColumns ++ paramsFromSubExprs).unzip - val callParams = allParams._1.distinct - val declParams = allParams._2.distinct - Some((callParams, declParams)) - } - } - - /** - * Returns the eliminated subexpressions in the children expressions. - */ - def getSubExprInChildren(ctx: CodegenContext, expr: Expression): Seq[Expression] = { - expr.children.flatMap { child => - child.collect { - case e if ctx.subExprEliminationExprs.contains(e) => e - } - }.distinct - } - - /** - * A small helper function to return `ExprCode`s that represent subexpressions. - */ - def getSubExprCodes(ctx: CodegenContext, subExprs: Seq[Expression]): Seq[ExprCode] = { - subExprs.map { subExpr => - val state = ctx.subExprEliminationExprs(subExpr) - ExprCode(code = "", value = state.value, isNull = state.isNull) - } - } - - /** - * Retrieves previous input rows referred by children and deferred expressions. - */ - def getInputRowsForChildren(ctx: CodegenContext, expr: Expression): Seq[String] = { - expr.children.flatMap(getInputRows(ctx, _)).distinct - } - - /** - * Given a child expression, retrieves previous input rows referred by it or deferred expressions - * which are needed to evaluate it. - */ - def getInputRows(ctx: CodegenContext, child: Expression): Seq[String] = { - child.flatMap { - // An expression directly evaluates on current input row. - case BoundReference(ordinal, _, _) if ctx.currentVars == null || - ctx.currentVars(ordinal) == null => - Seq(ctx.INPUT_ROW) - - // An expression which is not evaluated yet. Tracks down to find input rows. - case BoundReference(ordinal, _, _) if !isEvaluated(ctx.currentVars(ordinal)) => - trackDownRow(ctx, ctx.currentVars(ordinal)) - - case _ => Seq.empty - }.distinct - } - - /** - * Tracks down input rows referred by the generated code snippet. - */ - def trackDownRow(ctx: CodegenContext, exprCode: ExprCode): Seq[String] = { - val exprCodes = mutable.Queue[ExprCode](exprCode) - val inputRows = mutable.ArrayBuffer.empty[String] - - while (exprCodes.nonEmpty) { - val curExprCode = exprCodes.dequeue() - if (curExprCode.inputRow != null) { - inputRows += curExprCode.inputRow - } - curExprCode.inputVars.foreach { inputVar => - if (!isEvaluated(inputVar.exprCode)) { - exprCodes.enqueue(inputVar.exprCode) - } - } - } - inputRows - } - - /** - * Retrieves previously evaluated columns referred by children and deferred expressions. - * Returned tuple contains the list of expressions and the list of generated codes. 
- */ - def getInputVarsForChildren( - ctx: CodegenContext, - expr: Expression): Seq[ExprInputVar] = { - expr.children.flatMap(getInputVars(ctx, _)).distinct - } - - /** - * Given a child expression, retrieves previously evaluated columns referred by it or - * deferred expressions which are needed to evaluate it. - */ - def getInputVars(ctx: CodegenContext, child: Expression): Seq[ExprInputVar] = { - if (ctx.currentVars == null) { - return Seq.empty - } - - child.flatMap { - // An evaluated variable. - case b @ BoundReference(ordinal, _, _) if ctx.currentVars(ordinal) != null && - isEvaluated(ctx.currentVars(ordinal)) => - Seq(ExprInputVar(ctx.currentVars(ordinal), b.dataType, b.nullable)) - - // An input variable which is not evaluated yet. Tracks down to find any evaluated variables - // in the expression path. - // E.g., if this expression is "d = c + 1" and "c" is not evaluated. We need to track to - // "c = a + b" and see if "a" and "b" are evaluated. If they are, we need to return them so - // to include them into parameters, if not, we track down further. - case BoundReference(ordinal, _, _) if ctx.currentVars(ordinal) != null => - trackDownVar(ctx, ctx.currentVars(ordinal)) - - case _ => Seq.empty - }.distinct - } - - /** - * Tracks down previously evaluated columns referred by the generated code snippet. - */ - def trackDownVar(ctx: CodegenContext, exprCode: ExprCode): Seq[ExprInputVar] = { - val exprCodes = mutable.Queue[ExprCode](exprCode) - val inputVars = mutable.ArrayBuffer.empty[ExprInputVar] - - while (exprCodes.nonEmpty) { - exprCodes.dequeue().inputVars.foreach { inputVar => - if (isEvaluated(inputVar.exprCode)) { - inputVars += inputVar - } else { - exprCodes.enqueue(inputVar.exprCode) - } - } - } - inputVars - } - - /** - * Helper function to calculate the size of an expression as function parameter. - */ - def calculateParamLength(ctx: CodegenContext, input: ExprInputVar): Int = { - (if (input.nullable) 1 else 0) + ctx.javaType(input.dataType) match { - case ctx.JAVA_LONG | ctx.JAVA_DOUBLE => 2 - case _ => 1 - } - } - - /** - * In Java, a method descriptor is valid only if it represents method parameters with a total - * length of 255 or less. `this` contributes one unit and a parameter of type long or double - * contributes two units. - */ - def getParamLength(ctx: CodegenContext, inputs: Seq[ExprInputVar]): Int = { - // Initial value is 1 for `this`. - 1 + inputs.map(calculateParamLength(ctx, _)).sum - } - - /** - * Given the lists of input attributes and variables to this expression, returns the strings of - * funtion parameters. The first is the variable names used to call the function, the second is - * the parameters used to declare the function in generated code. - */ - def prepareFunctionParams( - ctx: CodegenContext, - inputVars: Seq[ExprInputVar]): Seq[(String, String)] = { - inputVars.flatMap { inputVar => - val params = mutable.ArrayBuffer.empty[(String, String)] - val ev = inputVar.exprCode - - // Only include the expression value if it is not a literal. - if (!isLiteral(ev)) { - val argType = ctx.javaType(inputVar.dataType) - params += ((ev.value, s"$argType ${ev.value}")) - } - - // If it is a nullable expression and `isNull` is not a literal. - if (inputVar.nullable && ev.isNull != "true" && ev.isNull != "false") { - params += ((ev.isNull, s"boolean ${ev.isNull}")) - } - - params - }.distinct - } - - /** - * Only applied to the `ExprCode` in `ctx.currentVars`. - * Returns true if this value is a literal. 
- */ - def isLiteral(exprCode: ExprCode): Boolean = { - assert(exprCode.value.nonEmpty, "ExprCode.value can't be empty string.") - - if (exprCode.value == "true" || exprCode.value == "false" || exprCode.value == "null") { - true - } else { - // The valid characters for the first character of a Java variable is [a-zA-Z_$]. - exprCode.value.head match { - case v if v >= 'a' && v <= 'z' => false - case v if v >= 'A' && v <= 'Z' => false - case '_' | '$' => false - case _ => true - } - } - } - - /** - * Only applied to the `ExprCode` in `ctx.currentVars`. - * The code is emptied after evaluation. - */ - def isEvaluated(exprCode: ExprCode): Boolean = exprCode.code == "" -} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegenSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegenSuite.scala deleted file mode 100644 index 39d58cabff228..0000000000000 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/ExpressionCodegenSuite.scala +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.catalyst.expressions.codegen - -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.types.IntegerType - -class ExpressionCodegenSuite extends SparkFunSuite { - - test("Returns eliminated subexpressions for expression") { - val ctx = new CodegenContext() - val subExpr = Add(Literal(1), Literal(2)) - val exprs = Seq(Add(subExpr, Literal(3)), Add(subExpr, Literal(4))) - - ctx.generateExpressions(exprs, doSubexpressionElimination = true) - val subexpressions = ExpressionCodegen.getSubExprInChildren(ctx, exprs(0)) - assert(subexpressions.length == 1 && subexpressions(0) == subExpr) - } - - test("Gets parameters for subexpressions") { - val ctx = new CodegenContext() - val subExprs = Seq( - Add(Literal(1), AttributeReference("a", IntegerType, nullable = false)()), // non-nullable - Add(Literal(2), AttributeReference("b", IntegerType, nullable = true)())) // nullable - - ctx.subExprEliminationExprs.put(subExprs(0), SubExprEliminationState("false", "value1")) - ctx.subExprEliminationExprs.put(subExprs(1), SubExprEliminationState("isNull2", "value2")) - - val subExprCodes = ExpressionCodegen.getSubExprCodes(ctx, subExprs) - val subVars = subExprs.zip(subExprCodes).map { case (expr, exprCode) => - ExprInputVar(exprCode, expr.dataType, expr.nullable) - } - val params = ExpressionCodegen.prepareFunctionParams(ctx, subVars) - assert(params.length == 3) - assert(params(0) == Tuple2("value1", "int value1")) - assert(params(1) == Tuple2("value2", "int value2")) - assert(params(2) == Tuple2("isNull2", "boolean isNull2")) - } - - test("Returns input variables for expression: current variables") { - val ctx = new CodegenContext() - val currentVars = Seq( - ExprCode("", isNull = "false", value = "value1"), // evaluated - ExprCode("", isNull = "isNull2", value = "value2"), // evaluated - ExprCode("fake code;", isNull = "isNull3", value = "value3")) // not evaluated - ctx.currentVars = currentVars - ctx.INPUT_ROW = null - - val expr = If(Literal(false), - Add(BoundReference(0, IntegerType, nullable = false), - BoundReference(1, IntegerType, nullable = true)), - BoundReference(2, IntegerType, nullable = true)) - - val inputVars = ExpressionCodegen.getInputVarsForChildren(ctx, expr) - // Only two evaluated variables included. - assert(inputVars.length == 2) - assert(inputVars(0).dataType == IntegerType && inputVars(0).nullable == false) - assert(inputVars(1).dataType == IntegerType && inputVars(1).nullable == true) - assert(inputVars(0).exprCode == currentVars(0)) - assert(inputVars(1).exprCode == currentVars(1)) - - val params = ExpressionCodegen.prepareFunctionParams(ctx, inputVars) - assert(params.length == 3) - assert(params(0) == Tuple2("value1", "int value1")) - assert(params(1) == Tuple2("value2", "int value2")) - assert(params(2) == Tuple2("isNull2", "boolean isNull2")) - } - - test("Returns input variables for expression: deferred variables") { - val ctx = new CodegenContext() - - // The referred column is not evaluated yet. But it depends on an evaluated column from - // other operator. - val currentVars = Seq(ExprCode("fake code;", isNull = "isNull1", value = "value1")) - - // currentVars(0) depends on this evaluated column. 
- currentVars(0).inputVars = Seq(ExprInputVar(ExprCode("", isNull = "isNull2", value = "value2"), - dataType = IntegerType, nullable = true)) - ctx.currentVars = currentVars - ctx.INPUT_ROW = null - - val expr = Add(Literal(1), BoundReference(0, IntegerType, nullable = false)) - val inputVars = ExpressionCodegen.getInputVarsForChildren(ctx, expr) - assert(inputVars.length == 1) - assert(inputVars(0).dataType == IntegerType && inputVars(0).nullable == true) - - val params = ExpressionCodegen.prepareFunctionParams(ctx, inputVars) - assert(params.length == 2) - assert(params(0) == Tuple2("value2", "int value2")) - assert(params(1) == Tuple2("isNull2", "boolean isNull2")) - } - - test("Returns input rows for expression") { - val ctx = new CodegenContext() - ctx.currentVars = null - ctx.INPUT_ROW = "i" - - val expr = Add(BoundReference(0, IntegerType, nullable = false), - BoundReference(1, IntegerType, nullable = true)) - val inputRows = ExpressionCodegen.getInputRowsForChildren(ctx, expr) - assert(inputRows.length == 1) - assert(inputRows(0) == "i") - } - - test("Returns input rows for expression: deferred expression") { - val ctx = new CodegenContext() - - // The referred column is not evaluated yet. But it depends on an input row from - // other operator. - val currentVars = Seq(ExprCode("fake code;", isNull = "isNull1", value = "value1")) - currentVars(0).inputRow = "inputadaptor_row1" - ctx.currentVars = currentVars - ctx.INPUT_ROW = null - - val expr = Add(Literal(1), BoundReference(0, IntegerType, nullable = false)) - val inputRows = ExpressionCodegen.getInputRowsForChildren(ctx, expr) - assert(inputRows.length == 1) - assert(inputRows(0) == "inputadaptor_row1") - } - - test("Returns both input rows and variables for expression") { - val ctx = new CodegenContext() - // 5 input variables in currentVars: - // 1 evaluated variable (value1). - // 3 not evaluated variables. - // value2 depends on an evaluated column from other operator. - // value3 depends on an input row from other operator. - // value4 depends on a not evaluated yet column from other operator. - // 1 null indicating to use input row "i". - val currentVars = Seq( - ExprCode("", isNull = "false", value = "value1"), - ExprCode("fake code;", isNull = "isNull2", value = "value2"), - ExprCode("fake code;", isNull = "isNull3", value = "value3"), - ExprCode("fake code;", isNull = "isNull4", value = "value4"), - null) - // value2 depends on this evaluated column. - currentVars(1).inputVars = Seq(ExprInputVar(ExprCode("", isNull = "isNull5", value = "value5"), - dataType = IntegerType, nullable = true)) - // value3 depends on an input row "inputadaptor_row1". - currentVars(2).inputRow = "inputadaptor_row1" - // value4 depends on another not evaluated yet column. - currentVars(3).inputVars = Seq(ExprInputVar(ExprCode("fake code;", - isNull = "isNull6", value = "value6"), dataType = IntegerType, nullable = true)) - ctx.currentVars = currentVars - ctx.INPUT_ROW = "i" - - // expr: if (false) { value1 + value2 } else { (value3 + value4) + i[5] } - val expr = If(Literal(false), - Add(BoundReference(0, IntegerType, nullable = false), - BoundReference(1, IntegerType, nullable = true)), - Add(Add(BoundReference(2, IntegerType, nullable = true), - BoundReference(3, IntegerType, nullable = true)), - BoundReference(4, IntegerType, nullable = true))) // this is based on input row "i". - - // input rows: "i", "inputadaptor_row1". 
- val inputRows = ExpressionCodegen.getInputRowsForChildren(ctx, expr) - assert(inputRows.length == 2) - assert(inputRows(0) == "inputadaptor_row1") - assert(inputRows(1) == "i") - - // input variables: value1 and value5 - val inputVars = ExpressionCodegen.getInputVarsForChildren(ctx, expr) - assert(inputVars.length == 2) - - // value1 has inlined isNull "false", so don't need to include it in the params. - val inputVarParams = ExpressionCodegen.prepareFunctionParams(ctx, inputVars) - assert(inputVarParams.length == 3) - assert(inputVarParams(0) == Tuple2("value1", "int value1")) - assert(inputVarParams(1) == Tuple2("value5", "int value5")) - assert(inputVarParams(2) == Tuple2("isNull5", "boolean isNull5")) - } - - test("isLiteral: literals") { - val literals = Seq( - ExprCode("", "", "true"), - ExprCode("", "", "false"), - ExprCode("", "", "1"), - ExprCode("", "", "-1"), - ExprCode("", "", "1L"), - ExprCode("", "", "-1L"), - ExprCode("", "", "1.0f"), - ExprCode("", "", "-1.0f"), - ExprCode("", "", "0.1f"), - ExprCode("", "", "-0.1f"), - ExprCode("", "", """"string""""), - ExprCode("", "", "(byte)-1"), - ExprCode("", "", "(short)-1"), - ExprCode("", "", "null")) - - literals.foreach(l => assert(ExpressionCodegen.isLiteral(l) == true)) - } - - test("isLiteral: non literals") { - val variables = Seq( - ExprCode("", "", "var1"), - ExprCode("", "", "_var2"), - ExprCode("", "", "$var3"), - ExprCode("", "", "v1a2r3"), - ExprCode("", "", "_1v2a3r"), - ExprCode("", "", "$1v2a3r")) - - variables.foreach(v => assert(ExpressionCodegen.isLiteral(v) == false)) - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index 05186c4472566..a9bfb634fbdea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -108,10 +108,7 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { |}""".stripMargin) ctx.currentVars = null - // `rowIdx` isn't in `ctx.currentVars`. If the expressions are split later, we can't track it. - // So making it as global variable. 
val rowidx = ctx.freshName("rowIdx") - ctx.addMutableState(ctx.JAVA_INT, rowidx) val columnsBatchInput = (output zip colVars).map { case (attr, colVar) => genCodeColumnVector(ctx, colVar, rowidx, attr.dataType, attr.nullable) } @@ -131,7 +128,7 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { | int $numRows = $batch.numRows(); | int $localEnd = $numRows - $idx; | for (int $localIdx = 0; $localIdx < $localEnd; $localIdx++) { - | $rowidx = $idx + $localIdx; + | int $rowidx = $idx + $localIdx; | ${consume(ctx, columnsBatchInput).trim} | $shouldStop | } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala index 1281169b607c9..bc05dca578c47 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.execution -import org.apache.spark.sql.{Column, QueryTest, Row, SaveMode} -import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.{QueryTest, Row, SaveMode} import org.apache.spark.sql.catalyst.expressions.codegen.{CodeAndComment, CodeGenerator} import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec @@ -237,24 +236,4 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext { } } } - - test("SPARK-22551: Fix 64kb limit for deeply nested expressions under wholestage codegen") { - import testImplicits._ - withTempPath { dir => - val path = dir.getCanonicalPath - val df = Seq(("abc", 1)).toDF("key", "int") - df.write.parquet(path) - - var strExpr: Expression = col("key").expr - for (_ <- 1 to 150) { - strExpr = Decode(Encode(strExpr, Literal("utf-8")), Literal("utf-8")) - } - val expressions = Seq(If(EqualTo(strExpr, strExpr), strExpr, strExpr)) - - val df2 = spark.read.parquet(path).select(expressions.map(Column(_)): _*) - val plan = df2.queryExecution.executedPlan - assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined) - df2.collect() - } - } } From 1e44dd004425040912f2cf16362d2c13f12e1689 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 13 Dec 2017 21:19:14 -0800 Subject: [PATCH 116/356] [SPARK-3181][ML] Implement huber loss for LinearRegression. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? MLlib ```LinearRegression``` supports _huber_ loss addition to _leastSquares_ loss. The huber loss objective function is: ![image](https://user-images.githubusercontent.com/1962026/29554124-9544d198-8750-11e7-8afa-33579ec419d5.png) Refer Eq.(6) and Eq.(8) in [A robust hybrid of lasso and ridge regression](http://statweb.stanford.edu/~owen/reports/hhu.pdf). This objective is jointly convex as a function of (w, σ) ∈ R × (0,∞), we can use L-BFGS-B to solve it. The current implementation is a straight forward porting for Python scikit-learn [```HuberRegressor```](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.HuberRegressor.html). There are some differences: * We use mean loss (```lossSum/weightSum```), but sklearn uses total loss (```lossSum```). * We multiply the loss function and L2 regularization by 1/2. It does not affect the result if we multiply the whole formula by a factor, we just keep consistent with _leastSquares_ loss. 
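As a concrete illustration of the scaling note in the next sentence (an editorial sketch, not code from this patch; the `alpha = 210` and 1,000-instance figures are taken from the tests added below):

```
import org.apache.spark.ml.regression.LinearRegression

// sklearn fits HuberRegressor(alpha=210) on 1,000 rows (900 inliers + 100 outliers).
// To reproduce that with MLlib's mean-loss formulation, divide alpha by the row count.
val numInstances = 1000
val sklearnAlpha = 210.0
val huberLR = new LinearRegression()
  .setLoss("huber")
  .setRegParam(sklearnAlpha / numInstances)  // 0.21, matching the new test suite
```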
So if fitting w/o regularization, MLlib and sklearn produce the same output. If fitting w/ regularization, MLlib should set ```regParam``` divide by the number of instances to match the output of sklearn. ## How was this patch tested? Unit tests. Author: Yanbo Liang Closes #19020 from yanboliang/spark-3181. --- .../ml/optim/aggregator/HuberAggregator.scala | 150 +++++++++ .../ml/param/shared/SharedParamsCodeGen.scala | 3 +- .../spark/ml/param/shared/sharedParams.scala | 17 + .../ml/regression/LinearRegression.scala | 299 ++++++++++++++---- .../aggregator/HuberAggregatorSuite.scala | 170 ++++++++++ .../ml/regression/LinearRegressionSuite.scala | 244 +++++++++++++- project/MimaExcludes.scala | 5 + 7 files changed, 823 insertions(+), 65 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HuberAggregator.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HuberAggregatorSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HuberAggregator.scala b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HuberAggregator.scala new file mode 100644 index 0000000000000..13f64d2d50424 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/optim/aggregator/HuberAggregator.scala @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg.Vector + +/** + * HuberAggregator computes the gradient and loss for a huber loss function, + * as used in robust regression for samples in sparse or dense vector in an online fashion. + * + * The huber loss function based on: + * Art B. Owen (2006), + * A robust hybrid of lasso and ridge regression. + * + * Two HuberAggregator can be merged together to have a summary of loss and gradient of + * the corresponding joint dataset. + * + * The huber loss function is given by + * + *
+ * $$ + * \begin{align} + * \min_{w, \sigma}\frac{1}{2n}{\sum_{i=1}^n\left(\sigma + + * H_m\left(\frac{X_{i}w - y_{i}}{\sigma}\right)\sigma\right) + \frac{1}{2}\lambda {||w||_2}^2} + * \end{align} + * $$ + *
+ * + * where + * + *
+ * $$ + * \begin{align} + * H_m(z) = \begin{cases} + * z^2, & \text {if } |z| < \epsilon, \\ + * 2\epsilon|z| - \epsilon^2, & \text{otherwise} + * \end{cases} + * \end{align} + * $$ + *
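// Editor's aside, not part of the patch: the piecewise H_m penalty defined just above,
// written out as a standalone Scala function for reference.
def huberPenalty(z: Double, epsilon: Double): Double =
  if (math.abs(z) < epsilon) z * z
  else 2.0 * epsilon * math.abs(z) - epsilon * epsilon
// With epsilon = 1.35: huberPenalty(0.5, 1.35) == 0.25 (quadratic branch) and
// huberPenalty(3.0, 1.35) == 6.2775 (linear branch).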
+ * + * It is advised to set the parameter $\epsilon$ to 1.35 to achieve 95% statistical efficiency + * for normally distributed data. Please refer to chapter 2 of + * + * A robust hybrid of lasso and ridge regression for more detail. + * + * @param fitIntercept Whether to fit an intercept term. + * @param epsilon The shape parameter to control the amount of robustness. + * @param bcFeaturesStd The broadcast standard deviation values of the features. + * @param bcParameters including three parts: the regression coefficients corresponding + * to the features, the intercept (if fitIntercept is ture) + * and the scale parameter (sigma). + */ +private[ml] class HuberAggregator( + fitIntercept: Boolean, + epsilon: Double, + bcFeaturesStd: Broadcast[Array[Double]])(bcParameters: Broadcast[Vector]) + extends DifferentiableLossAggregator[Instance, HuberAggregator] { + + protected override val dim: Int = bcParameters.value.size + private val numFeatures: Int = if (fitIntercept) dim - 2 else dim - 1 + private val sigma: Double = bcParameters.value(dim - 1) + private val intercept: Double = if (fitIntercept) { + bcParameters.value(dim - 2) + } else { + 0.0 + } + + /** + * Add a new training instance to this HuberAggregator, and update the loss and gradient + * of the objective function. + * + * @param instance The instance of data point to be added. + * @return This HuberAggregator object. + */ + def add(instance: Instance): HuberAggregator = { + instance match { case Instance(label, weight, features) => + require(numFeatures == features.size, s"Dimensions mismatch when adding new sample." + + s" Expecting $numFeatures but got ${features.size}.") + require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0") + + if (weight == 0.0) return this + val localFeaturesStd = bcFeaturesStd.value + val localCoefficients = bcParameters.value.toArray.slice(0, numFeatures) + val localGradientSumArray = gradientSumArray + + val margin = { + var sum = 0.0 + features.foreachActive { (index, value) => + if (localFeaturesStd(index) != 0.0 && value != 0.0) { + sum += localCoefficients(index) * (value / localFeaturesStd(index)) + } + } + if (fitIntercept) sum += intercept + sum + } + val linearLoss = label - margin + + if (math.abs(linearLoss) <= sigma * epsilon) { + lossSum += 0.5 * weight * (sigma + math.pow(linearLoss, 2.0) / sigma) + val linearLossDivSigma = linearLoss / sigma + + features.foreachActive { (index, value) => + if (localFeaturesStd(index) != 0.0 && value != 0.0) { + localGradientSumArray(index) += + -1.0 * weight * linearLossDivSigma * (value / localFeaturesStd(index)) + } + } + if (fitIntercept) { + localGradientSumArray(dim - 2) += -1.0 * weight * linearLossDivSigma + } + localGradientSumArray(dim - 1) += 0.5 * weight * (1.0 - math.pow(linearLossDivSigma, 2.0)) + } else { + val sign = if (linearLoss >= 0) -1.0 else 1.0 + lossSum += 0.5 * weight * + (sigma + 2.0 * epsilon * math.abs(linearLoss) - sigma * epsilon * epsilon) + + features.foreachActive { (index, value) => + if (localFeaturesStd(index) != 0.0 && value != 0.0) { + localGradientSumArray(index) += + weight * sign * epsilon * (value / localFeaturesStd(index)) + } + } + if (fitIntercept) { + localGradientSumArray(dim - 2) += weight * sign * epsilon + } + localGradientSumArray(dim - 1) += 0.5 * weight * (1.0 - epsilon * epsilon) + } + + weightSum += weight + this + } + } +} diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala 
b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index a267bbc874322..a5d57a15317e6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -88,7 +88,8 @@ private[shared] object SharedParamsCodeGen { "during tuning. If set to false, then only the single best sub-model will be available " + "after fitting. If set to true, then all sub-models will be available. Warning: For " + "large models, collecting all sub-models can cause OOMs on the Spark driver", - Some("false"), isExpertParam = true) + Some("false"), isExpertParam = true), + ParamDesc[String]("loss", "the loss function to be optimized", finalFields = false) ) val code = genSharedParams(params) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 0004f085d10f2..13425dacc9f18 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -487,4 +487,21 @@ trait HasCollectSubModels extends Params { /** @group expertGetParam */ final def getCollectSubModels: Boolean = $(collectSubModels) } + +/** + * Trait for shared param loss. This trait may be changed or + * removed between minor versions. + */ +@DeveloperApi +trait HasLoss extends Params { + + /** + * Param for the loss function to be optimized. + * @group param + */ + val loss: Param[String] = new Param[String](this, "loss", "the loss function to be optimized") + + /** @group getParam */ + final def getLoss: String = $(loss) +} // scalastyle:on diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index da6bcf07e4742..a5873d03b4161 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.regression import scala.collection.mutable import breeze.linalg.{DenseVector => BDV} -import breeze.optimize.{CachedDiffFunction, LBFGS => BreezeLBFGS, OWLQN => BreezeOWLQN} +import breeze.optimize.{CachedDiffFunction, LBFGS => BreezeLBFGS, LBFGSB => BreezeLBFGSB, OWLQN => BreezeOWLQN} import breeze.stats.distributions.StudentsT import org.apache.hadoop.fs.Path @@ -32,9 +32,9 @@ import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.linalg.BLAS._ import org.apache.spark.ml.optim.WeightedLeastSquares -import org.apache.spark.ml.optim.aggregator.LeastSquaresAggregator +import org.apache.spark.ml.optim.aggregator.{HuberAggregator, LeastSquaresAggregator} import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction} -import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.evaluation.RegressionMetrics @@ -44,8 +44,9 @@ import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.DoubleType +import org.apache.spark.sql.types.{DataType, DoubleType, 
StructType} import org.apache.spark.storage.StorageLevel +import org.apache.spark.util.VersionUtils.majorMinorVersion /** * Params for linear regression. @@ -53,7 +54,7 @@ import org.apache.spark.storage.StorageLevel private[regression] trait LinearRegressionParams extends PredictorParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol with HasFitIntercept with HasStandardization with HasWeightCol with HasSolver - with HasAggregationDepth { + with HasAggregationDepth with HasLoss { import LinearRegression._ @@ -69,25 +70,105 @@ private[regression] trait LinearRegressionParams extends PredictorParams "The solver algorithm for optimization. Supported options: " + s"${supportedSolvers.mkString(", ")}. (Default auto)", ParamValidators.inArray[String](supportedSolvers)) + + /** + * The loss function to be optimized. + * Supported options: "squaredError" and "huber". + * Default: "squaredError" + * + * @group param + */ + @Since("2.3.0") + final override val loss: Param[String] = new Param[String](this, "loss", "The loss function to" + + s" be optimized. Supported options: ${supportedLosses.mkString(", ")}. (Default squaredError)", + ParamValidators.inArray[String](supportedLosses)) + + /** + * The shape parameter to control the amount of robustness. Must be > 1.0. + * At larger values of epsilon, the huber criterion becomes more similar to least squares + * regression; for small values of epsilon, the criterion is more similar to L1 regression. + * Default is 1.35 to get as much robustness as possible while retaining + * 95% statistical efficiency for normally distributed data. It matches sklearn + * HuberRegressor and is "M" from + * A robust hybrid of lasso and ridge regression. + * Only valid when "loss" is "huber". + * + * @group expertParam + */ + @Since("2.3.0") + final val epsilon = new DoubleParam(this, "epsilon", "The shape parameter to control the " + + "amount of robustness. Must be > 1.0.", ParamValidators.gt(1.0)) + + /** @group getExpertParam */ + @Since("2.3.0") + def getEpsilon: Double = $(epsilon) + + override protected def validateAndTransformSchema( + schema: StructType, + fitting: Boolean, + featuresDataType: DataType): StructType = { + if ($(loss) == Huber) { + require($(solver)!= Normal, "LinearRegression with huber loss doesn't support " + + "normal solver, please change solver to auto or l-bfgs.") + require($(elasticNetParam) == 0.0, "LinearRegression with huber loss only supports " + + s"L2 regularization, but got elasticNetParam = $getElasticNetParam.") + + } + super.validateAndTransformSchema(schema, fitting, featuresDataType) + } } /** * Linear regression. * - * The learning objective is to minimize the squared error, with regularization. - * The specific squared error loss function used is: - * - *
- * $$ - * L = 1/2n ||A coefficients - y||^2^ - * $$ - *
+ * The learning objective is to minimize the specified loss function, with regularization. + * This supports two kinds of loss: + * - squaredError (a.k.a squared loss) + * - huber (a hybrid of squared error for relatively small errors and absolute error for + * relatively large ones, and we estimate the scale parameter from training data) * * This supports multiple types of regularization: * - none (a.k.a. ordinary least squares) * - L2 (ridge regression) * - L1 (Lasso) * - L2 + L1 (elastic net) + * + * The squared error objective function is: + * + *
+ * $$ + * \begin{align} + * \min_{w}\frac{1}{2n}{\sum_{i=1}^n(X_{i}w - y_{i})^{2} + + * \lambda\left[\frac{1-\alpha}{2}{||w||_{2}}^{2} + \alpha{||w||_{1}}\right]} + * \end{align} + * $$ + *
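// Editor's aside, not part of the patch: the elastic-net regularization term of the
// squared-error objective above, written as plain Scala for reference
// (lambda is regParam, alpha is elasticNetParam).
def elasticNetPenalty(w: Array[Double], lambda: Double, alpha: Double): Double = {
  val l1 = w.map(math.abs).sum    // ||w||_1
  val l2 = w.map(x => x * x).sum  // ||w||_2^2
  lambda * ((1.0 - alpha) / 2.0 * l2 + alpha * l1)
}
// alpha = 0.0 gives a pure L2 (ridge) penalty, alpha = 1.0 a pure L1 (lasso) penalty.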
+ * + * The huber objective function is: + * + *
+ * $$ + * \begin{align} + * \min_{w, \sigma}\frac{1}{2n}{\sum_{i=1}^n\left(\sigma + + * H_m\left(\frac{X_{i}w - y_{i}}{\sigma}\right)\sigma\right) + \frac{1}{2}\lambda {||w||_2}^2} + * \end{align} + * $$ + *
+ * + * where + * + *
+ * $$ + * \begin{align} + * H_m(z) = \begin{cases} + * z^2, & \text {if } |z| < \epsilon, \\ + * 2\epsilon|z| - \epsilon^2, & \text{otherwise} + * \end{cases} + * \end{align} + * $$ + *
+ * + * Note: Fitting with huber loss only supports none and L2 regularization. */ @Since("1.3.0") class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String) @@ -142,6 +223,9 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String * For alpha in (0,1), the penalty is a combination of L1 and L2. * Default is 0.0 which is an L2 penalty. * + * Note: Fitting with huber loss only supports None and L2 regularization, + * so throws exception if this param is non-zero value. + * * @group setParam */ @Since("1.4.0") @@ -190,6 +274,8 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String * The Normal Equations solver will be used when possible, but this will automatically fall * back to iterative optimization methods when needed. * + * Note: Fitting with huber loss doesn't support normal solver, + * so throws exception if this param was set with "normal". * @group setParam */ @Since("1.6.0") @@ -208,6 +294,26 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value) setDefault(aggregationDepth -> 2) + /** + * Sets the value of param [[loss]]. + * Default is "squaredError". + * + * @group setParam + */ + @Since("2.3.0") + def setLoss(value: String): this.type = set(loss, value) + setDefault(loss -> SquaredError) + + /** + * Sets the value of param [[epsilon]]. + * Default is 1.35. + * + * @group setExpertParam + */ + @Since("2.3.0") + def setEpsilon(value: Double): this.type = set(epsilon, value) + setDefault(epsilon -> 1.35) + override protected def train(dataset: Dataset[_]): LinearRegressionModel = { // Extract the number of features before deciding optimization solver. val numFeatures = dataset.select(col($(featuresCol))).first().getAs[Vector](0).size @@ -220,12 +326,12 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String } val instr = Instrumentation.create(this, dataset) - instr.logParams(labelCol, featuresCol, weightCol, predictionCol, solver, tol, - elasticNetParam, fitIntercept, maxIter, regParam, standardization, aggregationDepth) + instr.logParams(labelCol, featuresCol, weightCol, predictionCol, solver, tol, elasticNetParam, + fitIntercept, maxIter, regParam, standardization, aggregationDepth, loss, epsilon) instr.logNumFeatures(numFeatures) - if (($(solver) == Auto && - numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == Normal) { + if ($(loss) == SquaredError && (($(solver) == Auto && + numFeatures <= WeightedLeastSquares.MAX_NUM_FEATURES) || $(solver) == Normal)) { // For low dimensional data, WeightedLeastSquares is more efficient since the // training algorithm only requires one pass through the data. (SPARK-10668) @@ -330,12 +436,13 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String // Since we implicitly do the feature scaling when we compute the cost function // to improve the convergence, the effective regParam will be changed. 
- val effectiveRegParam = $(regParam) / yStd + val effectiveRegParam = $(loss) match { + case SquaredError => $(regParam) / yStd + case Huber => $(regParam) + } val effectiveL1RegParam = $(elasticNetParam) * effectiveRegParam val effectiveL2RegParam = (1.0 - $(elasticNetParam)) * effectiveRegParam - val getAggregatorFunc = new LeastSquaresAggregator(yStd, yMean, $(fitIntercept), - bcFeaturesStd, bcFeaturesMean)(_) val getFeaturesStd = (j: Int) => if (j >= 0 && j < numFeatures) featuresStd(j) else 0.0 val regularization = if (effectiveL2RegParam != 0.0) { val shouldApply = (idx: Int) => idx >= 0 && idx < numFeatures @@ -344,33 +451,58 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String } else { None } - val costFun = new RDDLossFunction(instances, getAggregatorFunc, regularization, - $(aggregationDepth)) - val optimizer = if ($(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { - new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) - } else { - val standardizationParam = $(standardization) - def effectiveL1RegFun = (index: Int) => { - if (standardizationParam) { - effectiveL1RegParam + val costFun = $(loss) match { + case SquaredError => + val getAggregatorFunc = new LeastSquaresAggregator(yStd, yMean, $(fitIntercept), + bcFeaturesStd, bcFeaturesMean)(_) + new RDDLossFunction(instances, getAggregatorFunc, regularization, $(aggregationDepth)) + case Huber => + val getAggregatorFunc = new HuberAggregator($(fitIntercept), $(epsilon), bcFeaturesStd)(_) + new RDDLossFunction(instances, getAggregatorFunc, regularization, $(aggregationDepth)) + } + + val optimizer = $(loss) match { + case SquaredError => + if ($(elasticNetParam) == 0.0 || effectiveRegParam == 0.0) { + new BreezeLBFGS[BDV[Double]]($(maxIter), 10, $(tol)) } else { - // If `standardization` is false, we still standardize the data - // to improve the rate of convergence; as a result, we have to - // perform this reverse standardization by penalizing each component - // differently to get effectively the same objective function when - // the training dataset is not standardized. - if (featuresStd(index) != 0.0) effectiveL1RegParam / featuresStd(index) else 0.0 + val standardizationParam = $(standardization) + def effectiveL1RegFun = (index: Int) => { + if (standardizationParam) { + effectiveL1RegParam + } else { + // If `standardization` is false, we still standardize the data + // to improve the rate of convergence; as a result, we have to + // perform this reverse standardization by penalizing each component + // differently to get effectively the same objective function when + // the training dataset is not standardized. 
+ if (featuresStd(index) != 0.0) effectiveL1RegParam / featuresStd(index) else 0.0 + } + } + new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, effectiveL1RegFun, $(tol)) } - } - new BreezeOWLQN[Int, BDV[Double]]($(maxIter), 10, effectiveL1RegFun, $(tol)) + case Huber => + val dim = if ($(fitIntercept)) numFeatures + 2 else numFeatures + 1 + val lowerBounds = BDV[Double](Array.fill(dim)(Double.MinValue)) + // Optimize huber loss in space "\sigma > 0" + lowerBounds(dim - 1) = Double.MinPositiveValue + val upperBounds = BDV[Double](Array.fill(dim)(Double.MaxValue)) + new BreezeLBFGSB(lowerBounds, upperBounds, $(maxIter), 10, $(tol)) + } + + val initialValues = $(loss) match { + case SquaredError => + Vectors.zeros(numFeatures) + case Huber => + val dim = if ($(fitIntercept)) numFeatures + 2 else numFeatures + 1 + Vectors.dense(Array.fill(dim)(1.0)) } - val initialCoefficients = Vectors.zeros(numFeatures) val states = optimizer.iterations(new CachedDiffFunction(costFun), - initialCoefficients.asBreeze.toDenseVector) + initialValues.asBreeze.toDenseVector) - val (coefficients, objectiveHistory) = { + val (coefficients, intercept, scale, objectiveHistory) = { /* Note that in Linear Regression, the objective history (loss + regularization) returned from optimizer is computed in the scaled space given by the following formula. @@ -396,35 +528,54 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String bcFeaturesMean.destroy(blocking = false) bcFeaturesStd.destroy(blocking = false) + val parameters = state.x.toArray.clone() + /* The coefficients are trained in the scaled space; we're converting them back to the original space. */ - val rawCoefficients = state.x.toArray.clone() + val rawCoefficients: Array[Double] = $(loss) match { + case SquaredError => parameters + case Huber => parameters.slice(0, numFeatures) + } + var i = 0 val len = rawCoefficients.length + val multiplier = $(loss) match { + case SquaredError => yStd + case Huber => 1.0 + } while (i < len) { - rawCoefficients(i) *= { if (featuresStd(i) != 0.0) yStd / featuresStd(i) else 0.0 } + rawCoefficients(i) *= { if (featuresStd(i) != 0.0) multiplier / featuresStd(i) else 0.0 } i += 1 } - (Vectors.dense(rawCoefficients).compressed, arrayBuilder.result()) - } + val interceptValue: Double = if ($(fitIntercept)) { + $(loss) match { + case SquaredError => + /* + The intercept of squared error in R's GLMNET is computed using closed form + after the coefficients are converged. See the following discussion for detail. + http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet + */ + yMean - dot(Vectors.dense(rawCoefficients), Vectors.dense(featuresMean)) + case Huber => parameters(numFeatures) + } + } else { + 0.0 + } - /* - The intercept in R's GLMNET is computed using closed form after the coefficients are - converged. See the following discussion for detail. 
- http://stats.stackexchange.com/questions/13617/how-is-the-intercept-computed-in-glmnet - */ - val intercept = if ($(fitIntercept)) { - yMean - dot(coefficients, Vectors.dense(featuresMean)) - } else { - 0.0 + val scaleValue: Double = $(loss) match { + case SquaredError => 1.0 + case Huber => parameters.last + } + + (Vectors.dense(rawCoefficients).compressed, interceptValue, scaleValue, arrayBuilder.result()) } if (handlePersistence) instances.unpersist() - val model = copyValues(new LinearRegressionModel(uid, coefficients, intercept)) + val model = copyValues(new LinearRegressionModel(uid, coefficients, intercept, scale)) // Handle possible missing or invalid prediction columns val (summaryModel, predictionColName) = model.findSummaryModelAndPredictionCol() @@ -471,6 +622,15 @@ object LinearRegression extends DefaultParamsReadable[LinearRegression] { /** Set of solvers that LinearRegression supports. */ private[regression] val supportedSolvers = Array(Auto, Normal, LBFGS) + + /** String name for "squaredError". */ + private[regression] val SquaredError = "squaredError" + + /** String name for "huber". */ + private[regression] val Huber = "huber" + + /** Set of loss function names that LinearRegression supports. */ + private[regression] val supportedLosses = Array(SquaredError, Huber) } /** @@ -480,10 +640,14 @@ object LinearRegression extends DefaultParamsReadable[LinearRegression] { class LinearRegressionModel private[ml] ( @Since("1.4.0") override val uid: String, @Since("2.0.0") val coefficients: Vector, - @Since("1.3.0") val intercept: Double) + @Since("1.3.0") val intercept: Double, + @Since("2.3.0") val scale: Double) extends RegressionModel[Vector, LinearRegressionModel] with LinearRegressionParams with MLWritable { + def this(uid: String, coefficients: Vector, intercept: Double) = + this(uid, coefficients, intercept, 1.0) + private var trainingSummary: Option[LinearRegressionTrainingSummary] = None override val numFeatures: Int = coefficients.size @@ -570,13 +734,13 @@ object LinearRegressionModel extends MLReadable[LinearRegressionModel] { private[LinearRegressionModel] class LinearRegressionModelWriter(instance: LinearRegressionModel) extends MLWriter with Logging { - private case class Data(intercept: Double, coefficients: Vector) + private case class Data(intercept: Double, coefficients: Vector, scale: Double) override protected def saveImpl(path: String): Unit = { // Save metadata and Params DefaultParamsWriter.saveMetadata(instance, path, sc) - // Save model data: intercept, coefficients - val data = Data(instance.intercept, instance.coefficients) + // Save model data: intercept, coefficients, scale + val data = Data(instance.intercept, instance.coefficients, instance.scale) val dataPath = new Path(path, "data").toString sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } @@ -592,11 +756,20 @@ object LinearRegressionModel extends MLReadable[LinearRegressionModel] { val dataPath = new Path(path, "data").toString val data = sparkSession.read.format("parquet").load(dataPath) - val Row(intercept: Double, coefficients: Vector) = - MLUtils.convertVectorColumnsToML(data, "coefficients") - .select("intercept", "coefficients") - .head() - val model = new LinearRegressionModel(metadata.uid, coefficients, intercept) + val (majorVersion, minorVersion) = majorMinorVersion(metadata.sparkVersion) + val model = if (majorVersion < 2 || (majorVersion == 2 && minorVersion <= 2)) { + // Spark 2.2 and before + val Row(intercept: Double, coefficients: Vector) = + 
MLUtils.convertVectorColumnsToML(data, "coefficients") + .select("intercept", "coefficients") + .head() + new LinearRegressionModel(metadata.uid, coefficients, intercept) + } else { + // Spark 2.3 and later + val Row(intercept: Double, coefficients: Vector, scale: Double) = + data.select("intercept", "coefficients", "scale").head() + new LinearRegressionModel(metadata.uid, coefficients, intercept, scale) + } DefaultParamsReader.getAndSetParams(model, metadata) model diff --git a/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HuberAggregatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HuberAggregatorSuite.scala new file mode 100644 index 0000000000000..718ffa230a749 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/optim/aggregator/HuberAggregatorSuite.scala @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.optim.aggregator + +import org.apache.spark.SparkFunSuite +import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} +import org.apache.spark.ml.util.TestingUtils._ +import org.apache.spark.mllib.util.MLlibTestSparkContext + +class HuberAggregatorSuite extends SparkFunSuite with MLlibTestSparkContext { + + import DifferentiableLossAggregatorSuite.getRegressionSummarizers + + @transient var instances: Array[Instance] = _ + @transient var instancesConstantFeature: Array[Instance] = _ + @transient var instancesConstantFeatureFiltered: Array[Instance] = _ + + override def beforeAll(): Unit = { + super.beforeAll() + instances = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)), + Instance(2.0, 0.3, Vectors.dense(4.0, 0.5)) + ) + instancesConstantFeature = Array( + Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), + Instance(1.0, 0.5, Vectors.dense(1.0, 1.0)), + Instance(2.0, 0.3, Vectors.dense(1.0, 0.5)) + ) + instancesConstantFeatureFiltered = Array( + Instance(0.0, 0.1, Vectors.dense(2.0)), + Instance(1.0, 0.5, Vectors.dense(1.0)), + Instance(2.0, 0.3, Vectors.dense(0.5)) + ) + } + + /** Get summary statistics for some data and create a new HuberAggregator. 
*/ + private def getNewAggregator( + instances: Array[Instance], + parameters: Vector, + fitIntercept: Boolean, + epsilon: Double): HuberAggregator = { + val (featuresSummarizer, _) = getRegressionSummarizers(instances) + val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) + val bcFeaturesStd = spark.sparkContext.broadcast(featuresStd) + val bcParameters = spark.sparkContext.broadcast(parameters) + new HuberAggregator(fitIntercept, epsilon, bcFeaturesStd)(bcParameters) + } + + test("aggregator add method should check input size") { + val parameters = Vectors.dense(1.0, 2.0, 3.0, 4.0) + val agg = getNewAggregator(instances, parameters, fitIntercept = true, epsilon = 1.35) + withClue("HuberAggregator features dimension must match parameters dimension") { + intercept[IllegalArgumentException] { + agg.add(Instance(1.0, 1.0, Vectors.dense(2.0))) + } + } + } + + test("negative weight") { + val parameters = Vectors.dense(1.0, 2.0, 3.0, 4.0) + val agg = getNewAggregator(instances, parameters, fitIntercept = true, epsilon = 1.35) + withClue("HuberAggregator does not support negative instance weights.") { + intercept[IllegalArgumentException] { + agg.add(Instance(1.0, -1.0, Vectors.dense(2.0, 1.0))) + } + } + } + + test("check sizes") { + val paramWithIntercept = Vectors.dense(1.0, 2.0, 3.0, 4.0) + val paramWithoutIntercept = Vectors.dense(1.0, 2.0, 4.0) + val aggIntercept = getNewAggregator(instances, paramWithIntercept, + fitIntercept = true, epsilon = 1.35) + val aggNoIntercept = getNewAggregator(instances, paramWithoutIntercept, + fitIntercept = false, epsilon = 1.35) + instances.foreach(aggIntercept.add) + instances.foreach(aggNoIntercept.add) + + assert(aggIntercept.gradient.size === 4) + assert(aggNoIntercept.gradient.size === 3) + } + + test("check correctness") { + val parameters = Vectors.dense(1.0, 2.0, 3.0, 4.0) + val numFeatures = 2 + val (featuresSummarizer, _) = getRegressionSummarizers(instances) + val featuresStd = featuresSummarizer.variance.toArray.map(math.sqrt) + val epsilon = 1.35 + val weightSum = instances.map(_.weight).sum + + val agg = getNewAggregator(instances, parameters, fitIntercept = true, epsilon) + instances.foreach(agg.add) + + // compute expected loss sum + val coefficients = parameters.toArray.slice(0, 2) + val intercept = parameters(2) + val sigma = parameters(3) + val stdCoef = coefficients.indices.map(i => coefficients(i) / featuresStd(i)).toArray + val lossSum = instances.map { case Instance(label, weight, features) => + val margin = BLAS.dot(Vectors.dense(stdCoef), features) + intercept + val linearLoss = label - margin + if (math.abs(linearLoss) <= sigma * epsilon) { + 0.5 * weight * (sigma + math.pow(linearLoss, 2.0) / sigma) + } else { + 0.5 * weight * (sigma + 2.0 * epsilon * math.abs(linearLoss) - sigma * epsilon * epsilon) + } + }.sum + val loss = lossSum / weightSum + + // compute expected gradients + val gradientCoef = new Array[Double](numFeatures + 2) + instances.foreach { case Instance(label, weight, features) => + val margin = BLAS.dot(Vectors.dense(stdCoef), features) + intercept + val linearLoss = label - margin + if (math.abs(linearLoss) <= sigma * epsilon) { + features.toArray.indices.foreach { i => + gradientCoef(i) += + -1.0 * weight * (linearLoss / sigma) * (features(i) / featuresStd(i)) + } + gradientCoef(2) += -1.0 * weight * (linearLoss / sigma) + gradientCoef(3) += 0.5 * weight * (1.0 - math.pow(linearLoss / sigma, 2.0)) + } else { + val sign = if (linearLoss >= 0) -1.0 else 1.0 + features.toArray.indices.foreach { i 
=> + gradientCoef(i) += weight * sign * epsilon * (features(i) / featuresStd(i)) + } + gradientCoef(2) += weight * sign * epsilon + gradientCoef(3) += 0.5 * weight * (1.0 - epsilon * epsilon) + } + } + val gradient = Vectors.dense(gradientCoef.map(_ / weightSum)) + + assert(loss ~== agg.loss relTol 0.01) + assert(gradient ~== agg.gradient relTol 0.01) + } + + test("check with zero standard deviation") { + val parameters = Vectors.dense(1.0, 2.0, 3.0, 4.0) + val parametersFiltered = Vectors.dense(2.0, 3.0, 4.0) + val aggConstantFeature = getNewAggregator(instancesConstantFeature, parameters, + fitIntercept = true, epsilon = 1.35) + val aggConstantFeatureFiltered = getNewAggregator(instancesConstantFeatureFiltered, + parametersFiltered, fitIntercept = true, epsilon = 1.35) + instances.foreach(aggConstantFeature.add) + instancesConstantFeatureFiltered.foreach(aggConstantFeatureFiltered.add) + // constant features should not affect gradient + def validateGradient(grad: Vector, gradFiltered: Vector): Unit = { + assert(grad(0) === 0.0) + assert(grad(1) ~== gradFiltered(0) relTol 0.01) + } + + validateGradient(aggConstantFeature.gradient, aggConstantFeatureFiltered.gradient) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index aec5ac0e75896..9bb2895858f33 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -41,6 +41,7 @@ class LinearRegressionSuite extends MLTest with DefaultReadWriteTest { @transient var datasetWithWeight: DataFrame = _ @transient var datasetWithWeightConstantLabel: DataFrame = _ @transient var datasetWithWeightZeroLabel: DataFrame = _ + @transient var datasetWithOutlier: DataFrame = _ override def beforeAll(): Unit = { super.beforeAll() @@ -107,6 +108,16 @@ class LinearRegressionSuite extends MLTest with DefaultReadWriteTest { Instance(0.0, 3.0, Vectors.dense(2.0, 11.0)), Instance(0.0, 4.0, Vectors.dense(3.0, 13.0)) ), 2).toDF() + + datasetWithOutlier = { + val inlierData = LinearDataGenerator.generateLinearInput( + intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), + xVariance = Array(0.7, 1.2), nPoints = 900, seed, eps = 0.1) + val outlierData = LinearDataGenerator.generateLinearInput( + intercept = -2.1, weights = Array(0.6, -1.2), xMean = Array(0.9, -1.3), + xVariance = Array(1.5, 0.8), nPoints = 100, seed, eps = 0.1) + sc.parallelize(inlierData ++ outlierData, 2).map(_.asML).toDF() + } } /** @@ -127,6 +138,10 @@ class LinearRegressionSuite extends MLTest with DefaultReadWriteTest { datasetWithSparseFeature.rdd.map { case Row(label: Double, features: Vector) => label + "," + features.toArray.mkString(",") }.repartition(1).saveAsTextFile("target/tmp/LinearRegressionSuite/datasetWithSparseFeature") + + datasetWithOutlier.rdd.map { case Row(label: Double, features: Vector) => + label + "," + features.toArray.mkString(",") + }.repartition(1).saveAsTextFile("target/tmp/LinearRegressionSuite/datasetWithOutlier") } test("params") { @@ -144,7 +159,9 @@ class LinearRegressionSuite extends MLTest with DefaultReadWriteTest { assert(lir.getElasticNetParam === 0.0) assert(lir.getFitIntercept) assert(lir.getStandardization) - assert(lir.getSolver == "auto") + assert(lir.getSolver === "auto") + assert(lir.getLoss === "squaredError") + assert(lir.getEpsilon === 1.35) val model = lir.fit(datasetWithDenseFeature) 
MLTestingUtils.checkCopyAndUids(lir, model) @@ -160,11 +177,27 @@ class LinearRegressionSuite extends MLTest with DefaultReadWriteTest { assert(model.getFeaturesCol === "features") assert(model.getPredictionCol === "prediction") assert(model.intercept !== 0.0) + assert(model.scale === 1.0) assert(model.hasParent) val numFeatures = datasetWithDenseFeature.select("features").first().getAs[Vector](0).size assert(model.numFeatures === numFeatures) } + test("linear regression: illegal params") { + withClue("LinearRegression with huber loss only supports L2 regularization") { + intercept[IllegalArgumentException] { + new LinearRegression().setLoss("huber").setElasticNetParam(0.5) + .fit(datasetWithDenseFeature) + } + } + + withClue("LinearRegression with huber loss doesn't support normal solver") { + intercept[IllegalArgumentException] { + new LinearRegression().setLoss("huber").setSolver("normal").fit(datasetWithDenseFeature) + } + } + } + test("linear regression handles singular matrices") { // check for both constant columns with intercept (zero std) and collinear val singularDataConstantColumn = sc.parallelize(Seq( @@ -837,6 +870,7 @@ class LinearRegressionSuite extends MLTest with DefaultReadWriteTest { (1.0, 0.21, true, true) ) + // For squaredError loss for (solver <- Seq("auto", "l-bfgs", "normal"); (elasticNetParam, regParam, fitIntercept, standardization) <- testParams) { val estimator = new LinearRegression() @@ -852,6 +886,22 @@ class LinearRegressionSuite extends MLTest with DefaultReadWriteTest { MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression]( datasetWithStrongNoise.as[LabeledPoint], estimator, modelEquals, seed) } + + // For huber loss + for ((_, regParam, fitIntercept, standardization) <- testParams) { + val estimator = new LinearRegression() + .setLoss("huber") + .setFitIntercept(fitIntercept) + .setStandardization(standardization) + .setRegParam(regParam) + MLTestingUtils.testArbitrarilyScaledWeights[LinearRegressionModel, LinearRegression]( + datasetWithOutlier.as[LabeledPoint], estimator, modelEquals) + MLTestingUtils.testOutliersWithSmallWeights[LinearRegressionModel, LinearRegression]( + datasetWithOutlier.as[LabeledPoint], estimator, numClasses, modelEquals, + outlierRatio = 3) + MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression]( + datasetWithOutlier.as[LabeledPoint], estimator, modelEquals, seed) + } } test("linear regression model with l-bfgs with big feature datasets") { @@ -1004,6 +1054,198 @@ class LinearRegressionSuite extends MLTest with DefaultReadWriteTest { } } } + + test("linear regression (huber loss) with intercept without regularization") { + val trainer1 = (new LinearRegression).setLoss("huber") + .setFitIntercept(true).setStandardization(true) + val trainer2 = (new LinearRegression).setLoss("huber") + .setFitIntercept(true).setStandardization(false) + + val model1 = trainer1.fit(datasetWithOutlier) + val model2 = trainer2.fit(datasetWithOutlier) + + /* + Using the following Python code to load the data and train the model using + scikit-learn package. 
+ + import pandas as pd + import numpy as np + from sklearn.linear_model import HuberRegressor + df = pd.read_csv("path", header = None) + X = df[df.columns[1:3]] + y = np.array(df[df.columns[0]]) + huber = HuberRegressor(fit_intercept=True, alpha=0.0, max_iter=100, epsilon=1.35) + huber.fit(X, y) + + >>> huber.coef_ + array([ 4.68998007, 7.19429011]) + >>> huber.intercept_ + 6.3002404351083037 + >>> huber.scale_ + 0.077810159205220747 + */ + val coefficientsPy = Vectors.dense(4.68998007, 7.19429011) + val interceptPy = 6.30024044 + val scalePy = 0.07781016 + + assert(model1.coefficients ~= coefficientsPy relTol 1E-3) + assert(model1.intercept ~== interceptPy relTol 1E-3) + assert(model1.scale ~== scalePy relTol 1E-3) + + // Without regularization, with or without standardization will converge to the same solution. + assert(model2.coefficients ~= coefficientsPy relTol 1E-3) + assert(model2.intercept ~== interceptPy relTol 1E-3) + assert(model2.scale ~== scalePy relTol 1E-3) + } + + test("linear regression (huber loss) without intercept without regularization") { + val trainer1 = (new LinearRegression).setLoss("huber") + .setFitIntercept(false).setStandardization(true) + val trainer2 = (new LinearRegression).setLoss("huber") + .setFitIntercept(false).setStandardization(false) + + val model1 = trainer1.fit(datasetWithOutlier) + val model2 = trainer2.fit(datasetWithOutlier) + + /* + huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100, epsilon=1.35) + huber.fit(X, y) + + >>> huber.coef_ + array([ 6.71756703, 5.08873222]) + >>> huber.intercept_ + 0.0 + >>> huber.scale_ + 2.5560209922722317 + */ + val coefficientsPy = Vectors.dense(6.71756703, 5.08873222) + val interceptPy = 0.0 + val scalePy = 2.55602099 + + assert(model1.coefficients ~= coefficientsPy relTol 1E-3) + assert(model1.intercept === interceptPy) + assert(model1.scale ~== scalePy relTol 1E-3) + + // Without regularization, with or without standardization will converge to the same solution. + assert(model2.coefficients ~= coefficientsPy relTol 1E-3) + assert(model2.intercept === interceptPy) + assert(model2.scale ~== scalePy relTol 1E-3) + } + + test("linear regression (huber loss) with intercept with L2 regularization") { + val trainer1 = (new LinearRegression).setLoss("huber") + .setFitIntercept(true).setRegParam(0.21).setStandardization(true) + val trainer2 = (new LinearRegression).setLoss("huber") + .setFitIntercept(true).setRegParam(0.21).setStandardization(false) + + val model1 = trainer1.fit(datasetWithOutlier) + val model2 = trainer2.fit(datasetWithOutlier) + + /* + Since scikit-learn HuberRegressor does not support standardization, + we do it manually out of the estimator. 
+ + xStd = np.std(X, axis=0) + scaledX = X / xStd + huber = HuberRegressor(fit_intercept=True, alpha=210, max_iter=100, epsilon=1.35) + huber.fit(scaledX, y) + + >>> np.array(huber.coef_ / xStd) + array([ 1.97732633, 3.38816722]) + >>> huber.intercept_ + 3.7527581430531227 + >>> huber.scale_ + 3.787363673371801 + */ + val coefficientsPy1 = Vectors.dense(1.97732633, 3.38816722) + val interceptPy1 = 3.75275814 + val scalePy1 = 3.78736367 + + assert(model1.coefficients ~= coefficientsPy1 relTol 1E-2) + assert(model1.intercept ~== interceptPy1 relTol 1E-2) + assert(model1.scale ~== scalePy1 relTol 1E-2) + + /* + huber = HuberRegressor(fit_intercept=True, alpha=210, max_iter=100, epsilon=1.35) + huber.fit(X, y) + + >>> huber.coef_ + array([ 1.73346444, 3.63746999]) + >>> huber.intercept_ + 4.3017134790781739 + >>> huber.scale_ + 3.6472742809286793 + */ + val coefficientsPy2 = Vectors.dense(1.73346444, 3.63746999) + val interceptPy2 = 4.30171347 + val scalePy2 = 3.64727428 + + assert(model2.coefficients ~= coefficientsPy2 relTol 1E-3) + assert(model2.intercept ~== interceptPy2 relTol 1E-3) + assert(model2.scale ~== scalePy2 relTol 1E-3) + } + + test("linear regression (huber loss) without intercept with L2 regularization") { + val trainer1 = (new LinearRegression).setLoss("huber") + .setFitIntercept(false).setRegParam(0.21).setStandardization(true) + val trainer2 = (new LinearRegression).setLoss("huber") + .setFitIntercept(false).setRegParam(0.21).setStandardization(false) + + val model1 = trainer1.fit(datasetWithOutlier) + val model2 = trainer2.fit(datasetWithOutlier) + + /* + Since scikit-learn HuberRegressor does not support standardization, + we do it manually out of the estimator. + + xStd = np.std(X, axis=0) + scaledX = X / xStd + huber = HuberRegressor(fit_intercept=False, alpha=210, max_iter=100, epsilon=1.35) + huber.fit(scaledX, y) + + >>> np.array(huber.coef_ / xStd) + array([ 2.59679008, 2.26973102]) + >>> huber.intercept_ + 0.0 + >>> huber.scale_ + 4.5766311924091791 + */ + val coefficientsPy1 = Vectors.dense(2.59679008, 2.26973102) + val interceptPy1 = 0.0 + val scalePy1 = 4.57663119 + + assert(model1.coefficients ~= coefficientsPy1 relTol 1E-2) + assert(model1.intercept === interceptPy1) + assert(model1.scale ~== scalePy1 relTol 1E-2) + + /* + huber = HuberRegressor(fit_intercept=False, alpha=210, max_iter=100, epsilon=1.35) + huber.fit(X, y) + + >>> huber.coef_ + array([ 2.28423908, 2.25196887]) + >>> huber.intercept_ + 0.0 + >>> huber.scale_ + 4.5979643506051753 + */ + val coefficientsPy2 = Vectors.dense(2.28423908, 2.25196887) + val interceptPy2 = 0.0 + val scalePy2 = 4.59796435 + + assert(model2.coefficients ~= coefficientsPy2 relTol 1E-3) + assert(model2.intercept === interceptPy2) + assert(model2.scale ~== scalePy2 relTol 1E-3) + } + + test("huber loss model match squared error for large epsilon") { + val trainer1 = new LinearRegression().setLoss("huber").setEpsilon(1E5) + val model1 = trainer1.fit(datasetWithOutlier) + val trainer2 = new LinearRegression() + val model2 = trainer2.fit(datasetWithOutlier) + assert(model1.coefficients ~== model2.coefficients relTol 1E-3) + assert(model1.intercept ~== model2.intercept relTol 1E-3) + } } object LinearRegressionSuite { diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 9be01f617217b..9902fedb65d59 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -1065,6 +1065,11 @@ object MimaExcludes { // [SPARK-21680][ML][MLLIB]optimzie Vector coompress 
ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.mllib.linalg.Vector.toSparseWithSize"), ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.ml.linalg.Vector.toSparseWithSize") + ) ++ Seq( + // [SPARK-3181][ML]Implement huber loss for LinearRegression. + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasLoss.org$apache$spark$ml$param$shared$HasLoss$_setter_$loss_="), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasLoss.getLoss"), + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasLoss.loss") ) } From f8c7c1f21aa9d1fd38b584ca8c4adf397966e9f7 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Wed, 13 Dec 2017 22:31:39 -0800 Subject: [PATCH 117/356] [SPARK-22732] Add Structured Streaming APIs to DataSourceV2 ## What changes were proposed in this pull request? This PR provides DataSourceV2 API support for structured streaming, including new pieces needed to support continuous processing [SPARK-20928]. High level summary: - DataSourceV2 includes new mixins to support micro-batch and continuous reads and writes. For reads, we accept an optional user specified schema rather than using the ReadSupportWithSchema model, because doing so would severely complicate the interface. - DataSourceV2Reader includes new interfaces to read a specific microbatch or read continuously from a given offset. These follow the same setter pattern as the existing Supports* mixins so that they can work with SupportsScanUnsafeRow. - DataReader (the per-partition reader) has a new subinterface ContinuousDataReader only for continuous processing. This reader has a special method to check progress, and next() blocks for new input rather than returning false. - Offset, an abstract representation of position in a streaming query, is ported to the public API. (Each type of reader will define its own Offset implementation.) - DataSourceV2Writer has a new subinterface ContinuousWriter only for continuous processing. Commits to this interface come tagged with an epoch number, as the execution engine will continue to produce new epoch commits as the task continues indefinitely. Note that this PR does not propose to change the existing DataSourceV2 batch API, or deprecate the existing streaming source/sink internal APIs in spark.sql.execution.streaming. ## How was this patch tested? Toy implementations of the new interfaces with unit tests. Author: Jose Torres Closes #19925 from joseph-torres/continuous-api. 
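For illustration only (this sketch is not part of the patch, and the class and field names are made up), a source-specific offset under the new API only has to produce a stable JSON form: `equals`, `hashCode`, and `toString` are derived from `json()` in the abstract `Offset` class added below, and the reader's `deserializeOffset` turns the logged JSON back into the source's own offset type.

```scala
import org.json4s.DefaultFormats
import org.json4s.jackson.Serialization

import org.apache.spark.sql.sources.v2.reader.Offset

// Hypothetical offset for an imaginary source that tracks one position per partition.
// Only the json()-based equality contract comes from the Offset class in this patch.
case class ExamplePartitionedOffset(positions: Map[Int, Long]) extends Offset {
  implicit val defaultFormats: DefaultFormats = DefaultFormats
  // Equivalent offsets must serialize to identical JSON, because Offset.equals()
  // and hashCode() are defined on this string and it is what the offset log stores.
  override val json: String = Serialization.write(positions)
}
```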
--- .../spark/sql/kafka010/KafkaSource.scala | 1 + .../sql/kafka010/KafkaSourceOffset.scala | 3 +- .../spark/sql/kafka010/KafkaSourceSuite.scala | 1 + .../sql/sources/v2/ContinuousReadSupport.java | 42 +++++ .../sources/v2/ContinuousWriteSupport.java | 54 ++++++ .../sql/sources/v2/DataSourceV2Options.java | 8 + .../sql/sources/v2/MicroBatchReadSupport.java | 52 +++++ .../sources/v2/MicroBatchWriteSupport.java | 58 ++++++ .../v2/reader/ContinuousDataReader.java | 36 ++++ .../sources/v2/reader/ContinuousReader.java | 68 +++++++ .../sources/v2/reader/MicroBatchReader.java | 64 +++++++ .../spark/sql/sources/v2/reader/Offset.java | 60 ++++++ .../sources/v2/reader/PartitionOffset.java | 30 +++ .../sources/v2/writer/ContinuousWriter.java | 41 ++++ .../streaming/BaseStreamingSink.java | 27 +++ .../streaming/BaseStreamingSource.java | 37 ++++ .../streaming/FileStreamSource.scala | 1 + .../streaming/FileStreamSourceOffset.scala | 3 + .../sql/execution/streaming/LongOffset.scala | 2 + .../sql/execution/streaming/Offset.scala | 34 +--- .../sql/execution/streaming/OffsetSeq.scala | 1 + .../execution/streaming/OffsetSeqLog.scala | 1 + .../streaming/RateSourceProvider.scala | 22 ++- .../streaming/RateStreamOffset.scala | 29 +++ .../streaming/RateStreamSourceV2.scala | 162 ++++++++++++++++ .../sql/execution/streaming/Source.scala | 1 + .../execution/streaming/StreamExecution.scala | 1 + .../execution/streaming/StreamProgress.scala | 2 + .../ContinuousRateStreamSource.scala | 152 +++++++++++++++ .../sql/execution/streaming/memory.scala | 1 + .../sql/execution/streaming/memoryV2.scala | 178 ++++++++++++++++++ .../sql/execution/streaming/socket.scala | 1 + .../streaming/MemorySinkV2Suite.scala | 82 ++++++++ .../streaming/OffsetSeqLogSuite.scala | 1 + .../execution/streaming/RateSourceSuite.scala | 1 + .../streaming/RateSourceV2Suite.scala | 155 +++++++++++++++ .../sql/streaming/FileStreamSourceSuite.scala | 1 + .../spark/sql/streaming/OffsetSuite.scala | 3 +- .../spark/sql/streaming/StreamSuite.scala | 1 + .../spark/sql/streaming/StreamTest.scala | 1 + .../streaming/StreamingAggregationSuite.scala | 1 + .../StreamingQueryListenerSuite.scala | 1 + .../sql/streaming/StreamingQuerySuite.scala | 1 + .../test/DataStreamReaderWriterSuite.scala | 1 + .../sql/streaming/util/BlockingSource.scala | 3 +- 45 files changed, 1390 insertions(+), 35 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupport.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousWriteSupport.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupport.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchWriteSupport.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousDataReader.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/PartitionOffset.java create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/ContinuousWriter.java create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSink.java create mode 100644 
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSource.java create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamSourceV2.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memoryV2.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index e9cff04ba5f2e..87f31fcc20ae6 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.kafka010.KafkaSource._ +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala index b5da415b3097e..6e24423df4abc 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala @@ -19,7 +19,8 @@ package org.apache.spark.sql.kafka010 import org.apache.kafka.common.TopicPartition -import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} +import org.apache.spark.sql.execution.streaming.SerializedOffset +import org.apache.spark.sql.sources.v2.reader.Offset /** * An [[Offset]] for the [[KafkaSource]]. 
This one tracks all partitions of subscribed topics and diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 2034b9be07f24..9cac0e5ae7117 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.kafka010.KafkaSourceProvider._ +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupport.java new file mode 100644 index 0000000000000..ae4f85820649f --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupport.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2; + +import java.util.Optional; + +import org.apache.spark.sql.sources.v2.reader.ContinuousReader; +import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; +import org.apache.spark.sql.types.StructType; + +/** + * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to + * provide data reading ability for continuous stream processing. + */ +public interface ContinuousReadSupport extends DataSourceV2 { + /** + * Creates a {@link ContinuousReader} to scan the data from this data source. + * + * @param schema the user provided schema, or empty() if none was provided + * @param checkpointLocation a path to Hadoop FS scratch space that can be used for failure + * recovery. Readers for the same logical source in the same query + * will be given the same checkpointLocation. + * @param options the options for the returned data source reader, which is an immutable + * case-insensitive string-to-string map. 
+ */ + ContinuousReader createContinuousReader(Optional schema, String checkpointLocation, DataSourceV2Options options); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousWriteSupport.java new file mode 100644 index 0000000000000..362d5f52b4d00 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousWriteSupport.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2; + +import java.util.Optional; + +import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.execution.streaming.BaseStreamingSink; +import org.apache.spark.sql.sources.v2.writer.ContinuousWriter; +import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; +import org.apache.spark.sql.streaming.OutputMode; +import org.apache.spark.sql.types.StructType; + +/** + * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to + * provide data writing ability for continuous stream processing. + */ +@InterfaceStability.Evolving +public interface ContinuousWriteSupport extends BaseStreamingSink { + + /** + * Creates an optional {@link ContinuousWriter} to save the data to this data source. Data + * sources can return None if there is no writing needed to be done. + * + * @param queryId A unique string for the writing query. It's possible that there are many writing + * queries running at the same time, and the returned {@link DataSourceV2Writer} + * can use this id to distinguish itself from others. + * @param schema the schema of the data to be written. + * @param mode the output mode which determines what successive epoch output means to this + * sink, please refer to {@link OutputMode} for more details. + * @param options the options for the returned data source writer, which is an immutable + * case-insensitive string-to-string map. 
+ */ + Optional createContinuousWriter( + String queryId, + StructType schema, + OutputMode mode, + DataSourceV2Options options); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java index e98c04517c3db..ddc2acca693ac 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/DataSourceV2Options.java @@ -36,6 +36,10 @@ private String toLowerCase(String key) { return key.toLowerCase(Locale.ROOT); } + public static DataSourceV2Options empty() { + return new DataSourceV2Options(new HashMap<>()); + } + public DataSourceV2Options(Map originalMap) { keyLowerCasedMap = new HashMap<>(originalMap.size()); for (Map.Entry entry : originalMap.entrySet()) { @@ -43,6 +47,10 @@ public DataSourceV2Options(Map originalMap) { } } + public Map asMap() { + return new HashMap<>(keyLowerCasedMap); + } + /** * Returns the option value to which the specified key is mapped, case-insensitively. */ diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupport.java new file mode 100644 index 0000000000000..442cad029d211 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupport.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2; + +import java.util.Optional; + +import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.sources.v2.reader.MicroBatchReader; +import org.apache.spark.sql.types.StructType; + +/** + * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to + * provide streaming micro-batch data reading ability. + */ +@InterfaceStability.Evolving +public interface MicroBatchReadSupport extends DataSourceV2 { + /** + * Creates a {@link MicroBatchReader} to read batches of data from this data source in a + * streaming query. + * + * The execution engine will create a micro-batch reader at the start of a streaming query, + * alternate calls to setOffsetRange and createReadTasks for each batch to process, and then + * call stop() when the execution is complete. Note that a single query may have multiple + * executions due to restart or failure recovery. + * + * @param schema the user provided schema, or empty() if none was provided + * @param checkpointLocation a path to Hadoop FS scratch space that can be used for failure + * recovery. Readers for the same logical source in the same query + * will be given the same checkpointLocation. 
+ * @param options the options for the returned data source reader, which is an immutable + * case-insensitive string-to-string map. + */ + MicroBatchReader createMicroBatchReader( + Optional<StructType> schema, + String checkpointLocation, + DataSourceV2Options options); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchWriteSupport.java new file mode 100644 index 0000000000000..63640779b955c --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchWriteSupport.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2; + +import java.util.Optional; + +import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.execution.streaming.BaseStreamingSink; +import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; +import org.apache.spark.sql.streaming.OutputMode; +import org.apache.spark.sql.types.StructType; + +/** + * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to + * provide data writing ability and save the data from a microbatch to the data source. + */ +@InterfaceStability.Evolving +public interface MicroBatchWriteSupport extends BaseStreamingSink { + + /** + * Creates an optional {@link DataSourceV2Writer} to save the data to this data source. Data + * sources can return None if there is no writing needed to be done. + * + * @param queryId A unique string for the writing query. It's possible that there are many writing + * queries running at the same time, and the returned {@link DataSourceV2Writer} + * can use this id to distinguish itself from others. + * @param epochId The unique numeric ID of the batch within this writing query. This is an + * incrementing counter representing a consistent set of data; the same batch may + * be started multiple times in failure recovery scenarios, but it will always + * contain the same records. + * @param schema the schema of the data to be written. + * @param mode the output mode which determines what successive batch output means to this + * sink, please refer to {@link OutputMode} for more details. + * @param options the options for the returned data source writer, which is an immutable + * case-insensitive string-to-string map.
+ */ + Optional createMicroBatchWriter( + String queryId, + long epochId, + StructType schema, + OutputMode mode, + DataSourceV2Options options); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousDataReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousDataReader.java new file mode 100644 index 0000000000000..11b99a93f1494 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousDataReader.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.reader; + +import org.apache.spark.sql.sources.v2.reader.PartitionOffset; + +import java.io.IOException; + +/** + * A variation on {@link DataReader} for use with streaming in continuous processing mode. + */ +public interface ContinuousDataReader extends DataReader { + /** + * Get the offset of the current record, or the start offset if no records have been read. + * + * The execution engine will call this method along with get() to keep track of the current + * offset. When an epoch ends, the offset of the previous record in each partition will be saved + * as a restart checkpoint. + */ + PartitionOffset getOffset(); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java new file mode 100644 index 0000000000000..1baf82c2df762 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.reader; + +import org.apache.spark.sql.sources.v2.reader.PartitionOffset; +import org.apache.spark.sql.execution.streaming.BaseStreamingSource; + +import java.util.Optional; + +/** + * A mix-in interface for {@link DataSourceV2Reader}. 
Data source readers can implement this + * interface to allow reading in a continuous processing mode stream. + * + * Implementations must ensure each read task output is a {@link ContinuousDataReader}. + */ +public interface ContinuousReader extends BaseStreamingSource, DataSourceV2Reader { + /** + * Merge offsets coming from {@link ContinuousDataReader} instances in each partition to + * a single global offset. + */ + Offset mergeOffsets(PartitionOffset[] offsets); + + /** + * Deserialize a JSON string into an Offset of the implementation-defined offset type. + * @throws IllegalArgumentException if the JSON does not encode a valid offset for this reader + */ + Offset deserializeOffset(String json); + + /** + * Set the desired start offset for read tasks created from this reader. The scan will start + * from the first record after the provided offset, or from an implementation-defined inferred + * starting point if no offset is provided. + */ + void setOffset(Optional start); + + /** + * Return the specified or inferred start offset for this reader. + * + * @throws IllegalStateException if setOffset has not been called + */ + Offset getStartOffset(); + + /** + * The execution engine will call this method in every epoch to determine if new read tasks need + * to be generated, which may be required if for example the underlying source system has had + * partitions added or removed. + * + * If true, the query will be shut down and restarted with a new reader. + */ + default boolean needsReconfiguration() { + return false; + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java new file mode 100644 index 0000000000000..438e3f55b7bcf --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.reader; + +import org.apache.spark.sql.sources.v2.reader.Offset; +import org.apache.spark.sql.execution.streaming.BaseStreamingSource; + +import java.util.Optional; + +/** + * A mix-in interface for {@link DataSourceV2Reader}. Data source readers can implement this + * interface to indicate they allow micro-batch streaming reads. + */ +public interface MicroBatchReader extends DataSourceV2Reader, BaseStreamingSource { + /** + * Set the desired offset range for read tasks created from this reader. Read tasks will + * generate only data within (`start`, `end`]; that is, from the first record after `start` to + * the record with offset `end`. + * + * @param start The initial offset to scan from. 
If not specified, scan from an + * implementation-specified start point, such as the earliest available record. + * @param end The last offset to include in the scan. If not specified, scan up to an + * implementation-defined endpoint, such as the last available offset + * or the start offset plus a target batch size. + */ + void setOffsetRange(Optional start, Optional end); + + /** + * Returns the specified (if explicitly set through setOffsetRange) or inferred start offset + * for this reader. + * + * @throws IllegalStateException if setOffsetRange has not been called + */ + Offset getStartOffset(); + + /** + * Return the specified (if explicitly set through setOffsetRange) or inferred end offset + * for this reader. + * + * @throws IllegalStateException if setOffsetRange has not been called + */ + Offset getEndOffset(); + + /** + * Deserialize a JSON string into an Offset of the implementation-defined offset type. + * @throws IllegalArgumentException if the JSON does not encode a valid offset for this reader + */ + Offset deserializeOffset(String json); +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java new file mode 100644 index 0000000000000..1ebd35356f1a3 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.reader; + +/** + * An abstract representation of progress through a [[MicroBatchReader]] or [[ContinuousReader]]. + * During execution, Offsets provided by the data source implementation will be logged and used as + * restart checkpoints. Sources should provide an Offset implementation which they can use to + * reconstruct the stream position where the offset was taken. + */ +public abstract class Offset { + /** + * A JSON-serialized representation of an Offset that is + * used for saving offsets to the offset log. + * Note: We assume that equivalent/equal offsets serialize to + * identical JSON strings. + * + * @return JSON string encoding + */ + public abstract String json(); + + /** + * Equality based on JSON string representation. We leverage the + * JSON representation for normalization between the Offset's + * in memory and on disk representations. 
+ */ + @Override + public boolean equals(Object obj) { + if (obj instanceof Offset) { + return this.json().equals(((Offset) obj).json()); + } else { + return false; + } + } + + @Override + public int hashCode() { + return this.json().hashCode(); + } + + @Override + public String toString() { + return this.json(); + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/PartitionOffset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/PartitionOffset.java new file mode 100644 index 0000000000000..07826b6688476 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/PartitionOffset.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.reader; + +import java.io.Serializable; + +/** + * Used for per-partition offsets in continuous processing. ContinuousReader implementations will + * provide a method to merge these into a global Offset. + * + * These offsets must be serializable. + */ +public interface PartitionOffset extends Serializable { + +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/ContinuousWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/ContinuousWriter.java new file mode 100644 index 0000000000000..618f47ed79ca5 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/ContinuousWriter.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2.writer; + +import org.apache.spark.annotation.InterfaceStability; + +/** + * A {@link DataSourceV2Writer} for use with continuous stream processing. + */ +@InterfaceStability.Evolving +public interface ContinuousWriter extends DataSourceV2Writer { + /** + * Commits this writing job for the specified epoch with a list of commit messages. The commit + * messages are collected from successful data writers and are produced by + * {@link DataWriter#commit()}. 
+ * + * If this method fails (by throwing an exception), this writing job is considered to have been + * failed, and the execution engine will attempt to call {@link #abort(WriterCommitMessage[])}. + */ + void commit(long epochId, WriterCommitMessage[] messages); + + default void commit(WriterCommitMessage[] messages) { + throw new UnsupportedOperationException( + "Commit without epoch should not be called with ContinuousWriter"); + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSink.java b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSink.java new file mode 100644 index 0000000000000..ac96c2765368f --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSink.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming; + +/** + * The shared interface between V1 and V2 streaming sinks. + * + * This is a temporary interface for compatibility during migration. It should not be implemented + * directly, and will be removed in future versions. + */ +public interface BaseStreamingSink { +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSource.java b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSource.java new file mode 100644 index 0000000000000..3a02cbfe7afe3 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSource.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming; + +import org.apache.spark.sql.sources.v2.reader.Offset; + +/** + * The shared interface between V1 streaming sources and V2 streaming readers. + * + * This is a temporary interface for compatibility during migration. It should not be implemented + * directly, and will be removed in future versions. 
+ */ +public interface BaseStreamingSource { + /** + * Informs the source that Spark has completed processing all data for offsets less than or + * equal to `end` and will only request offsets greater than `end` in the future. + */ + void commit(Offset end); + + /** Stop this source and free any resources it has allocated. */ + void stop(); +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index 0debd7db84757..a33b785126765 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -27,6 +27,7 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import org.apache.spark.sql.execution.datasources.{DataSource, InMemoryFileIndex, LogicalRelation} +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceOffset.scala index 06d0fe6c18c1e..431e5b99e3e98 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceOffset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceOffset.scala @@ -22,8 +22,11 @@ import scala.util.control.Exception._ import org.json4s.NoTypeHints import org.json4s.jackson.Serialization +import org.apache.spark.sql.sources.v2.reader.Offset + /** * Offset for the [[FileStreamSource]]. + * * @param logOffset Position in the [[FileStreamSourceLog]] */ case class FileStreamSourceOffset(logOffset: Long) extends Offset { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala index 5f0b195fcfcb8..7ea31462ca7b0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.streaming +import org.apache.spark.sql.sources.v2.reader.Offset + /** * A simple offset for sources that produce a single linear stream of data. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.scala index 4efcee0f8f9d6..73f0c6221c5c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.scala @@ -17,38 +17,8 @@ package org.apache.spark.sql.execution.streaming -/** - * An offset is a monotonically increasing metric used to track progress in the computation of a - * stream. Since offsets are retrieved from a [[Source]] by a single thread, we know the global - * ordering of two [[Offset]] instances. We do assume that if two offsets are `equal` then no - * new data has arrived. - */ -abstract class Offset { - - /** - * Equality based on JSON string representation. We leverage the - * JSON representation for normalization between the Offset's - * in memory and on disk representations. 
- */ - override def equals(obj: Any): Boolean = obj match { - case o: Offset => this.json == o.json - case _ => false - } - - override def hashCode(): Int = this.json.hashCode +import org.apache.spark.sql.sources.v2.reader.Offset - override def toString(): String = this.json.toString - - /** - * A JSON-serialized representation of an Offset that is - * used for saving offsets to the offset log. - * Note: We assume that equivalent/equal offsets serialize to - * identical JSON strings. - * - * @return JSON string encoding - */ - def json: String -} /** * Used when loading a JSON serialized offset from external storage. @@ -58,3 +28,5 @@ abstract class Offset { * that accepts a [[SerializedOffset]] for doing the conversion. */ case class SerializedOffset(override val json: String) extends Offset + + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala index 4e0a468b962a2..dcc5935890c8d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala @@ -23,6 +23,7 @@ import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.RuntimeConfig import org.apache.spark.sql.internal.SQLConf.{SHUFFLE_PARTITIONS, STATE_STORE_PROVIDER_CLASS} +import org.apache.spark.sql.sources.v2.reader.Offset /** * An ordered collection of offsets, used to track the progress of processing data from one or more diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala index e3f4abcf9f1dc..bfdbc65296165 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala @@ -24,6 +24,7 @@ import java.nio.charset.StandardCharsets._ import scala.io.{Source => IOSource} import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.sources.v2.reader.Offset /** * This class is used to log offsets to persistent files in HDFS. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala index 077a4778e34a8..50671a46599e6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.streaming import java.io._ import java.nio.charset.StandardCharsets +import java.util.Optional import java.util.concurrent.TimeUnit import org.apache.commons.io.IOUtils @@ -28,7 +29,10 @@ import org.apache.spark.network.util.JavaUtils import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} +import org.apache.spark.sql.execution.streaming.continuous.ContinuousRateStreamReader import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} +import org.apache.spark.sql.sources.v2._ +import org.apache.spark.sql.sources.v2.reader.{ContinuousReader, MicroBatchReader, Offset} import org.apache.spark.sql.types._ import org.apache.spark.util.{ManualClock, SystemClock} @@ -46,7 +50,8 @@ import org.apache.spark.util.{ManualClock, SystemClock} * generated rows. The source will try its best to reach `rowsPerSecond`, but the query may * be resource constrained, and `numPartitions` can be tweaked to help reach the desired speed. */ -class RateSourceProvider extends StreamSourceProvider with DataSourceRegister { +class RateSourceProvider extends StreamSourceProvider with DataSourceRegister + with DataSourceV2 with MicroBatchReadSupport with ContinuousReadSupport{ override def sourceSchema( sqlContext: SQLContext, @@ -100,6 +105,21 @@ class RateSourceProvider extends StreamSourceProvider with DataSourceRegister { params.get("useManualClock").map(_.toBoolean).getOrElse(false) // Only for testing ) } + + override def createMicroBatchReader( + schema: Optional[StructType], + checkpointLocation: String, + options: DataSourceV2Options): MicroBatchReader = { + new RateStreamV2Reader(options) + } + + override def createContinuousReader( + schema: Optional[StructType], + checkpointLocation: String, + options: DataSourceV2Options): ContinuousReader = { + new ContinuousRateStreamReader(options) + } + override def shortName(): String = "rate" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala new file mode 100644 index 0000000000000..13679dfbe446b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +import org.json4s.DefaultFormats +import org.json4s.jackson.Serialization + +import org.apache.spark.sql.sources.v2.reader.Offset + +case class RateStreamOffset(partitionToValueAndRunTimeMs: Map[Int, (Long, Long)]) + extends Offset { + implicit val defaultFormats: DefaultFormats = DefaultFormats + override val json = Serialization.write(partitionToValueAndRunTimeMs) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamSourceV2.scala new file mode 100644 index 0000000000000..102551c238bfb --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamSourceV2.scala @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +import java.util.Optional + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.json4s.DefaultFormats +import org.json4s.jackson.Serialization + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} +import org.apache.spark.util.SystemClock + +class RateStreamV2Reader(options: DataSourceV2Options) + extends MicroBatchReader { + implicit val defaultFormats: DefaultFormats = DefaultFormats + + val clock = new SystemClock + + private val numPartitions = + options.get(RateStreamSourceV2.NUM_PARTITIONS).orElse("5").toInt + private val rowsPerSecond = + options.get(RateStreamSourceV2.ROWS_PER_SECOND).orElse("6").toLong + + // The interval (in milliseconds) between rows in each partition. + // e.g. if there are 4 global rows per second, and 2 partitions, each partition + // should output rows every (1000 * 2 / 4) = 500 ms. 
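+  // With the defaults above (numPartitions = 5, rowsPerSecond = 6), this works out to
+  // (1000 * 5) / 6 = 833 ms between rows in each partition.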
+ private val msPerPartitionBetweenRows = (1000 * numPartitions) / rowsPerSecond + + override def readSchema(): StructType = { + StructType( + StructField("timestamp", TimestampType, false) :: + StructField("value", LongType, false) :: Nil) + } + + val creationTimeMs = clock.getTimeMillis() + + private var start: RateStreamOffset = _ + private var end: RateStreamOffset = _ + + override def setOffsetRange(start: Optional[Offset], end: Optional[Offset]): Unit = { + this.start = start.orElse( + RateStreamSourceV2.createInitialOffset(numPartitions, creationTimeMs)) + .asInstanceOf[RateStreamOffset] + + this.end = end.orElse { + val currentTime = clock.getTimeMillis() + RateStreamOffset( + this.start.partitionToValueAndRunTimeMs.map { + case startOffset @ (part, (currentVal, currentReadTime)) => + // Calculate the number of rows we should advance in this partition (based on the + // current time), and output a corresponding offset. + val readInterval = currentTime - currentReadTime + val numNewRows = readInterval / msPerPartitionBetweenRows + if (numNewRows <= 0) { + startOffset + } else { + (part, + (currentVal + (numNewRows * numPartitions), + currentReadTime + (numNewRows * msPerPartitionBetweenRows))) + } + } + ) + }.asInstanceOf[RateStreamOffset] + } + + override def getStartOffset(): Offset = { + if (start == null) throw new IllegalStateException("start offset not set") + start + } + override def getEndOffset(): Offset = { + if (end == null) throw new IllegalStateException("end offset not set") + end + } + + override def deserializeOffset(json: String): Offset = { + RateStreamOffset(Serialization.read[Map[Int, (Long, Long)]](json)) + } + + override def createReadTasks(): java.util.List[ReadTask[Row]] = { + val startMap = start.partitionToValueAndRunTimeMs + val endMap = end.partitionToValueAndRunTimeMs + endMap.keys.toSeq.map { part => + val (endVal, _) = endMap(part) + val (startVal, startTimeMs) = startMap(part) + + val packedRows = mutable.ListBuffer[(Long, Long)]() + var outVal = startVal + numPartitions + var outTimeMs = startTimeMs + msPerPartitionBetweenRows + while (outVal <= endVal) { + packedRows.append((outTimeMs, outVal)) + outVal += numPartitions + outTimeMs += msPerPartitionBetweenRows + } + + RateStreamBatchTask(packedRows).asInstanceOf[ReadTask[Row]] + }.toList.asJava + } + + override def commit(end: Offset): Unit = {} + override def stop(): Unit = {} +} + +case class RateStreamBatchTask(vals: Seq[(Long, Long)]) extends ReadTask[Row] { + override def createDataReader(): DataReader[Row] = new RateStreamBatchReader(vals) +} + +class RateStreamBatchReader(vals: Seq[(Long, Long)]) extends DataReader[Row] { + var currentIndex = -1 + + override def next(): Boolean = { + // Return true as long as the new index is in the seq. + currentIndex += 1 + currentIndex < vals.size + } + + override def get(): Row = { + Row( + DateTimeUtils.toJavaTimestamp(DateTimeUtils.fromMillis(vals(currentIndex)._1)), + vals(currentIndex)._2) + } + + override def close(): Unit = {} +} + +object RateStreamSourceV2 { + val NUM_PARTITIONS = "numPartitions" + val ROWS_PER_SECOND = "rowsPerSecond" + + private[sql] def createInitialOffset(numPartitions: Int, creationTimeMs: Long) = { + RateStreamOffset( + Range(0, numPartitions).map { i => + // Note that the starting offset is exclusive, so we have to decrement the starting value + // by the increment that will later be applied. The first row output in each + // partition will have a value equal to the partition index. 
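+        // For example, with numPartitions = 4, partition 1 starts at (1 - 4) = -3; the first
+        // advance of +numPartitions in a read task then yields 1, the partition index.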
+ (i, + ((i - numPartitions).toLong, + creationTimeMs)) + }.toMap) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala index 311942f6dbd84..dbb408ffc98d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 406560c260f07..16063c02ce06b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -39,6 +39,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} import org.apache.spark.sql.execution.command.StreamingExplainCommand import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming._ import org.apache.spark.util.{Clock, UninterruptibleThread, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala index a3f3662e6f4c9..770db401c9fd7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.execution.streaming import scala.collection.{immutable, GenTraversableOnce} +import org.apache.spark.sql.sources.v2.reader.Offset + /** * A helper class that looks like a Map[Source, Offset]. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala new file mode 100644 index 0000000000000..77fc26730e52c --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
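+      // e.g. starting from the initial offset with numPartitions = 5, partition 2 begins at -3
+      // and then emits 2, 7, 12, ..., so the partitions together cover every value exactly once.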
+ */ + +package org.apache.spark.sql.execution.streaming.continuous + +import scala.collection.JavaConverters._ + +import org.json4s.DefaultFormats +import org.json4s.jackson.Serialization + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} + +case class ContinuousRateStreamPartitionOffset( + partition: Int, currentValue: Long, currentTimeMs: Long) extends PartitionOffset + +class ContinuousRateStreamReader(options: DataSourceV2Options) + extends ContinuousReader { + implicit val defaultFormats: DefaultFormats = DefaultFormats + + val creationTime = System.currentTimeMillis() + + val numPartitions = options.get(RateStreamSourceV2.NUM_PARTITIONS).orElse("5").toInt + val rowsPerSecond = options.get(RateStreamSourceV2.ROWS_PER_SECOND).orElse("6").toLong + val perPartitionRate = rowsPerSecond.toDouble / numPartitions.toDouble + + override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { + assert(offsets.length == numPartitions) + val tuples = offsets.map { + case ContinuousRateStreamPartitionOffset(i, currVal, nextRead) => (i, (currVal, nextRead)) + } + RateStreamOffset(Map(tuples: _*)) + } + + override def deserializeOffset(json: String): Offset = { + RateStreamOffset(Serialization.read[Map[Int, (Long, Long)]](json)) + } + + override def readSchema(): StructType = RateSourceProvider.SCHEMA + + private var offset: Offset = _ + + override def setOffset(offset: java.util.Optional[Offset]): Unit = { + this.offset = offset.orElse(RateStreamSourceV2.createInitialOffset(numPartitions, creationTime)) + } + + override def getStartOffset(): Offset = offset + + override def createReadTasks(): java.util.List[ReadTask[Row]] = { + val partitionStartMap = offset match { + case off: RateStreamOffset => off.partitionToValueAndRunTimeMs + case off => + throw new IllegalArgumentException( + s"invalid offset type ${off.getClass()} for ContinuousRateSource") + } + if (partitionStartMap.keySet.size != numPartitions) { + throw new IllegalArgumentException( + s"The previous run contained ${partitionStartMap.keySet.size} partitions, but" + + s" $numPartitions partitions are currently configured. The numPartitions option" + + " cannot be changed.") + } + + Range(0, numPartitions).map { i => + val start = partitionStartMap(i) + // Have each partition advance by numPartitions each row, with starting points staggered + // by their partition index. 
+ RateStreamReadTask( + start._1, // starting row value + start._2, // starting time in ms + i, + numPartitions, + perPartitionRate) + .asInstanceOf[ReadTask[Row]] + }.asJava + } + + override def commit(end: Offset): Unit = {} + override def stop(): Unit = {} + +} + +case class RateStreamReadTask( + startValue: Long, + startTimeMs: Long, + partitionIndex: Int, + increment: Long, + rowsPerSecond: Double) + extends ReadTask[Row] { + override def createDataReader(): DataReader[Row] = + new RateStreamDataReader(startValue, startTimeMs, partitionIndex, increment, rowsPerSecond) +} + +class RateStreamDataReader( + startValue: Long, + startTimeMs: Long, + partitionIndex: Int, + increment: Long, + rowsPerSecond: Double) + extends ContinuousDataReader[Row] { + private var nextReadTime: Long = startTimeMs + private val readTimeIncrement: Long = (1000 / rowsPerSecond).toLong + + private var currentValue = startValue + private var currentRow: Row = null + + override def next(): Boolean = { + currentValue += increment + nextReadTime += readTimeIncrement + + try { + while (System.currentTimeMillis < nextReadTime) { + Thread.sleep(nextReadTime - System.currentTimeMillis) + } + } catch { + case _: InterruptedException => + // Someone's trying to end the task; just let them. + return false + } + + currentRow = Row( + DateTimeUtils.toJavaTimestamp(DateTimeUtils.fromMillis(nextReadTime)), + currentValue) + + true + } + + override def get: Row = currentRow + + override def close(): Unit = {} + + override def getOffset(): PartitionOffset = + ContinuousRateStreamPartitionOffset(partitionIndex, currentValue, nextReadTime) +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index 3041d4d703cb4..db0717510a2cb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, Statistics} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memoryV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memoryV2.scala new file mode 100644 index 0000000000000..437040cc12472 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memoryV2.scala @@ -0,0 +1,178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +import javax.annotation.concurrent.GuardedBy + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.util.control.NonFatal + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Complete, Update} +import org.apache.spark.sql.sources.v2.{ContinuousWriteSupport, DataSourceV2, DataSourceV2Options, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.types.StructType + +/** + * A sink that stores the results in memory. This [[Sink]] is primarily intended for use in unit + * tests and does not provide durability. + */ +class MemorySinkV2 extends DataSourceV2 + with MicroBatchWriteSupport with ContinuousWriteSupport with Logging { + + override def createMicroBatchWriter( + queryId: String, + batchId: Long, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): java.util.Optional[DataSourceV2Writer] = { + java.util.Optional.of(new MemoryWriter(this, batchId, mode)) + } + + override def createContinuousWriter( + queryId: String, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): java.util.Optional[ContinuousWriter] = { + java.util.Optional.of(new ContinuousMemoryWriter(this, mode)) + } + + private case class AddedData(batchId: Long, data: Array[Row]) + + /** An order list of batches that have been written to this [[Sink]]. */ + @GuardedBy("this") + private val batches = new ArrayBuffer[AddedData]() + + /** Returns all rows that are stored in this [[Sink]]. 
*/ + def allData: Seq[Row] = synchronized { + batches.flatMap(_.data) + } + + def latestBatchId: Option[Long] = synchronized { + batches.lastOption.map(_.batchId) + } + + def latestBatchData: Seq[Row] = synchronized { + batches.lastOption.toSeq.flatten(_.data) + } + + def toDebugString: String = synchronized { + batches.map { case AddedData(batchId, data) => + val dataStr = try data.mkString(" ") catch { + case NonFatal(e) => "[Error converting to string]" + } + s"$batchId: $dataStr" + }.mkString("\n") + } + + def write(batchId: Long, outputMode: OutputMode, newRows: Array[Row]): Unit = { + val notCommitted = synchronized { + latestBatchId.isEmpty || batchId > latestBatchId.get + } + if (notCommitted) { + logDebug(s"Committing batch $batchId to $this") + outputMode match { + case Append | Update => + val rows = AddedData(batchId, newRows) + synchronized { batches += rows } + + case Complete => + val rows = AddedData(batchId, newRows) + synchronized { + batches.clear() + batches += rows + } + + case _ => + throw new IllegalArgumentException( + s"Output mode $outputMode is not supported by MemorySink") + } + } else { + logDebug(s"Skipping already committed batch: $batchId") + } + } + + def clear(): Unit = synchronized { + batches.clear() + } + + override def toString(): String = "MemorySink" +} + +case class MemoryWriterCommitMessage(partition: Int, data: Seq[Row]) extends WriterCommitMessage {} + +class MemoryWriter(sink: MemorySinkV2, batchId: Long, outputMode: OutputMode) + extends DataSourceV2Writer with Logging { + + override def createWriterFactory: MemoryWriterFactory = MemoryWriterFactory(outputMode) + + def commit(messages: Array[WriterCommitMessage]): Unit = { + val newRows = messages.flatMap { + case message: MemoryWriterCommitMessage => message.data + } + sink.write(batchId, outputMode, newRows) + } + + override def abort(messages: Array[WriterCommitMessage]): Unit = { + // Don't accept any of the new input. + } +} + +class ContinuousMemoryWriter(val sink: MemorySinkV2, outputMode: OutputMode) + extends ContinuousWriter { + + override def createWriterFactory: MemoryWriterFactory = MemoryWriterFactory(outputMode) + + override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { + val newRows = messages.flatMap { + case message: MemoryWriterCommitMessage => message.data + } + sink.write(epochId, outputMode, newRows) + } + + override def abort(messages: Array[WriterCommitMessage]): Unit = { + // Don't accept any of the new input. 
+ } +} + +case class MemoryWriterFactory(outputMode: OutputMode) extends DataWriterFactory[Row] { + def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] = { + new MemoryDataWriter(partitionId, outputMode) + } +} + +class MemoryDataWriter(partition: Int, outputMode: OutputMode) + extends DataWriter[Row] with Logging { + + private val data = mutable.Buffer[Row]() + + override def write(row: Row): Unit = { + data.append(row) + } + + override def commit(): MemoryWriterCommitMessage = { + val msg = MemoryWriterCommitMessage(partition, data.clone()) + data.clear() + msg + } + + override def abort(): Unit = {} +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala index 0b22cbc46e6bf..440cae016a173 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala @@ -31,6 +31,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType} import org.apache.spark.unsafe.types.UTF8String diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala new file mode 100644 index 0000000000000..be4b490754986 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +import org.scalatest.BeforeAndAfter + +import org.apache.spark.sql.Row +import org.apache.spark.sql.streaming.{OutputMode, StreamTest} + +class MemorySinkV2Suite extends StreamTest with BeforeAndAfter { + test("data writer") { + val partition = 1234 + val writer = new MemoryDataWriter(partition, OutputMode.Append()) + writer.write(Row(1)) + writer.write(Row(2)) + writer.write(Row(44)) + val msg = writer.commit() + assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44)) + assert(msg.partition == partition) + + // Buffer should be cleared, so repeated commits should give empty. 
+ assert(writer.commit().data.isEmpty) + } + + test("continuous writer") { + val sink = new MemorySinkV2 + val writer = new ContinuousMemoryWriter(sink, OutputMode.Append()) + writer.commit(0, + Array( + MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))), + MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))), + MemoryWriterCommitMessage(2, Seq(Row(6), Row(7))) + )) + assert(sink.latestBatchId.contains(0)) + assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7)) + writer.commit(19, + Array( + MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))), + MemoryWriterCommitMessage(0, Seq(Row(33))) + )) + assert(sink.latestBatchId.contains(19)) + assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33)) + + assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33)) + } + + test("microbatch writer") { + val sink = new MemorySinkV2 + new MemoryWriter(sink, 0, OutputMode.Append()).commit( + Array( + MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))), + MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))), + MemoryWriterCommitMessage(2, Seq(Row(6), Row(7))) + )) + assert(sink.latestBatchId.contains(0)) + assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7)) + new MemoryWriter(sink, 19, OutputMode.Append()).commit( + Array( + MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))), + MemoryWriterCommitMessage(0, Seq(Row(33))) + )) + assert(sink.latestBatchId.contains(19)) + assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33)) + + assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala index e6cdc063c4e9f..4868ba4e68934 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala @@ -22,6 +22,7 @@ import java.io.File import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.util.stringToFile import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.test.SharedSQLContext class OffsetSeqLogSuite extends SparkFunSuite with SharedSQLContext { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceSuite.scala index 03d0f63fa4d7f..ceba27b26e578 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceSuite.scala @@ -21,6 +21,7 @@ import java.util.concurrent.TimeUnit import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.functions._ +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.{StreamingQueryException, StreamTest} import org.apache.spark.util.ManualClock diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala new file mode 100644 index 0000000000000..ef801ceb1310c --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +import java.util.Optional + +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.datasources.DataSource +import org.apache.spark.sql.execution.streaming.continuous._ +import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2Options, MicroBatchReadSupport} +import org.apache.spark.sql.streaming.StreamTest + +class RateSourceV2Suite extends StreamTest { + test("microbatch in registry") { + DataSource.lookupDataSource("rate", spark.sqlContext.conf).newInstance() match { + case ds: MicroBatchReadSupport => + val reader = ds.createMicroBatchReader(Optional.empty(), "", DataSourceV2Options.empty()) + assert(reader.isInstanceOf[RateStreamV2Reader]) + case _ => + throw new IllegalStateException("Could not find v2 read support for rate") + } + } + + test("microbatch - numPartitions propagated") { + val reader = new RateStreamV2Reader( + new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) + reader.setOffsetRange(Optional.empty(), Optional.empty()) + val tasks = reader.createReadTasks() + assert(tasks.size == 11) + } + + test("microbatch - set offset") { + val reader = new RateStreamV2Reader(DataSourceV2Options.empty()) + val startOffset = RateStreamOffset(Map((0, (0, 1000)))) + val endOffset = RateStreamOffset(Map((0, (0, 2000)))) + reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) + assert(reader.getStartOffset() == startOffset) + assert(reader.getEndOffset() == endOffset) + } + + test("microbatch - infer offsets") { + val reader = new RateStreamV2Reader( + new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "100").asJava)) + reader.clock.waitTillTime(reader.clock.getTimeMillis() + 100) + reader.setOffsetRange(Optional.empty(), Optional.empty()) + reader.getStartOffset() match { + case r: RateStreamOffset => + assert(r.partitionToValueAndRunTimeMs(0)._2 == reader.creationTimeMs) + case _ => throw new IllegalStateException("unexpected offset type") + } + reader.getEndOffset() match { + case r: RateStreamOffset => + // End offset may be a bit beyond 100 ms/9 rows after creation if the wait lasted + // longer than 100ms. It should never be early. 
+ assert(r.partitionToValueAndRunTimeMs(0)._1 >= 9) + assert(r.partitionToValueAndRunTimeMs(0)._2 >= reader.creationTimeMs + 100) + + case _ => throw new IllegalStateException("unexpected offset type") + } + } + + test("microbatch - predetermined batch size") { + val reader = new RateStreamV2Reader( + new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "20").asJava)) + val startOffset = RateStreamOffset(Map((0, (0, 1000)))) + val endOffset = RateStreamOffset(Map((0, (20, 2000)))) + reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) + val tasks = reader.createReadTasks() + assert(tasks.size == 1) + assert(tasks.get(0).asInstanceOf[RateStreamBatchTask].vals.size == 20) + } + + test("microbatch - data read") { + val reader = new RateStreamV2Reader( + new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) + val startOffset = RateStreamSourceV2.createInitialOffset(11, reader.creationTimeMs) + val endOffset = RateStreamOffset(startOffset.partitionToValueAndRunTimeMs.toSeq.map { + case (part, (currentVal, currentReadTime)) => + (part, (currentVal + 33, currentReadTime + 1000)) + }.toMap) + + reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) + val tasks = reader.createReadTasks() + assert(tasks.size == 11) + + val readData = tasks.asScala + .map(_.createDataReader()) + .flatMap { reader => + val buf = scala.collection.mutable.ListBuffer[Row]() + while (reader.next()) buf.append(reader.get()) + buf + } + + assert(readData.map(_.getLong(1)).sorted == Range(0, 33)) + } + + test("continuous in registry") { + DataSource.lookupDataSource("rate", spark.sqlContext.conf).newInstance() match { + case ds: ContinuousReadSupport => + val reader = ds.createContinuousReader(Optional.empty(), "", DataSourceV2Options.empty()) + assert(reader.isInstanceOf[ContinuousRateStreamReader]) + case _ => + throw new IllegalStateException("Could not find v2 read support for rate") + } + } + + test("continuous data") { + val reader = new ContinuousRateStreamReader( + new DataSourceV2Options(Map("numPartitions" -> "2", "rowsPerSecond" -> "20").asJava)) + reader.setOffset(Optional.empty()) + val tasks = reader.createReadTasks() + assert(tasks.size == 2) + + val data = scala.collection.mutable.ListBuffer[Row]() + tasks.asScala.foreach { + case t: RateStreamReadTask => + val startTimeMs = reader.getStartOffset() + .asInstanceOf[RateStreamOffset] + .partitionToValueAndRunTimeMs(t.partitionIndex) + ._2 + val r = t.createDataReader().asInstanceOf[RateStreamDataReader] + for (rowIndex <- 0 to 9) { + r.next() + data.append(r.get()) + assert(r.getOffset() == + ContinuousRateStreamPartitionOffset( + t.partitionIndex, + t.partitionIndex + rowIndex * 2, + startTimeMs + (rowIndex + 1) * 100)) + } + assert(System.currentTimeMillis() >= startTimeMs + 1000) + + case _ => throw new IllegalStateException("Unexpected task type") + } + + assert(data.map(_.getLong(1)).toSeq.sorted == Range(0, 20)) + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index b6baaed1927e4..7a2d9e3728208 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.streaming._ import 
org.apache.spark.sql.execution.streaming.FileStreamSource.{FileEntry, SeenFilesMap} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.ExistsThrowsExceptionFileSystem._ import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.SharedSQLContext diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/OffsetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/OffsetSuite.scala index f208f9bd9b6e3..429748261f1ea 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/OffsetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/OffsetSuite.scala @@ -18,7 +18,8 @@ package org.apache.spark.sql.streaming import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset} +import org.apache.spark.sql.execution.streaming.{LongOffset, SerializedOffset} +import org.apache.spark.sql.sources.v2.reader.Offset trait OffsetSuite extends SparkFunSuite { /** Creates test to check all the comparisons of offsets given a `one` that is less than `two`. */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index 3d687d2214e90..8163a1f91e1ca 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreCon import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.StreamSourceProvider +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.types.{IntegerType, StructField, StructType} import org.apache.spark.util.Utils diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index dc5b998ad68b5..7a1ff89f2f3b2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -39,6 +39,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.state.StateStore +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.StreamingQueryListener._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.{Clock, SystemClock, Utils} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 1b4d8556f6ae5..fa0313592b8e7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -34,6 +34,7 @@ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ +import org.apache.spark.sql.sources.v2.reader.Offset import 
org.apache.spark.sql.streaming.OutputMode._ import org.apache.spark.sql.streaming.util.{MockSourceProvider, StreamManualClock} import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index 9ff02dee288fb..fc9ac2a56c4e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.scheduler._ import org.apache.spark.sql.{Encoder, SparkSession} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.StreamingQueryListener._ import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.util.JsonProtocol diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index cc693909270f8..f813b77e3ce6b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.util.{BlockingSource, MockSourceProvider, StreamManualClock} import org.apache.spark.sql.types.StructType import org.apache.spark.util.ManualClock diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala index aa163d2211c38..952908f21ca60 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider} +import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.{ProcessingTime => DeprecatedProcessingTime, _} import org.apache.spark.sql.streaming.Trigger._ import org.apache.spark.sql.types._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala index 19ab2ff13e14e..9a35f097e6e40 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala @@ -20,8 +20,9 @@ package org.apache.spark.sql.streaming.util import java.util.concurrent.CountDownLatch import org.apache.spark.sql.{SQLContext, _} -import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source} +import org.apache.spark.sql.execution.streaming.{LongOffset, Sink, Source} import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider} +import 
org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{IntegerType, StructField, StructType} From c3dd2a26deaadf508b4e163eab2c0544cd922540 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 13 Dec 2017 22:46:20 -0800 Subject: [PATCH 118/356] [SPARK-22779][SQL] Resolve default values for fallback configs. SQLConf allows some callers to define a custom default value for configs, and that complicates a little bit the handling of fallback config entries, since most of the default value resolution is hidden by the config code. This change peaks into the internals of these fallback configs to figure out the correct default value, and also returns the current human-readable default when showing the default value (e.g. through "set -v"). Author: Marcelo Vanzin Closes #19974 from vanzin/SPARK-22779. --- .../spark/internal/config/ConfigEntry.scala | 8 +++-- .../apache/spark/sql/internal/SQLConf.scala | 16 +++++++--- .../spark/sql/internal/SQLConfSuite.scala | 30 +++++++++++++++++++ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala index f1190289244e9..ede3ace4f9aac 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala @@ -139,7 +139,7 @@ private[spark] class OptionalConfigEntry[T]( s => Some(rawValueConverter(s)), v => v.map(rawStringConverter).orNull, doc, isPublic) { - override def defaultValueString: String = "" + override def defaultValueString: String = ConfigEntry.UNDEFINED override def readFrom(reader: ConfigReader): Option[T] = { readString(reader).map(rawValueConverter) @@ -149,12 +149,12 @@ private[spark] class OptionalConfigEntry[T]( /** * A config entry whose default value is defined by another config entry. */ -private class FallbackConfigEntry[T] ( +private[spark] class FallbackConfigEntry[T] ( key: String, alternatives: List[String], doc: String, isPublic: Boolean, - private[config] val fallback: ConfigEntry[T]) + val fallback: ConfigEntry[T]) extends ConfigEntry[T](key, alternatives, fallback.valueConverter, fallback.stringConverter, doc, isPublic) { @@ -167,6 +167,8 @@ private class FallbackConfigEntry[T] ( private[spark] object ConfigEntry { + val UNDEFINED = "" + private val knownConfigs = new java.util.concurrent.ConcurrentHashMap[String, ConfigEntry[_]]() def registerEntry(entry: ConfigEntry[_]): Unit = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 1121444cc938a..cf7e3ebce7411 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1379,7 +1379,7 @@ class SQLConf extends Serializable with Logging { Option(settings.get(key)). orElse { // Try to use the default value - Option(sqlConfEntries.get(key)).map(_.defaultValueString) + Option(sqlConfEntries.get(key)).map { e => e.stringConverter(e.readFrom(reader)) } }. getOrElse(throw new NoSuchElementException(key)) } @@ -1417,14 +1417,21 @@ class SQLConf extends Serializable with Logging { * not set yet, return `defaultValue`. 
*/ def getConfString(key: String, defaultValue: String): String = { - if (defaultValue != null && defaultValue != "") { + if (defaultValue != null && defaultValue != ConfigEntry.UNDEFINED) { val entry = sqlConfEntries.get(key) if (entry != null) { // Only verify configs in the SQLConf object entry.valueConverter(defaultValue) } } - Option(settings.get(key)).getOrElse(defaultValue) + Option(settings.get(key)).getOrElse { + // If the key is not set, need to check whether the config entry is registered and is + // a fallback conf, so that we can check its parent. + sqlConfEntries.get(key) match { + case e: FallbackConfigEntry[_] => getConfString(e.fallback.key, defaultValue) + case _ => defaultValue + } + } } /** @@ -1440,7 +1447,8 @@ class SQLConf extends Serializable with Logging { */ def getAllDefinedConfs: Seq[(String, String, String)] = sqlConfEntries.synchronized { sqlConfEntries.values.asScala.filter(_.isPublic).map { entry => - (entry.key, getConfString(entry.key, entry.defaultValueString), entry.doc) + val displayValue = Option(getConfString(entry.key, null)).getOrElse(entry.defaultValueString) + (entry.key, displayValue, entry.doc) }.toSeq } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 8b1521bacea49..c9a6975da6be8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -280,4 +280,34 @@ class SQLConfSuite extends QueryTest with SharedSQLContext { spark.sessionState.conf.clear() } + + test("SPARK-22779: correctly compute default value for fallback configs") { + val fallback = SQLConf.buildConf("spark.sql.__test__.spark_22779") + .fallbackConf(SQLConf.PARQUET_COMPRESSION) + + assert(spark.sessionState.conf.getConfString(fallback.key) === + SQLConf.PARQUET_COMPRESSION.defaultValue.get) + assert(spark.sessionState.conf.getConfString(fallback.key, "lzo") === "lzo") + + val displayValue = spark.sessionState.conf.getAllDefinedConfs + .find { case (key, _, _) => key == fallback.key } + .map { case (_, v, _) => v } + .get + assert(displayValue === fallback.defaultValueString) + + spark.sessionState.conf.setConf(SQLConf.PARQUET_COMPRESSION, "gzip") + assert(spark.sessionState.conf.getConfString(fallback.key) === "gzip") + + spark.sessionState.conf.setConf(fallback, "lzo") + assert(spark.sessionState.conf.getConfString(fallback.key) === "lzo") + + val newDisplayValue = spark.sessionState.conf.getAllDefinedConfs + .find { case (key, _, _) => key == fallback.key } + .map { case (_, v, _) => v } + .get + assert(newDisplayValue === "lzo") + + SQLConf.unregister(fallback) + } + } From 7d8e2ca7f8667d809034073aad2ea67b3b082fc2 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 14 Dec 2017 19:33:54 +0800 Subject: [PATCH 119/356] [SPARK-22775][SQL] move dictionary related APIs from ColumnVector to WritableColumnVector ## What changes were proposed in this pull request? These dictionary related APIs are special to `WritableColumnVector` and should not be in `ColumnVector`, which will be public soon. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #19970 from cloud-fan/final. 
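For readers skimming the diff below, here is a minimal caller-side sketch (not part of the patch) of what the relocation means in practice. The helper name `roundTripDictId`, its parameters, the literal id `42`, and the use of the standard `putInt` setter are illustrative assumptions; `WritableColumnVector`, `reserveDictionaryIds` and `getDictId` are the APIs actually moved in this change.

```
import org.apache.spark.sql.execution.vectorized.WritableColumnVector

// Sketch only: after this change the dictionary id buffer is itself typed as a
// WritableColumnVector, so raw ids can be written and read back without the
// cast the old ColumnVector-level API required.
def roundTripDictId(col: WritableColumnVector, numRows: Int): Int = {
  val ids: WritableColumnVector = col.reserveDictionaryIds(numRows) // moved here from ColumnVector
  ids.putInt(0, 42)   // a reader such as VectorizedColumnReader writes raw dictionary ids
  ids.getDictId(0)    // getDictId likewise now lives only on WritableColumnVector
}
```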
--- .../parquet/VectorizedColumnReader.java | 2 +- .../vectorized/ArrowColumnVector.java | 5 -- .../execution/vectorized/ColumnVector.java | 66 ++++---------- .../vectorized/WritableColumnVector.java | 88 ++++++++++++------- 4 files changed, 73 insertions(+), 88 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index b7646969bcf3d..3ba180860c325 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -239,7 +239,7 @@ private void decodeDictionaryIds( int rowId, int num, WritableColumnVector column, - ColumnVector dictionaryIds) { + WritableColumnVector dictionaryIds) { switch (descriptor.getType()) { case INT32: if (column.dataType() == DataTypes.IntegerType || diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java index 1f1347ccd315e..e99201f6372fe 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java @@ -159,11 +159,6 @@ public int[] getInts(int rowId, int count) { return array; } - @Override - public int getDictId(int rowId) { - throw new UnsupportedOperationException(); - } - // // APIs dealing with Longs // diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java index e6b87519239dd..fd5caf3bf170b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java @@ -22,24 +22,22 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * This class represents a column of values and provides the main APIs to access the data - * values. It supports all the types and contains get APIs as well as their batched versions. - * The batched versions are preferable whenever possible. + * This class represents in-memory values of a column and provides the main APIs to access the data. + * It supports all the types and contains get APIs as well as their batched versions. The batched + * versions are considered to be faster and preferable whenever possible. * * To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these - * columns have child columns. All of the data is stored in the child columns and the parent column - * contains nullability, and in the case of Arrays, the lengths and offsets into the child column. - * Lengths and offsets are encoded identically to INTs. + * columns have child columns. All of the data are stored in the child columns and the parent column + * only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child + * column and are encoded identically to INTs. + * * Maps are just a special case of a two field struct. * * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values - * in the current RowBatch. - * - * A ColumnVector should be considered immutable once originally created. 
- * - * ColumnVectors are intended to be reused. + * in the current batch. */ public abstract class ColumnVector implements AutoCloseable { + /** * Returns the data type of this column. */ @@ -47,7 +45,6 @@ public abstract class ColumnVector implements AutoCloseable { /** * Cleans up memory for this column. The column is not usable after this. - * TODO: this should probably have ref-counted semantics. */ public abstract void close(); @@ -107,13 +104,6 @@ public abstract class ColumnVector implements AutoCloseable { */ public abstract int[] getInts(int rowId, int count); - /** - * Returns the dictionary Id for rowId. - * This should only be called when the ColumnVector is dictionaryIds. - * We have this separate method for dictionaryIds as per SPARK-16928. - */ - public abstract int getDictId(int rowId); - /** * Returns the value for rowId. */ @@ -145,39 +135,39 @@ public abstract class ColumnVector implements AutoCloseable { public abstract double[] getDoubles(int rowId, int count); /** - * Returns the length of the array at rowid. + * Returns the length of the array for rowId. */ public abstract int getArrayLength(int rowId); /** - * Returns the offset of the array at rowid. + * Returns the offset of the array for rowId. */ public abstract int getArrayOffset(int rowId); /** - * Returns a utility object to get structs. + * Returns the struct for rowId. */ public final ColumnarRow getStruct(int rowId) { return new ColumnarRow(this, rowId); } /** - * Returns a utility object to get structs. - * provided to keep API compatibility with InternalRow for code generation + * A special version of {@link #getStruct(int)}, which is only used as an adapter for Spark + * codegen framework, the second parameter is totally ignored. */ public final ColumnarRow getStruct(int rowId, int size) { return getStruct(rowId); } /** - * Returns the array at rowid. + * Returns the array for rowId. */ public final ColumnarArray getArray(int rowId) { return new ColumnarArray(arrayData(), getArrayOffset(rowId), getArrayLength(rowId)); } /** - * Returns the value for rowId. + * Returns the map for rowId. */ public MapData getMap(int ordinal) { throw new UnsupportedOperationException(); @@ -214,30 +204,6 @@ public MapData getMap(int ordinal) { */ protected DataType type; - /** - * The Dictionary for this column. - * - * If it's not null, will be used to decode the value in getXXX(). - */ - protected Dictionary dictionary; - - /** - * Reusable column for ids of dictionary. - */ - protected ColumnVector dictionaryIds; - - /** - * Returns true if this column has a dictionary. - */ - public boolean hasDictionary() { return this.dictionary != null; } - - /** - * Returns the underlying integer column for ids of dictionary. - */ - public ColumnVector getDictionaryIds() { - return dictionaryIds; - } - /** * Sets up the common state and also handles creating the child columns if this is a nested * type. diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 7c053b579442c..63cf60818a855 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -36,8 +36,10 @@ * elements. This means that the put() APIs do not check as in common cases (i.e. flat schemas), * the lengths are known up front. 
* - * A ColumnVector should be considered immutable once originally created. In other words, it is not - * valid to call put APIs after reads until reset() is called. + * A WritableColumnVector should be considered immutable once originally created. In other words, + * it is not valid to call put APIs after reads until reset() is called. + * + * WritableColumnVector are intended to be reused. */ public abstract class WritableColumnVector extends ColumnVector { @@ -105,6 +107,58 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { @Override public boolean anyNullsSet() { return anyNullsSet; } + /** + * Returns the dictionary Id for rowId. + * + * This should only be called when this `WritableColumnVector` represents dictionaryIds. + * We have this separate method for dictionaryIds as per SPARK-16928. + */ + public abstract int getDictId(int rowId); + + /** + * The Dictionary for this column. + * + * If it's not null, will be used to decode the value in getXXX(). + */ + protected Dictionary dictionary; + + /** + * Reusable column for ids of dictionary. + */ + protected WritableColumnVector dictionaryIds; + + /** + * Returns true if this column has a dictionary. + */ + public boolean hasDictionary() { return this.dictionary != null; } + + /** + * Returns the underlying integer column for ids of dictionary. + */ + public WritableColumnVector getDictionaryIds() { + return dictionaryIds; + } + + /** + * Update the dictionary. + */ + public void setDictionary(Dictionary dictionary) { + this.dictionary = dictionary; + } + + /** + * Reserve a integer column for ids of dictionary. + */ + public WritableColumnVector reserveDictionaryIds(int capacity) { + if (dictionaryIds == null) { + dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType); + } else { + dictionaryIds.reset(); + dictionaryIds.reserve(capacity); + } + return dictionaryIds; + } + /** * Ensures that there is enough storage to store capacity elements. That is, the put() APIs * must work for all rowIds < capacity. @@ -613,36 +667,6 @@ public final int appendStruct(boolean isNull) { */ protected WritableColumnVector[] childColumns; - /** - * Update the dictionary. - */ - public void setDictionary(Dictionary dictionary) { - this.dictionary = dictionary; - } - - /** - * Reserve a integer column for ids of dictionary. - */ - public WritableColumnVector reserveDictionaryIds(int capacity) { - WritableColumnVector dictionaryIds = (WritableColumnVector) this.dictionaryIds; - if (dictionaryIds == null) { - dictionaryIds = reserveNewColumn(capacity, DataTypes.IntegerType); - this.dictionaryIds = dictionaryIds; - } else { - dictionaryIds.reset(); - dictionaryIds.reserve(capacity); - } - return dictionaryIds; - } - - /** - * Returns the underlying integer column for ids of dictionary. - */ - @Override - public WritableColumnVector getDictionaryIds() { - return (WritableColumnVector) dictionaryIds; - } - /** * Reserve a new column. */ From d0957954398658b025fdac3b9a07b477e79520b9 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 15 Dec 2017 00:29:44 +0800 Subject: [PATCH 120/356] [SPARK-22785][SQL] remove ColumnVector.anyNullsSet ## What changes were proposed in this pull request? `ColumnVector.anyNullsSet` is not called anywhere except tests, and we can easily replace it with `ColumnVector.numNulls > 0` ## How was this patch tested? existing tests Author: Wenchen Fan Closes #19980 from cloud-fan/minor. 
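The substitution described above is mechanical. A hypothetical caller (not taken from this patch; only the helper name and parameter are invented) changes like this:

```
import org.apache.spark.sql.execution.vectorized.ColumnVector

// Hypothetical helper showing the suggested replacement.
def mayContainNulls(vector: ColumnVector): Boolean =
  vector.numNulls() > 0   // previously: vector.anyNullsSet()
```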
--- .../execution/vectorized/ArrowColumnVector.java | 5 ----- .../sql/execution/vectorized/ColumnVector.java | 6 ------ .../execution/vectorized/OffHeapColumnVector.java | 4 +--- .../execution/vectorized/OnHeapColumnVector.java | 4 +--- .../execution/vectorized/WritableColumnVector.java | 14 ++------------ .../vectorized/ArrowColumnVectorSuite.scala | 11 ----------- .../execution/vectorized/ColumnarBatchSuite.scala | 9 --------- 7 files changed, 4 insertions(+), 49 deletions(-) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java index e99201f6372fe..d53e1fcab0c5a 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java @@ -54,11 +54,6 @@ public int numNulls() { return accessor.getNullCount(); } - @Override - public boolean anyNullsSet() { - return numNulls() > 0; - } - @Override public void close() { if (childColumns != null) { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java index fd5caf3bf170b..dc7c1269bedd9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java @@ -53,12 +53,6 @@ public abstract class ColumnVector implements AutoCloseable { */ public abstract int numNulls(); - /** - * Returns true if any of the nulls indicator are set for this column. This can be used - * as an optimization to prevent setting nulls. - */ - public abstract boolean anyNullsSet(); - /** * Returns whether the value at rowId is NULL. 
*/ diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java index 5f1b9885334b7..1c45b846790b6 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OffHeapColumnVector.java @@ -110,7 +110,6 @@ public void putNotNull(int rowId) { public void putNull(int rowId) { Platform.putByte(null, nulls + rowId, (byte) 1); ++numNulls; - anyNullsSet = true; } @Override @@ -119,13 +118,12 @@ public void putNulls(int rowId, int count) { for (int i = 0; i < count; ++i, ++offset) { Platform.putByte(null, offset, (byte) 1); } - anyNullsSet = true; numNulls += count; } @Override public void putNotNulls(int rowId, int count) { - if (!anyNullsSet) return; + if (numNulls == 0) return; long offset = nulls + rowId; for (int i = 0; i < count; ++i, ++offset) { Platform.putByte(null, offset, (byte) 0); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java index f12772ede575d..1d538fe4181b7 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/OnHeapColumnVector.java @@ -107,7 +107,6 @@ public void putNotNull(int rowId) { public void putNull(int rowId) { nulls[rowId] = (byte)1; ++numNulls; - anyNullsSet = true; } @Override @@ -115,13 +114,12 @@ public void putNulls(int rowId, int count) { for (int i = 0; i < count; ++i) { nulls[rowId + i] = (byte)1; } - anyNullsSet = true; numNulls += count; } @Override public void putNotNulls(int rowId, int count) { - if (!anyNullsSet) return; + if (numNulls == 0) return; for (int i = 0; i < count; ++i) { nulls[rowId + i] = (byte)0; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 63cf60818a855..5f6f125976e12 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -54,12 +54,11 @@ public void reset() { ((WritableColumnVector) c).reset(); } } - numNulls = 0; elementsAppended = 0; - if (anyNullsSet) { + if (numNulls > 0) { putNotNulls(0, capacity); - anyNullsSet = false; } + numNulls = 0; } @Override @@ -104,9 +103,6 @@ private void throwUnsupportedException(int requiredCapacity, Throwable cause) { @Override public int numNulls() { return numNulls; } - @Override - public boolean anyNullsSet() { return anyNullsSet; } - /** * Returns the dictionary Id for rowId. * @@ -640,12 +636,6 @@ public final int appendStruct(boolean isNull) { */ protected int numNulls; - /** - * True if there is at least one NULL byte set. This is an optimization for the writer, to skip - * having to clear NULL bits. - */ - protected boolean anyNullsSet; - /** * True if this column's values are fixed. This means the column values never change, even * across resets. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index 068a17bf772e1..e460d0721e7bf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -42,7 +42,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === BooleanType) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -71,7 +70,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === ByteType) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -100,7 +98,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === ShortType) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -129,7 +126,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === IntegerType) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -158,7 +154,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === LongType) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -187,7 +182,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === FloatType) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -216,7 +210,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === DoubleType) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -246,7 +239,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === StringType) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -274,7 +266,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === BinaryType) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) (0 until 10).foreach { i => @@ -319,7 +310,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === ArrayType(IntegerType)) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) val array0 = columnVector.getArray(0) @@ -383,7 +373,6 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === schema) - assert(columnVector.anyNullsSet) assert(columnVector.numNulls === 1) val row0 = columnVector.getStruct(0, 2) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index c9c6bee513b53..d3ed8276b8f10 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -65,27 +65,22 @@ class ColumnarBatchSuite extends SparkFunSuite { column => val reference = mutable.ArrayBuffer.empty[Boolean] var idx = 0 - assert(!column.anyNullsSet()) assert(column.numNulls() == 0) column.appendNotNull() reference += false - assert(!column.anyNullsSet()) assert(column.numNulls() == 0) column.appendNotNulls(3) (1 to 3).foreach(_ => reference += false) - assert(!column.anyNullsSet()) assert(column.numNulls() == 0) column.appendNull() reference += true - assert(column.anyNullsSet()) assert(column.numNulls() == 1) column.appendNulls(3) (1 to 3).foreach(_ => reference += true) - assert(column.anyNullsSet()) assert(column.numNulls() == 4) idx = column.elementsAppended @@ -93,13 +88,11 @@ class ColumnarBatchSuite extends SparkFunSuite { column.putNotNull(idx) reference += false idx += 1 - assert(column.anyNullsSet()) assert(column.numNulls() == 4) column.putNull(idx) reference += true idx += 1 - assert(column.anyNullsSet()) assert(column.numNulls() == 5) column.putNulls(idx, 3) @@ -107,7 +100,6 @@ class ColumnarBatchSuite extends SparkFunSuite { reference += true reference += true idx += 3 - assert(column.anyNullsSet()) assert(column.numNulls() == 8) column.putNotNulls(idx, 4) @@ -116,7 +108,6 @@ class ColumnarBatchSuite extends SparkFunSuite { reference += false reference += false idx += 4 - assert(column.anyNullsSet()) assert(column.numNulls() == 8) reference.zipWithIndex.foreach { v => From 606ae491e41017c117301aeccfdf7221adec7f23 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Fri, 15 Dec 2017 02:14:08 +0800 Subject: [PATCH 121/356] [SPARK-22774][SQL][TEST] Add compilation check into TPCDSQuerySuite ## What changes were proposed in this pull request? This PR adds check whether Java code generated by Catalyst can be compiled by `janino` correctly or not into `TPCDSQuerySuite`. Before this PR, this suite only checks whether analysis can be performed correctly or not. This check will be able to avoid unexpected performance degrade by interpreter execution due to a Java compilation error. ## How was this patch tested? Existing a test case, but updated it. Author: Kazuaki Ishizaki Closes #19971 from kiszk/SPARK-22774. 
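Condensed to its essentials, the added check walks the executed plan, picks out each `WholeStageCodegenExec` subtree, and asks janino to compile its generated source. The sketch below is a simplification using only names that appear in the diff that follows; the real `checkGeneratedCode` additionally deduplicates subtrees and attaches the formatted source to any compile error. The helper name `assertCompilable` is an assumption.

```
import org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator
import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec}

// Simplified sketch of the new compilation check.
def assertCompilable(plan: SparkPlan): Unit = plan.foreach {
  case s: WholeStageCodegenExec =>
    val code = s.doCodeGen()._2   // generated Java source for this codegen subtree
    CodeGenerator.compile(code)   // throws if janino cannot compile it
  case _ => // operators outside whole-stage codegen carry no generated class
}
```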
--- .../apache/spark/sql/TPCDSQuerySuite.scala | 39 +++++++++++++++++-- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala index a58000da1543d..dd427a5c52edf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala @@ -19,8 +19,10 @@ package org.apache.spark.sql import org.scalatest.BeforeAndAfterAll +import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodeGenerator} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.util.resourceToString +import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils @@ -348,13 +350,41 @@ class TPCDSQuerySuite extends QueryTest with SharedSQLContext with BeforeAndAfte "q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90", "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99") + private def checkGeneratedCode(plan: SparkPlan): Unit = { + val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() + plan foreach { + case s: WholeStageCodegenExec => + codegenSubtrees += s + case s => s + } + codegenSubtrees.toSeq.foreach { subtree => + val code = subtree.doCodeGen()._2 + try { + // Just check the generated code can be properly compiled + CodeGenerator.compile(code) + } catch { + case e: Exception => + val msg = + s""" + |failed to compile: + |Subtree: + |$subtree + |Generated code: + |${CodeFormatter.format(code)} + """.stripMargin + throw new Exception(msg, e) + } + } + } + tpcdsQueries.foreach { name => val queryString = resourceToString(s"tpcds/$name.sql", classLoader = Thread.currentThread().getContextClassLoader) test(name) { withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { - // Just check the plans can be properly generated - sql(queryString).queryExecution.executedPlan + // check the plans can be properly generated + val plan = sql(queryString).queryExecution.executedPlan + checkGeneratedCode(plan) } } } @@ -368,8 +398,9 @@ class TPCDSQuerySuite extends QueryTest with SharedSQLContext with BeforeAndAfte val queryString = resourceToString(s"tpcds-modifiedQueries/$name.sql", classLoader = Thread.currentThread().getContextClassLoader) test(s"modified-$name") { - // Just check the plans can be properly generated - sql(queryString).queryExecution.executedPlan + // check the plans can be properly generated + val plan = sql(queryString).queryExecution.executedPlan + checkGeneratedCode(plan) } } } From 40de176c93c5aa05bcbb1328721118b6b46ba51d Mon Sep 17 00:00:00 2001 From: Prashant Sharma Date: Thu, 14 Dec 2017 11:19:34 -0800 Subject: [PATCH 122/356] [SPARK-16496][SQL] Add wholetext as option for reading text in SQL. ## What changes were proposed in this pull request? In multiple text analysis problems, it is not often desirable for the rows to be split by "\n". There exists a wholeText reader for RDD API, and this JIRA just adds the same support for Dataset API. ## How was this patch tested? Added relevant new tests for both scala and Java APIs Author: Prashant Sharma Author: Prashant Sharma Closes #14151 from ScrapCodes/SPARK-16496/wholetext. 
--- python/pyspark/sql/readwriter.py | 7 +- .../apache/spark/sql/DataFrameReader.scala | 16 ++- .../HadoopFileWholeTextReader.scala | 57 +++++++++ .../datasources/text/TextFileFormat.scala | 31 ++++- .../datasources/text/TextOptions.scala | 7 ++ .../datasources/text/TextSuite.scala | 5 +- .../datasources/text/WholeTextFileSuite.scala | 108 ++++++++++++++++++ 7 files changed, 221 insertions(+), 10 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFileWholeTextReader.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 1ad974e9aa4c7..4e58bfb843644 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -304,7 +304,7 @@ def parquet(self, *paths): @ignore_unicode_prefix @since(1.6) - def text(self, paths): + def text(self, paths, wholetext=False): """ Loads text files and returns a :class:`DataFrame` whose schema starts with a string column named "value", and followed by partitioned columns if there @@ -313,11 +313,16 @@ def text(self, paths): Each line in the text file is a new row in the resulting DataFrame. :param paths: string, or list of strings, for input path(s). + :param wholetext: if true, read each file from input path(s) as a single row. >>> df = spark.read.text('python/test_support/sql/text-test.txt') >>> df.collect() [Row(value=u'hello'), Row(value=u'this')] + >>> df = spark.read.text('python/test_support/sql/text-test.txt', wholetext=True) + >>> df.collect() + [Row(value=u'hello\\nthis')] """ + self._set_opts(wholetext=wholetext) if isinstance(paths, basestring): paths = [paths] return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(paths))) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index ea1cf66775235..39fec8f983b65 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -646,7 +646,14 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * Loads text files and returns a `DataFrame` whose schema starts with a string column named * "value", and followed by partitioned columns if there are any. * - * Each line in the text files is a new row in the resulting DataFrame. For example: + * You can set the following text-specific option(s) for reading text files: + *
+ * <ul>
+ * <li>`wholetext` ( default `false`): If true, read a file as a single row and not split by "\n".
+ * </li>
+ * </ul>
+ *
+ * By default, each line in the text files is a new row in the resulting DataFrame. + * + * Usage example: * {{{ * // Scala: * spark.read.text("/path/to/spark/README.md") @@ -678,7 +685,12 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * If the directory structure of the text files contains partitioning information, those are * ignored in the resulting Dataset. To include partitioning information as columns, use `text`. * - * Each line in the text files is a new element in the resulting Dataset. For example: + * You can set the following textFile-specific option(s) for reading text files: + *
+ * <ul>
+ * <li>`wholetext` ( default `false`): If true, read a file as a single row and not split by "\n".
+ * </li>
+ * </ul>
+ *
+ * By default, each line in the text files is a new row in the resulting DataFrame. For example: * {{{ * // Scala: * spark.read.textFile("/path/to/spark/README.md") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFileWholeTextReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFileWholeTextReader.scala new file mode 100644 index 0000000000000..c61a89e6e8c3f --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFileWholeTextReader.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources + +import java.io.Closeable +import java.net.URI + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.io.Text +import org.apache.hadoop.mapreduce._ +import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl + +import org.apache.spark.input.WholeTextFileRecordReader + +/** + * An adaptor from a [[PartitionedFile]] to an [[Iterator]] of [[Text]], which is all of the lines + * in that file. 
+ */ +class HadoopFileWholeTextReader(file: PartitionedFile, conf: Configuration) + extends Iterator[Text] with Closeable { + private val iterator = { + val fileSplit = new CombineFileSplit( + Array(new Path(new URI(file.filePath))), + Array(file.start), + Array(file.length), + // TODO: Implement Locality + Array.empty[String]) + val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) + val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) + val reader = new WholeTextFileRecordReader(fileSplit, hadoopAttemptContext, 0) + reader.initialize(fileSplit, hadoopAttemptContext) + new RecordReaderIterator(reader) + } + + override def hasNext: Boolean = iterator.hasNext + + override def next(): Text = iterator.next() + + override def close(): Unit = iterator.close() +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala index d0690445d7672..8a6ab303fc0f0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala @@ -17,12 +17,16 @@ package org.apache.spark.sql.execution.datasources.text +import java.io.Closeable + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.TaskContext -import org.apache.spark.sql.{AnalysisException, Row, SparkSession} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} @@ -53,6 +57,14 @@ class TextFileFormat extends TextBasedFileFormat with DataSourceRegister { } } + override def isSplitable( + sparkSession: SparkSession, + options: Map[String, String], + path: Path): Boolean = { + val textOptions = new TextOptions(options) + super.isSplitable(sparkSession, options, path) && !textOptions.wholeText + } + override def inferSchema( sparkSession: SparkSession, options: Map[String, String], @@ -97,14 +109,25 @@ class TextFileFormat extends TextBasedFileFormat with DataSourceRegister { assert( requiredSchema.length <= 1, "Text data source only produces a single data column named \"value\".") - + val textOptions = new TextOptions(options) val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + readToUnsafeMem(broadcastedHadoopConf, requiredSchema, textOptions.wholeText) + } + + private def readToUnsafeMem(conf: Broadcast[SerializableConfiguration], + requiredSchema: StructType, wholeTextMode: Boolean): + (PartitionedFile) => Iterator[UnsafeRow] = { + (file: PartitionedFile) => { - val reader = new HadoopFileLinesReader(file, broadcastedHadoopConf.value.value) + val confValue = conf.value.value + val reader = if (!wholeTextMode) { + new HadoopFileLinesReader(file, confValue) + } else { + new HadoopFileWholeTextReader(file, confValue) + } Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => reader.close())) - if (requiredSchema.isEmpty) { val emptyUnsafeRow = new UnsafeRow(0) reader.map(_ => emptyUnsafeRow) diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOptions.scala index 49bd7382f9cf3..2a661561ab51e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOptions.scala @@ -33,8 +33,15 @@ private[text] class TextOptions(@transient private val parameters: CaseInsensiti * Compression codec to use. */ val compressionCodec = parameters.get(COMPRESSION).map(CompressionCodecs.getCodecClassName) + + /** + * wholetext - If true, read a file as a single row and not split by "\n". + */ + val wholeText = parameters.getOrElse(WHOLETEXT, "false").toBoolean + } private[text] object TextOptions { val COMPRESSION = "compression" + val WHOLETEXT = "wholetext" } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala index cb7393cdd2b9d..33287044f279e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala @@ -185,10 +185,9 @@ class TextSuite extends QueryTest with SharedSQLContext { val data = df.collect() assert(data(0) == Row("This is a test file for the text data source")) assert(data(1) == Row("1+1")) - // non ascii characters are not allowed in the code, so we disable the scalastyle here. - // scalastyle:off + // scalastyle:off nonascii assert(data(2) == Row("数据砖头")) - // scalastyle:on + // scalastyle:on nonascii assert(data(3) == Row("\"doh\"")) assert(data.length == 4) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala new file mode 100644 index 0000000000000..8bd736bee69de --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/WholeTextFileSuite.scala @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.text + +import java.io.File + +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.{StringType, StructType} + +class WholeTextFileSuite extends QueryTest with SharedSQLContext { + + // Hadoop's FileSystem caching does not use the Configuration as part of its cache key, which + // can cause Filesystem.get(Configuration) to return a cached instance created with a different + // configuration than the one passed to get() (see HADOOP-8490 for more details). This caused + // hard-to-reproduce test failures, since any suites that were run after this one would inherit + // the new value of "fs.local.block.size" (see SPARK-5227 and SPARK-5679). To work around this, + // we disable FileSystem caching in this suite. + protected override def sparkConf = + super.sparkConf.set("spark.hadoop.fs.file.impl.disable.cache", "true") + + private def testFile: String = { + Thread.currentThread().getContextClassLoader.getResource("test-data/text-suite.txt").toString + } + + test("reading text file with option wholetext=true") { + val df = spark.read.option("wholetext", "true") + .format("text").load(testFile) + // schema + assert(df.schema == new StructType().add("value", StringType)) + + // verify content + val data = df.collect() + assert(data(0) == + Row( + // scalastyle:off nonascii + """This is a test file for the text data source + |1+1 + |数据砖头 + |"doh" + |""".stripMargin)) + // scalastyle:on nonascii + assert(data.length == 1) + } + + test("correctness of wholetext option") { + import org.apache.spark.sql.catalyst.util._ + withTempDir { dir => + val file1 = new File(dir, "text1.txt") + stringToFile(file1, + """text file 1 contents. + |From: None to: ?? + """.stripMargin) + val file2 = new File(dir, "text2.txt") + stringToFile(file2, "text file 2 contents.") + val file3 = new File(dir, "text3.txt") + stringToFile(file3, "text file 3 contents.") + val df = spark.read.option("wholetext", "true").text(dir.getAbsolutePath) + // Since wholetext option reads each file into a single row, df.length should be no. of files. + val data = df.sort("value").collect() + assert(data.length == 3) + // Each files should represent a single Row/element in Dataframe/Dataset + assert(data(0) == Row( + """text file 1 contents. + |From: None to: ?? + """.stripMargin)) + assert(data(1) == Row( + """text file 2 contents.""".stripMargin)) + assert(data(2) == Row( + """text file 3 contents.""".stripMargin)) + } + } + + + test("Correctness of wholetext option with gzip compression mode.") { + withTempDir { dir => + val path = dir.getCanonicalPath + val df1 = spark.range(0, 1000).selectExpr("CAST(id AS STRING) AS s").repartition(1) + df1.write.option("compression", "gzip").mode("overwrite").text(path) + // On reading through wholetext mode, one file will be read as a single row, i.e. not + // delimited by "next line" character. 
+ val expected = Row(Range(0, 1000).mkString("", "\n", "\n")) + Seq(10, 100, 1000).foreach { bytes => + withSQLConf(SQLConf.FILES_MAX_PARTITION_BYTES.key -> bytes.toString) { + val df2 = spark.read.option("wholetext", "true").format("text").load(path) + val result = df2.collect().head + assert(result === expected) + } + } + } + } +} From 6d99940397136e4ed0764f83f442bcffcb20d6e7 Mon Sep 17 00:00:00 2001 From: kellyzly Date: Thu, 14 Dec 2017 13:39:19 -0600 Subject: [PATCH 123/356] [SPARK-22660][BUILD] Use position() and limit() to fix ambiguity issue in scala-2.12 ## What changes were proposed in this pull request? Missing some changes about limit in TaskSetManager.scala ## How was this patch tested? running tests Please review http://spark.apache.org/contributing.html before opening a pull request. Author: kellyzly Closes #19976 from kellyzly/SPARK-22660.2. --- .../scala/org/apache/spark/scheduler/TaskSetManager.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index de4711f461df2..c3ed11bfe352a 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -488,11 +488,11 @@ private[spark] class TaskSetManager( abort(s"$msg Exception during serialization: $e") throw new TaskNotSerializableException(e) } - if (serializedTask.limit > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 && + if (serializedTask.limit() > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024 && !emittedTaskSizeWarning) { emittedTaskSizeWarning = true logWarning(s"Stage ${task.stageId} contains a task of very large size " + - s"(${serializedTask.limit / 1024} KB). The maximum recommended task size is " + + s"(${serializedTask.limit() / 1024} KB). The maximum recommended task size is " + s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.") } addRunningTask(taskId) @@ -502,7 +502,7 @@ private[spark] class TaskSetManager( // val timeTaken = clock.getTime() - startTime val taskName = s"task ${info.id} in stage ${taskSet.id}" logInfo(s"Starting $taskName (TID $taskId, $host, executor ${info.executorId}, " + - s"partition ${task.partitionId}, $taskLocality, ${serializedTask.limit} bytes)") + s"partition ${task.partitionId}, $taskLocality, ${serializedTask.limit()} bytes)") sched.dagScheduler.taskStarted(task, info) new TaskDescription( From 2fe16333d59cd8558afca3916821e1ea7e98d1bc Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Thu, 14 Dec 2017 14:03:08 -0800 Subject: [PATCH 124/356] [SPARK-22778][KUBERNETES] Added the missing service metadata for KubernetesClusterManager ## What changes were proposed in this pull request? This PR added the missing service metadata for `KubernetesClusterManager`. Without the metadata, the service loader couldn't load `KubernetesClusterManager`, and caused the driver to fail to create a `ExternalClusterManager`, as being reported in SPARK-22778. The PR also changed the `k8s:` prefix used to `k8s://`, which is what existing Spark on k8s users are familiar and used to. ## How was this patch tested? Manual testing verified that the fix resolved the issue in SPARK-22778. /cc vanzin felixcheung jiangxb1987 Author: Yinan Li Closes #19972 from liyinan926/fix-22778. 
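The prefix change is easiest to see through the resolution helper that the updated `UtilsSuite` exercises. A sketch of the expected behaviour after the fix (`Utils` is a `private[spark]` helper, so this only compiles inside a Spark package; the values mirror the test assertions in the diff below):

```scala
import org.apache.spark.util.Utils

// A fully specified URL keeps its scheme; the resolved value now carries the "k8s://" prefix.
assert(Utils.checkAndGetK8sMasterUrl("k8s://https://host:port") == "k8s://https://host:port")

// Without a scheme, https is assumed and the "k8s://" prefix is still prepended.
assert(Utils.checkAndGetK8sMasterUrl("k8s://127.0.0.1:8443") == "k8s://https://127.0.0.1:8443")
```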
--- core/src/main/scala/org/apache/spark/util/Utils.scala | 6 +++--- .../scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 2 +- .../src/test/scala/org/apache/spark/util/UtilsSuite.scala | 8 ++++---- .../org.apache.spark.scheduler.ExternalClusterManager | 1 + .../deploy/k8s/submit/KubernetesClientApplication.scala | 4 ++-- 5 files changed, 11 insertions(+), 10 deletions(-) create mode 100644 resource-managers/kubernetes/core/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index fe5b4ea24440b..8871870eb8681 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2757,7 +2757,7 @@ private[spark] object Utils extends Logging { /** * Check the validity of the given Kubernetes master URL and return the resolved URL. Prefix - * "k8s:" is appended to the resolved URL as the prefix is used by KubernetesClusterManager + * "k8s://" is appended to the resolved URL as the prefix is used by KubernetesClusterManager * in canCreate to determine if the KubernetesClusterManager should be used. */ def checkAndGetK8sMasterUrl(rawMasterURL: String): String = { @@ -2770,7 +2770,7 @@ private[spark] object Utils extends Logging { val resolvedURL = s"https://$masterWithoutK8sPrefix" logInfo("No scheme specified for kubernetes master URL, so defaulting to https. Resolved " + s"URL is $resolvedURL.") - return s"k8s:$resolvedURL" + return s"k8s://$resolvedURL" } val masterScheme = new URI(masterWithoutK8sPrefix).getScheme @@ -2789,7 +2789,7 @@ private[spark] object Utils extends Logging { throw new IllegalArgumentException("Invalid Kubernetes master scheme: " + masterScheme) } - return s"k8s:$resolvedURL" + s"k8s://$resolvedURL" } } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 35594ec47c941..2eb8a1fee104c 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -408,7 +408,7 @@ class SparkSubmitSuite childArgsMap.get("--arg") should be (Some("arg1")) mainClass should be (KUBERNETES_CLUSTER_SUBMIT_CLASS) classpath should have length (0) - conf.get("spark.master") should be ("k8s:https://host:port") + conf.get("spark.master") should be ("k8s://https://host:port") conf.get("spark.executor.memory") should be ("5g") conf.get("spark.driver.memory") should be ("4g") conf.get("spark.kubernetes.namespace") should be ("spark") diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 5c4e4ca0cded6..eaea6b030c154 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -1148,16 +1148,16 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { test("check Kubernetes master URL") { val k8sMasterURLHttps = Utils.checkAndGetK8sMasterUrl("k8s://https://host:port") - assert(k8sMasterURLHttps === "k8s:https://host:port") + assert(k8sMasterURLHttps === "k8s://https://host:port") val k8sMasterURLHttp = Utils.checkAndGetK8sMasterUrl("k8s://http://host:port") - assert(k8sMasterURLHttp === "k8s:http://host:port") + assert(k8sMasterURLHttp === "k8s://http://host:port") val k8sMasterURLWithoutScheme = 
Utils.checkAndGetK8sMasterUrl("k8s://127.0.0.1:8443") - assert(k8sMasterURLWithoutScheme === "k8s:https://127.0.0.1:8443") + assert(k8sMasterURLWithoutScheme === "k8s://https://127.0.0.1:8443") val k8sMasterURLWithoutScheme2 = Utils.checkAndGetK8sMasterUrl("k8s://127.0.0.1") - assert(k8sMasterURLWithoutScheme2 === "k8s:https://127.0.0.1") + assert(k8sMasterURLWithoutScheme2 === "k8s://https://127.0.0.1") intercept[IllegalArgumentException] { Utils.checkAndGetK8sMasterUrl("k8s:https://host:port") diff --git a/resource-managers/kubernetes/core/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager b/resource-managers/kubernetes/core/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager new file mode 100644 index 0000000000000..81d14766ffb8d --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager @@ -0,0 +1 @@ +org.apache.spark.scheduler.cluster.k8s.KubernetesClusterManager diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala index 4d17608c602d8..240a1144577b0 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala @@ -203,8 +203,8 @@ private[spark] class KubernetesClientApplication extends SparkApplication { val waitForAppCompletion = sparkConf.get(WAIT_FOR_APP_COMPLETION) val appName = sparkConf.getOption("spark.app.name").getOrElse("spark") // The master URL has been checked for validity already in SparkSubmit. - // We just need to get rid of the "k8s:" prefix here. - val master = sparkConf.get("spark.master").substring("k8s:".length) + // We just need to get rid of the "k8s://" prefix here. + val master = sparkConf.get("spark.master").substring("k8s://".length) val loggingInterval = if (waitForAppCompletion) Some(sparkConf.get(REPORT_INTERVAL)) else None val loggingPodStatusWatcher = new LoggingPodStatusWatcherImpl( From 59daf91b7cfb50b1c20eb41959921fc03103b739 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Thu, 14 Dec 2017 14:31:21 -0800 Subject: [PATCH 125/356] [SPARK-22733] Split StreamExecution into MicroBatchExecution and StreamExecution. ## What changes were proposed in this pull request? StreamExecution is now an abstract base class, which MicroBatchExecution (the current StreamExecution) inherits. When continuous processing is implemented, we'll have a new ContinuousExecution implementation of StreamExecution. A few fields are also renamed to make them less microbatch-specific. ## How was this patch tested? refactoring only Author: Jose Torres Closes #19926 from joseph-torres/continuous-refactor. 
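The split follows a template-method shape: the base class keeps the lifecycle (start, stop, checkpoint logs, error handling) and delegates the actual execution loop to a subclass hook. A toy analogue of the resulting structure, not Spark code and with purely illustrative names:

```scala
// Base class owns the lifecycle; subclasses supply the execution loop via runActivatedStream().
abstract class ToyStreamExecution(val name: String) {
  @volatile protected var active = true
  protected def runActivatedStream(): Unit

  final def runStream(): Unit = {
    println(s"[$name] starting")          // start/stop handling and logging live in the base class
    try runActivatedStream()
    finally println(s"[$name] stopped")
  }

  def stop(): Unit = { active = false }
}

// Micro-batch flavour: loop over discrete batches, mirroring MicroBatchExecution's trigger loop.
class ToyMicroBatchExecution(name: String, batches: Int) extends ToyStreamExecution(name) {
  override protected def runActivatedStream(): Unit =
    (0 until batches).foreach(b => if (active) println(s"[$name] running batch $b"))
}

object ToyDemo extends App {
  new ToyMicroBatchExecution("demo", batches = 3).runStream()
}
```

A continuous-processing engine would plug into the same hook with its own `runActivatedStream` implementation, which is the extension point this refactoring sets up for the future `ContinuousExecution`.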
--- .../{BatchCommitLog.scala => CommitLog.scala} | 8 +- .../streaming/MicroBatchExecution.scala | 407 ++++++++++++++++ .../execution/streaming/StreamExecution.scala | 457 ++---------------- .../sql/streaming/StreamingQueryManager.scala | 2 +- .../streaming/EventTimeWatermarkSuite.scala | 4 +- .../sql/streaming/FileStreamSourceSuite.scala | 5 +- .../spark/sql/streaming/StreamSuite.scala | 2 +- .../spark/sql/streaming/StreamTest.scala | 20 +- .../streaming/StreamingAggregationSuite.scala | 4 +- .../sql/streaming/StreamingQuerySuite.scala | 4 +- 10 files changed, 484 insertions(+), 429 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/{BatchCommitLog.scala => CommitLog.scala} (93%) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BatchCommitLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CommitLog.scala similarity index 93% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BatchCommitLog.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CommitLog.scala index 5e24e8fc4e3cc..5b114242558dc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BatchCommitLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/CommitLog.scala @@ -42,10 +42,10 @@ import org.apache.spark.sql.SparkSession * line 1: version * line 2: metadata (optional json string) */ -class BatchCommitLog(sparkSession: SparkSession, path: String) +class CommitLog(sparkSession: SparkSession, path: String) extends HDFSMetadataLog[String](sparkSession, path) { - import BatchCommitLog._ + import CommitLog._ def add(batchId: Long): Unit = { super.add(batchId, EMPTY_JSON) @@ -53,7 +53,7 @@ class BatchCommitLog(sparkSession: SparkSession, path: String) override def add(batchId: Long, metadata: String): Boolean = { throw new UnsupportedOperationException( - "BatchCommitLog does not take any metadata, use 'add(batchId)' instead") + "CommitLog does not take any metadata, use 'add(batchId)' instead") } override protected def deserialize(in: InputStream): String = { @@ -76,7 +76,7 @@ class BatchCommitLog(sparkSession: SparkSession, path: String) } } -object BatchCommitLog { +object CommitLog { private val VERSION = 1 private val EMPTY_JSON = "{}" } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala new file mode 100644 index 0000000000000..a67dda99dc01b --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -0,0 +1,407 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming + +import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} + +import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.sources.v2.reader.Offset +import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} +import org.apache.spark.util.{Clock, Utils} + +class MicroBatchExecution( + sparkSession: SparkSession, + name: String, + checkpointRoot: String, + analyzedPlan: LogicalPlan, + sink: Sink, + trigger: Trigger, + triggerClock: Clock, + outputMode: OutputMode, + deleteCheckpointOnStop: Boolean) + extends StreamExecution( + sparkSession, name, checkpointRoot, analyzedPlan, sink, + trigger, triggerClock, outputMode, deleteCheckpointOnStop) { + + private val triggerExecutor = trigger match { + case t: ProcessingTime => ProcessingTimeExecutor(t, triggerClock) + case OneTimeTrigger => OneTimeExecutor() + case _ => throw new IllegalStateException(s"Unknown type of trigger: $trigger") + } + + override lazy val logicalPlan: LogicalPlan = { + assert(queryExecutionThread eq Thread.currentThread, + "logicalPlan must be initialized in QueryExecutionThread " + + s"but the current thread was ${Thread.currentThread}") + var nextSourceId = 0L + val toExecutionRelationMap = MutableMap[StreamingRelation, StreamingExecutionRelation]() + val _logicalPlan = analyzedPlan.transform { + case streamingRelation@StreamingRelation(dataSource, _, output) => + toExecutionRelationMap.getOrElseUpdate(streamingRelation, { + // Materialize source to avoid creating it in every batch + val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" + val source = dataSource.createSource(metadataPath) + nextSourceId += 1 + // We still need to use the previous `output` instead of `source.schema` as attributes in + // "df.logicalPlan" has already used attributes of the previous `output`. + StreamingExecutionRelation(source, output)(sparkSession) + }) + } + sources = _logicalPlan.collect { case s: StreamingExecutionRelation => s.source } + uniqueSources = sources.distinct + _logicalPlan + } + + /** + * Repeatedly attempts to run batches as data arrives. + */ + protected def runActivatedStream(sparkSessionForStream: SparkSession): Unit = { + triggerExecutor.execute(() => { + startTrigger() + + if (isActive) { + reportTimeTaken("triggerExecution") { + if (currentBatchId < 0) { + // We'll do this initialization only once + populateStartOffsets(sparkSessionForStream) + sparkSession.sparkContext.setJobDescription(getBatchDescriptionString) + logDebug(s"Stream running from $committedOffsets to $availableOffsets") + } else { + constructNextBatch() + } + if (dataAvailable) { + currentStatus = currentStatus.copy(isDataAvailable = true) + updateStatusMessage("Processing new data") + runBatch(sparkSessionForStream) + } + } + // Report trigger as finished and construct progress object. + finishTrigger(dataAvailable) + if (dataAvailable) { + // Update committed offsets. 
+ commitLog.add(currentBatchId) + committedOffsets ++= availableOffsets + logDebug(s"batch ${currentBatchId} committed") + // We'll increase currentBatchId after we complete processing current batch's data + currentBatchId += 1 + sparkSession.sparkContext.setJobDescription(getBatchDescriptionString) + } else { + currentStatus = currentStatus.copy(isDataAvailable = false) + updateStatusMessage("Waiting for data to arrive") + Thread.sleep(pollingDelayMs) + } + } + updateStatusMessage("Waiting for next trigger") + isActive + }) + } + + /** + * Populate the start offsets to start the execution at the current offsets stored in the sink + * (i.e. avoid reprocessing data that we have already processed). This function must be called + * before any processing occurs and will populate the following fields: + * - currentBatchId + * - committedOffsets + * - availableOffsets + * The basic structure of this method is as follows: + * + * Identify (from the offset log) the offsets used to run the last batch + * IF last batch exists THEN + * Set the next batch to be executed as the last recovered batch + * Check the commit log to see which batch was committed last + * IF the last batch was committed THEN + * Call getBatch using the last batch start and end offsets + * // ^^^^ above line is needed since some sources assume last batch always re-executes + * Setup for a new batch i.e., start = last batch end, and identify new end + * DONE + * ELSE + * Identify a brand new batch + * DONE + */ + private def populateStartOffsets(sparkSessionToRunBatches: SparkSession): Unit = { + offsetLog.getLatest() match { + case Some((latestBatchId, nextOffsets)) => + /* First assume that we are re-executing the latest known batch + * in the offset log */ + currentBatchId = latestBatchId + availableOffsets = nextOffsets.toStreamProgress(sources) + /* Initialize committed offsets to a committed batch, which at this + * is the second latest batch id in the offset log. */ + if (latestBatchId != 0) { + val secondLatestBatchId = offsetLog.get(latestBatchId - 1).getOrElse { + throw new IllegalStateException(s"batch ${latestBatchId - 1} doesn't exist") + } + committedOffsets = secondLatestBatchId.toStreamProgress(sources) + } + + // update offset metadata + nextOffsets.metadata.foreach { metadata => + OffsetSeqMetadata.setSessionConf(metadata, sparkSessionToRunBatches.conf) + offsetSeqMetadata = OffsetSeqMetadata( + metadata.batchWatermarkMs, metadata.batchTimestampMs, sparkSessionToRunBatches.conf) + } + + /* identify the current batch id: if commit log indicates we successfully processed the + * latest batch id in the offset log, then we can safely move to the next batch + * i.e., committedBatchId + 1 */ + commitLog.getLatest() match { + case Some((latestCommittedBatchId, _)) => + if (latestBatchId == latestCommittedBatchId) { + /* The last batch was successfully committed, so we can safely process a + * new next batch but first: + * Make a call to getBatch using the offsets from previous batch. + * because certain sources (e.g., KafkaSource) assume on restart the last + * batch will be executed before getOffset is called again. 
*/ + availableOffsets.foreach { ao: (Source, Offset) => + val (source, end) = ao + if (committedOffsets.get(source).map(_ != end).getOrElse(true)) { + val start = committedOffsets.get(source) + source.getBatch(start, end) + } + } + currentBatchId = latestCommittedBatchId + 1 + committedOffsets ++= availableOffsets + // Construct a new batch be recomputing availableOffsets + constructNextBatch() + } else if (latestCommittedBatchId < latestBatchId - 1) { + logWarning(s"Batch completion log latest batch id is " + + s"${latestCommittedBatchId}, which is not trailing " + + s"batchid $latestBatchId by one") + } + case None => logInfo("no commit log present") + } + logDebug(s"Resuming at batch $currentBatchId with committed offsets " + + s"$committedOffsets and available offsets $availableOffsets") + case None => // We are starting this stream for the first time. + logInfo(s"Starting new streaming query.") + currentBatchId = 0 + constructNextBatch() + } + } + + /** + * Returns true if there is any new data available to be processed. + */ + private def dataAvailable: Boolean = { + availableOffsets.exists { + case (source, available) => + committedOffsets + .get(source) + .map(committed => committed != available) + .getOrElse(true) + } + } + + /** + * Queries all of the sources to see if any new data is available. When there is new data the + * batchId counter is incremented and a new log entry is written with the newest offsets. + */ + private def constructNextBatch(): Unit = { + // Check to see what new data is available. + val hasNewData = { + awaitProgressLock.lock() + try { + val latestOffsets: Map[Source, Option[Offset]] = uniqueSources.map { s => + updateStatusMessage(s"Getting offsets from $s") + reportTimeTaken("getOffset") { + (s, s.getOffset) + } + }.toMap + availableOffsets ++= latestOffsets.filter { case (s, o) => o.nonEmpty }.mapValues(_.get) + + if (dataAvailable) { + true + } else { + noNewData = true + false + } + } finally { + awaitProgressLock.unlock() + } + } + if (hasNewData) { + var batchWatermarkMs = offsetSeqMetadata.batchWatermarkMs + // Update the eventTime watermarks if we find any in the plan. + if (lastExecution != null) { + lastExecution.executedPlan.collect { + case e: EventTimeWatermarkExec => e + }.zipWithIndex.foreach { + case (e, index) if e.eventTimeStats.value.count > 0 => + logDebug(s"Observed event time stats $index: ${e.eventTimeStats.value}") + val newWatermarkMs = e.eventTimeStats.value.max - e.delayMs + val prevWatermarkMs = watermarkMsMap.get(index) + if (prevWatermarkMs.isEmpty || newWatermarkMs > prevWatermarkMs.get) { + watermarkMsMap.put(index, newWatermarkMs) + } + + // Populate 0 if we haven't seen any data yet for this watermark node. + case (_, index) => + if (!watermarkMsMap.isDefinedAt(index)) { + watermarkMsMap.put(index, 0) + } + } + + // Update the global watermark to the minimum of all watermark nodes. + // This is the safest option, because only the global watermark is fault-tolerant. Making + // it the minimum of all individual watermarks guarantees it will never advance past where + // any individual watermark operator would be if it were in a plan by itself. 
+ if(!watermarkMsMap.isEmpty) { + val newWatermarkMs = watermarkMsMap.minBy(_._2)._2 + if (newWatermarkMs > batchWatermarkMs) { + logInfo(s"Updating eventTime watermark to: $newWatermarkMs ms") + batchWatermarkMs = newWatermarkMs + } else { + logDebug( + s"Event time didn't move: $newWatermarkMs < " + + s"$batchWatermarkMs") + } + } + } + offsetSeqMetadata = offsetSeqMetadata.copy( + batchWatermarkMs = batchWatermarkMs, + batchTimestampMs = triggerClock.getTimeMillis()) // Current batch timestamp in milliseconds + + updateStatusMessage("Writing offsets to log") + reportTimeTaken("walCommit") { + assert(offsetLog.add( + currentBatchId, + availableOffsets.toOffsetSeq(sources, offsetSeqMetadata)), + s"Concurrent update to the log. Multiple streaming jobs detected for $currentBatchId") + logInfo(s"Committed offsets for batch $currentBatchId. " + + s"Metadata ${offsetSeqMetadata.toString}") + + // NOTE: The following code is correct because runStream() processes exactly one + // batch at a time. If we add pipeline parallelism (multiple batches in flight at + // the same time), this cleanup logic will need to change. + + // Now that we've updated the scheduler's persistent checkpoint, it is safe for the + // sources to discard data from the previous batch. + if (currentBatchId != 0) { + val prevBatchOff = offsetLog.get(currentBatchId - 1) + if (prevBatchOff.isDefined) { + prevBatchOff.get.toStreamProgress(sources).foreach { + case (src, off) => src.commit(off) + } + } else { + throw new IllegalStateException(s"batch $currentBatchId doesn't exist") + } + } + + // It is now safe to discard the metadata beyond the minimum number to retain. + // Note that purge is exclusive, i.e. it purges everything before the target ID. + if (minLogEntriesToMaintain < currentBatchId) { + offsetLog.purge(currentBatchId - minLogEntriesToMaintain) + commitLog.purge(currentBatchId - minLogEntriesToMaintain) + } + } + } else { + awaitProgressLock.lock() + try { + // Wake up any threads that are waiting for the stream to progress. + awaitProgressLockCondition.signalAll() + } finally { + awaitProgressLock.unlock() + } + } + } + + /** + * Processes any data available between `availableOffsets` and `committedOffsets`. + * @param sparkSessionToRunBatch Isolated [[SparkSession]] to run this batch with. + */ + private def runBatch(sparkSessionToRunBatch: SparkSession): Unit = { + // Request unprocessed data from all sources. + newData = reportTimeTaken("getBatch") { + availableOffsets.flatMap { + case (source, available) + if committedOffsets.get(source).map(_ != available).getOrElse(true) => + val current = committedOffsets.get(source) + val batch = source.getBatch(current, available) + assert(batch.isStreaming, + s"DataFrame returned by getBatch from $source did not have isStreaming=true\n" + + s"${batch.queryExecution.logical}") + logDebug(s"Retrieving data from $source: $current -> $available") + Some(source -> batch) + case _ => None + } + } + + // A list of attributes that will need to be updated. + val replacements = new ArrayBuffer[(Attribute, Attribute)] + // Replace sources in the logical plan with data that has arrived since the last batch. 
+ val withNewSources = logicalPlan transform { + case StreamingExecutionRelation(source, output) => + newData.get(source).map { data => + val newPlan = data.logicalPlan + assert(output.size == newPlan.output.size, + s"Invalid batch: ${Utils.truncatedString(output, ",")} != " + + s"${Utils.truncatedString(newPlan.output, ",")}") + replacements ++= output.zip(newPlan.output) + newPlan + }.getOrElse { + LocalRelation(output, isStreaming = true) + } + } + + // Rewire the plan to use the new attributes that were returned by the source. + val replacementMap = AttributeMap(replacements) + val triggerLogicalPlan = withNewSources transformAllExpressions { + case a: Attribute if replacementMap.contains(a) => + replacementMap(a).withMetadata(a.metadata) + case ct: CurrentTimestamp => + CurrentBatchTimestamp(offsetSeqMetadata.batchTimestampMs, + ct.dataType) + case cd: CurrentDate => + CurrentBatchTimestamp(offsetSeqMetadata.batchTimestampMs, + cd.dataType, cd.timeZoneId) + } + + reportTimeTaken("queryPlanning") { + lastExecution = new IncrementalExecution( + sparkSessionToRunBatch, + triggerLogicalPlan, + outputMode, + checkpointFile("state"), + runId, + currentBatchId, + offsetSeqMetadata) + lastExecution.executedPlan // Force the lazy generation of execution plan + } + + val nextBatch = + new Dataset(sparkSessionToRunBatch, lastExecution, RowEncoder(lastExecution.analyzed.schema)) + + reportTimeTaken("addBatch") { + SQLExecution.withNewExecutionId(sparkSessionToRunBatch, lastExecution) { + sink.addBatch(currentBatchId, nextBatch) + } + } + + awaitProgressLock.lock() + try { + // Wake up any threads that are waiting for the stream to progress. + awaitProgressLockCondition.signalAll() + } finally { + awaitProgressLock.unlock() + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 16063c02ce06b..7946889e85e37 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -22,10 +22,9 @@ import java.nio.channels.ClosedByInterruptException import java.util.UUID import java.util.concurrent.{CountDownLatch, ExecutionException, TimeUnit} import java.util.concurrent.atomic.AtomicReference -import java.util.concurrent.locks.ReentrantLock +import java.util.concurrent.locks.{Condition, ReentrantLock} import scala.collection.mutable.{Map => MutableMap} -import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal import com.google.common.util.concurrent.UncheckedExecutionException @@ -33,10 +32,8 @@ import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.encoders.RowEncoder -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} -import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} -import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.command.StreamingExplainCommand import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.v2.reader.Offset @@ -58,7 +55,7 @@ case object TERMINATED extends State * @param deleteCheckpointOnStop whether to 
delete the checkpoint if the query is stopped without * errors */ -class StreamExecution( +abstract class StreamExecution( override val sparkSession: SparkSession, override val name: String, private val checkpointRoot: String, @@ -72,16 +69,16 @@ class StreamExecution( import org.apache.spark.sql.streaming.StreamingQueryListener._ - private val pollingDelayMs = sparkSession.sessionState.conf.streamingPollingDelay + protected val pollingDelayMs: Long = sparkSession.sessionState.conf.streamingPollingDelay - private val minBatchesToRetain = sparkSession.sessionState.conf.minBatchesToRetain - require(minBatchesToRetain > 0, "minBatchesToRetain has to be positive") + protected val minLogEntriesToMaintain: Int = sparkSession.sessionState.conf.minBatchesToRetain + require(minLogEntriesToMaintain > 0, "minBatchesToRetain has to be positive") /** * A lock used to wait/notify when batches complete. Use a fair lock to avoid thread starvation. */ - private val awaitBatchLock = new ReentrantLock(true) - private val awaitBatchLockCondition = awaitBatchLock.newCondition() + protected val awaitProgressLock = new ReentrantLock(true) + protected val awaitProgressLockCondition = awaitProgressLock.newCondition() private val initializationLatch = new CountDownLatch(1) private val startLatch = new CountDownLatch(1) @@ -90,9 +87,11 @@ class StreamExecution( val resolvedCheckpointRoot = { val checkpointPath = new Path(checkpointRoot) val fs = checkpointPath.getFileSystem(sparkSession.sessionState.newHadoopConf()) - checkpointPath.makeQualified(fs.getUri(), fs.getWorkingDirectory()).toUri.toString + checkpointPath.makeQualified(fs.getUri, fs.getWorkingDirectory).toUri.toString } + def logicalPlan: LogicalPlan + /** * Tracks how much data we have processed and committed to the sink or state store from each * input source. @@ -160,36 +159,7 @@ class StreamExecution( /** * A list of unique sources in the query plan. This will be set when generating logical plan. */ - @volatile private var uniqueSources: Seq[Source] = Seq.empty - - override lazy val logicalPlan: LogicalPlan = { - assert(microBatchThread eq Thread.currentThread, - "logicalPlan must be initialized in StreamExecutionThread " + - s"but the current thread was ${Thread.currentThread}") - var nextSourceId = 0L - val toExecutionRelationMap = MutableMap[StreamingRelation, StreamingExecutionRelation]() - val _logicalPlan = analyzedPlan.transform { - case streamingRelation@StreamingRelation(dataSource, _, output) => - toExecutionRelationMap.getOrElseUpdate(streamingRelation, { - // Materialize source to avoid creating it in every batch - val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" - val source = dataSource.createSource(metadataPath) - nextSourceId += 1 - // We still need to use the previous `output` instead of `source.schema` as attributes in - // "df.logicalPlan" has already used attributes of the previous `output`. 
- StreamingExecutionRelation(source, output)(sparkSession) - }) - } - sources = _logicalPlan.collect { case s: StreamingExecutionRelation => s.source } - uniqueSources = sources.distinct - _logicalPlan - } - - private val triggerExecutor = trigger match { - case t: ProcessingTime => ProcessingTimeExecutor(t, triggerClock) - case OneTimeTrigger => OneTimeExecutor() - case _ => throw new IllegalStateException(s"Unknown type of trigger: $trigger") - } + @volatile protected var uniqueSources: Seq[Source] = Seq.empty /** Defines the internal state of execution */ private val state = new AtomicReference[State](INITIALIZING) @@ -215,13 +185,13 @@ class StreamExecution( * [[org.apache.spark.util.UninterruptibleThread]] to workaround KAFKA-1894: interrupting a * running `KafkaConsumer` may cause endless loop. */ - val microBatchThread = - new StreamExecutionThread(s"stream execution thread for $prettyIdString") { + val queryExecutionThread: QueryExecutionThread = + new QueryExecutionThread(s"stream execution thread for $prettyIdString") { override def run(): Unit = { // To fix call site like "run at :0", we bridge the call site from the caller // thread to this micro batch thread sparkSession.sparkContext.setCallSite(callSite) - runBatches() + runStream() } } @@ -238,7 +208,7 @@ class StreamExecution( * fully processed, and its output was committed to the sink, hence no need to process it again. * This is used (for instance) during restart, to help identify which batch to run next. */ - val batchCommitLog = new BatchCommitLog(sparkSession, checkpointFile("commits")) + val commitLog = new CommitLog(sparkSession, checkpointFile("commits")) /** Whether all fields of the query have been initialized */ private def isInitialized: Boolean = state.get != INITIALIZING @@ -250,7 +220,7 @@ class StreamExecution( override def exception: Option[StreamingQueryException] = Option(streamDeathCause) /** Returns the path of a file with `name` in the checkpoint directory. */ - private def checkpointFile(name: String): String = + protected def checkpointFile(name: String): String = new Path(new Path(resolvedCheckpointRoot), name).toUri.toString /** @@ -259,20 +229,25 @@ class StreamExecution( */ def start(): Unit = { logInfo(s"Starting $prettyIdString. Use $resolvedCheckpointRoot to store the query checkpoint.") - microBatchThread.setDaemon(true) - microBatchThread.start() + queryExecutionThread.setDaemon(true) + queryExecutionThread.start() startLatch.await() // Wait until thread started and QueryStart event has been posted } /** - * Repeatedly attempts to run batches as data arrives. + * Run the activated stream until stopped. + */ + protected def runActivatedStream(sparkSessionForStream: SparkSession): Unit + + /** + * Activate the stream and then wrap a callout to runActivatedStream, handling start and stop. * * Note that this method ensures that [[QueryStartedEvent]] and [[QueryTerminatedEvent]] are * posted such that listeners are guaranteed to get a start event before a termination. * Furthermore, this method also ensures that [[QueryStartedEvent]] event is posted before the * `start()` method returns. */ - private def runBatches(): Unit = { + private def runStream(): Unit = { try { sparkSession.sparkContext.setJobGroup(runId.toString, getBatchDescriptionString, interruptOnCancel = true) @@ -295,56 +270,18 @@ class StreamExecution( logicalPlan // Isolated spark session to run the batches with. 
- val sparkSessionToRunBatches = sparkSession.cloneSession() + val sparkSessionForStream = sparkSession.cloneSession() // Adaptive execution can change num shuffle partitions, disallow - sparkSessionToRunBatches.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") + sparkSessionForStream.conf.set(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key, "false") // Disable cost-based join optimization as we do not want stateful operations to be rearranged - sparkSessionToRunBatches.conf.set(SQLConf.CBO_ENABLED.key, "false") + sparkSessionForStream.conf.set(SQLConf.CBO_ENABLED.key, "false") offsetSeqMetadata = OffsetSeqMetadata( - batchWatermarkMs = 0, batchTimestampMs = 0, sparkSessionToRunBatches.conf) + batchWatermarkMs = 0, batchTimestampMs = 0, sparkSessionForStream.conf) if (state.compareAndSet(INITIALIZING, ACTIVE)) { // Unblock `awaitInitialization` initializationLatch.countDown() - - triggerExecutor.execute(() => { - startTrigger() - - if (isActive) { - reportTimeTaken("triggerExecution") { - if (currentBatchId < 0) { - // We'll do this initialization only once - populateStartOffsets(sparkSessionToRunBatches) - sparkSession.sparkContext.setJobDescription(getBatchDescriptionString) - logDebug(s"Stream running from $committedOffsets to $availableOffsets") - } else { - constructNextBatch() - } - if (dataAvailable) { - currentStatus = currentStatus.copy(isDataAvailable = true) - updateStatusMessage("Processing new data") - runBatch(sparkSessionToRunBatches) - } - } - // Report trigger as finished and construct progress object. - finishTrigger(dataAvailable) - if (dataAvailable) { - // Update committed offsets. - batchCommitLog.add(currentBatchId) - committedOffsets ++= availableOffsets - logDebug(s"batch ${currentBatchId} committed") - // We'll increase currentBatchId after we complete processing current batch's data - currentBatchId += 1 - sparkSession.sparkContext.setJobDescription(getBatchDescriptionString) - } else { - currentStatus = currentStatus.copy(isDataAvailable = false) - updateStatusMessage("Waiting for data to arrive") - Thread.sleep(pollingDelayMs) - } - } - updateStatusMessage("Waiting for next trigger") - isActive - }) + runActivatedStream(sparkSessionForStream) updateStatusMessage("Stopped") } else { // `stop()` is already called. Let `finally` finish the cleanup. @@ -373,7 +310,7 @@ class StreamExecution( if (!NonFatal(e)) { throw e } - } finally microBatchThread.runUninterruptibly { + } finally queryExecutionThread.runUninterruptibly { // The whole `finally` block must run inside `runUninterruptibly` to avoid being interrupted // when a query is stopped by the user. We need to make sure the following codes finish // otherwise it may throw `InterruptedException` to `UncaughtExceptionHandler` (SPARK-21248). @@ -410,12 +347,12 @@ class StreamExecution( } } } finally { - awaitBatchLock.lock() + awaitProgressLock.lock() try { // Wake up any threads that are waiting for the stream to progress. - awaitBatchLockCondition.signalAll() + awaitProgressLockCondition.signalAll() } finally { - awaitBatchLock.unlock() + awaitProgressLock.unlock() } terminationLatch.countDown() } @@ -448,296 +385,6 @@ class StreamExecution( } } - /** - * Populate the start offsets to start the execution at the current offsets stored in the sink - * (i.e. avoid reprocessing data that we have already processed). 
This function must be called - * before any processing occurs and will populate the following fields: - * - currentBatchId - * - committedOffsets - * - availableOffsets - * The basic structure of this method is as follows: - * - * Identify (from the offset log) the offsets used to run the last batch - * IF last batch exists THEN - * Set the next batch to be executed as the last recovered batch - * Check the commit log to see which batch was committed last - * IF the last batch was committed THEN - * Call getBatch using the last batch start and end offsets - * // ^^^^ above line is needed since some sources assume last batch always re-executes - * Setup for a new batch i.e., start = last batch end, and identify new end - * DONE - * ELSE - * Identify a brand new batch - * DONE - */ - private def populateStartOffsets(sparkSessionToRunBatches: SparkSession): Unit = { - offsetLog.getLatest() match { - case Some((latestBatchId, nextOffsets)) => - /* First assume that we are re-executing the latest known batch - * in the offset log */ - currentBatchId = latestBatchId - availableOffsets = nextOffsets.toStreamProgress(sources) - /* Initialize committed offsets to a committed batch, which at this - * is the second latest batch id in the offset log. */ - if (latestBatchId != 0) { - val secondLatestBatchId = offsetLog.get(latestBatchId - 1).getOrElse { - throw new IllegalStateException(s"batch ${latestBatchId - 1} doesn't exist") - } - committedOffsets = secondLatestBatchId.toStreamProgress(sources) - } - - // update offset metadata - nextOffsets.metadata.foreach { metadata => - OffsetSeqMetadata.setSessionConf(metadata, sparkSessionToRunBatches.conf) - offsetSeqMetadata = OffsetSeqMetadata( - metadata.batchWatermarkMs, metadata.batchTimestampMs, sparkSessionToRunBatches.conf) - } - - /* identify the current batch id: if commit log indicates we successfully processed the - * latest batch id in the offset log, then we can safely move to the next batch - * i.e., committedBatchId + 1 */ - batchCommitLog.getLatest() match { - case Some((latestCommittedBatchId, _)) => - if (latestBatchId == latestCommittedBatchId) { - /* The last batch was successfully committed, so we can safely process a - * new next batch but first: - * Make a call to getBatch using the offsets from previous batch. - * because certain sources (e.g., KafkaSource) assume on restart the last - * batch will be executed before getOffset is called again. */ - availableOffsets.foreach { ao: (Source, Offset) => - val (source, end) = ao - if (committedOffsets.get(source).map(_ != end).getOrElse(true)) { - val start = committedOffsets.get(source) - source.getBatch(start, end) - } - } - currentBatchId = latestCommittedBatchId + 1 - committedOffsets ++= availableOffsets - // Construct a new batch be recomputing availableOffsets - constructNextBatch() - } else if (latestCommittedBatchId < latestBatchId - 1) { - logWarning(s"Batch completion log latest batch id is " + - s"${latestCommittedBatchId}, which is not trailing " + - s"batchid $latestBatchId by one") - } - case None => logInfo("no commit log present") - } - logDebug(s"Resuming at batch $currentBatchId with committed offsets " + - s"$committedOffsets and available offsets $availableOffsets") - case None => // We are starting this stream for the first time. - logInfo(s"Starting new streaming query.") - currentBatchId = 0 - constructNextBatch() - } - } - - /** - * Returns true if there is any new data available to be processed. 
- */ - private def dataAvailable: Boolean = { - availableOffsets.exists { - case (source, available) => - committedOffsets - .get(source) - .map(committed => committed != available) - .getOrElse(true) - } - } - - /** - * Queries all of the sources to see if any new data is available. When there is new data the - * batchId counter is incremented and a new log entry is written with the newest offsets. - */ - private def constructNextBatch(): Unit = { - // Check to see what new data is available. - val hasNewData = { - awaitBatchLock.lock() - try { - val latestOffsets: Map[Source, Option[Offset]] = uniqueSources.map { s => - updateStatusMessage(s"Getting offsets from $s") - reportTimeTaken("getOffset") { - (s, s.getOffset) - } - }.toMap - availableOffsets ++= latestOffsets.filter { case (s, o) => o.nonEmpty }.mapValues(_.get) - - if (dataAvailable) { - true - } else { - noNewData = true - false - } - } finally { - awaitBatchLock.unlock() - } - } - if (hasNewData) { - var batchWatermarkMs = offsetSeqMetadata.batchWatermarkMs - // Update the eventTime watermarks if we find any in the plan. - if (lastExecution != null) { - lastExecution.executedPlan.collect { - case e: EventTimeWatermarkExec => e - }.zipWithIndex.foreach { - case (e, index) if e.eventTimeStats.value.count > 0 => - logDebug(s"Observed event time stats $index: ${e.eventTimeStats.value}") - val newWatermarkMs = e.eventTimeStats.value.max - e.delayMs - val prevWatermarkMs = watermarkMsMap.get(index) - if (prevWatermarkMs.isEmpty || newWatermarkMs > prevWatermarkMs.get) { - watermarkMsMap.put(index, newWatermarkMs) - } - - // Populate 0 if we haven't seen any data yet for this watermark node. - case (_, index) => - if (!watermarkMsMap.isDefinedAt(index)) { - watermarkMsMap.put(index, 0) - } - } - - // Update the global watermark to the minimum of all watermark nodes. - // This is the safest option, because only the global watermark is fault-tolerant. Making - // it the minimum of all individual watermarks guarantees it will never advance past where - // any individual watermark operator would be if it were in a plan by itself. - if(!watermarkMsMap.isEmpty) { - val newWatermarkMs = watermarkMsMap.minBy(_._2)._2 - if (newWatermarkMs > batchWatermarkMs) { - logInfo(s"Updating eventTime watermark to: $newWatermarkMs ms") - batchWatermarkMs = newWatermarkMs - } else { - logDebug( - s"Event time didn't move: $newWatermarkMs < " + - s"$batchWatermarkMs") - } - } - } - offsetSeqMetadata = offsetSeqMetadata.copy( - batchWatermarkMs = batchWatermarkMs, - batchTimestampMs = triggerClock.getTimeMillis()) // Current batch timestamp in milliseconds - - updateStatusMessage("Writing offsets to log") - reportTimeTaken("walCommit") { - assert(offsetLog.add( - currentBatchId, - availableOffsets.toOffsetSeq(sources, offsetSeqMetadata)), - s"Concurrent update to the log. Multiple streaming jobs detected for $currentBatchId") - logInfo(s"Committed offsets for batch $currentBatchId. " + - s"Metadata ${offsetSeqMetadata.toString}") - - // NOTE: The following code is correct because runBatches() processes exactly one - // batch at a time. If we add pipeline parallelism (multiple batches in flight at - // the same time), this cleanup logic will need to change. - - // Now that we've updated the scheduler's persistent checkpoint, it is safe for the - // sources to discard data from the previous batch. 
- if (currentBatchId != 0) { - val prevBatchOff = offsetLog.get(currentBatchId - 1) - if (prevBatchOff.isDefined) { - prevBatchOff.get.toStreamProgress(sources).foreach { - case (src, off) => src.commit(off) - } - } else { - throw new IllegalStateException(s"batch $currentBatchId doesn't exist") - } - } - - // It is now safe to discard the metadata beyond the minimum number to retain. - // Note that purge is exclusive, i.e. it purges everything before the target ID. - if (minBatchesToRetain < currentBatchId) { - offsetLog.purge(currentBatchId - minBatchesToRetain) - batchCommitLog.purge(currentBatchId - minBatchesToRetain) - } - } - } else { - awaitBatchLock.lock() - try { - // Wake up any threads that are waiting for the stream to progress. - awaitBatchLockCondition.signalAll() - } finally { - awaitBatchLock.unlock() - } - } - } - - /** - * Processes any data available between `availableOffsets` and `committedOffsets`. - * @param sparkSessionToRunBatch Isolated [[SparkSession]] to run this batch with. - */ - private def runBatch(sparkSessionToRunBatch: SparkSession): Unit = { - // Request unprocessed data from all sources. - newData = reportTimeTaken("getBatch") { - availableOffsets.flatMap { - case (source, available) - if committedOffsets.get(source).map(_ != available).getOrElse(true) => - val current = committedOffsets.get(source) - val batch = source.getBatch(current, available) - assert(batch.isStreaming, - s"DataFrame returned by getBatch from $source did not have isStreaming=true\n" + - s"${batch.queryExecution.logical}") - logDebug(s"Retrieving data from $source: $current -> $available") - Some(source -> batch) - case _ => None - } - } - - // A list of attributes that will need to be updated. - val replacements = new ArrayBuffer[(Attribute, Attribute)] - // Replace sources in the logical plan with data that has arrived since the last batch. - val withNewSources = logicalPlan transform { - case StreamingExecutionRelation(source, output) => - newData.get(source).map { data => - val newPlan = data.logicalPlan - assert(output.size == newPlan.output.size, - s"Invalid batch: ${Utils.truncatedString(output, ",")} != " + - s"${Utils.truncatedString(newPlan.output, ",")}") - replacements ++= output.zip(newPlan.output) - newPlan - }.getOrElse { - LocalRelation(output, isStreaming = true) - } - } - - // Rewire the plan to use the new attributes that were returned by the source. - val replacementMap = AttributeMap(replacements) - val triggerLogicalPlan = withNewSources transformAllExpressions { - case a: Attribute if replacementMap.contains(a) => - replacementMap(a).withMetadata(a.metadata) - case ct: CurrentTimestamp => - CurrentBatchTimestamp(offsetSeqMetadata.batchTimestampMs, - ct.dataType) - case cd: CurrentDate => - CurrentBatchTimestamp(offsetSeqMetadata.batchTimestampMs, - cd.dataType, cd.timeZoneId) - } - - reportTimeTaken("queryPlanning") { - lastExecution = new IncrementalExecution( - sparkSessionToRunBatch, - triggerLogicalPlan, - outputMode, - checkpointFile("state"), - runId, - currentBatchId, - offsetSeqMetadata) - lastExecution.executedPlan // Force the lazy generation of execution plan - } - - val nextBatch = - new Dataset(sparkSessionToRunBatch, lastExecution, RowEncoder(lastExecution.analyzed.schema)) - - reportTimeTaken("addBatch") { - SQLExecution.withNewExecutionId(sparkSessionToRunBatch, lastExecution) { - sink.addBatch(currentBatchId, nextBatch) - } - } - - awaitBatchLock.lock() - try { - // Wake up any threads that are waiting for the stream to progress. 
- awaitBatchLockCondition.signalAll() - } finally { - awaitBatchLock.unlock() - } - } - override protected def postEvent(event: StreamingQueryListener.Event): Unit = { sparkSession.streams.postListenerEvent(event) } @@ -762,10 +409,10 @@ class StreamExecution( // Set the state to TERMINATED so that the batching thread knows that it was interrupted // intentionally state.set(TERMINATED) - if (microBatchThread.isAlive) { + if (queryExecutionThread.isAlive) { sparkSession.sparkContext.cancelJobGroup(runId.toString) - microBatchThread.interrupt() - microBatchThread.join() + queryExecutionThread.interrupt() + queryExecutionThread.join() // microBatchThread may spawn new jobs, so we need to cancel again to prevent a leak sparkSession.sparkContext.cancelJobGroup(runId.toString) } @@ -784,21 +431,21 @@ class StreamExecution( } while (notDone) { - awaitBatchLock.lock() + awaitProgressLock.lock() try { - awaitBatchLockCondition.await(100, TimeUnit.MILLISECONDS) + awaitProgressLockCondition.await(100, TimeUnit.MILLISECONDS) if (streamDeathCause != null) { throw streamDeathCause } } finally { - awaitBatchLock.unlock() + awaitProgressLock.unlock() } } logDebug(s"Unblocked at $newOffset for $source") } /** A flag to indicate that a batch has completed with no new data available. */ - @volatile private var noNewData = false + @volatile protected var noNewData = false /** * Assert that the await APIs should not be called in the stream thread. Otherwise, it may cause @@ -806,7 +453,7 @@ class StreamExecution( * the stream thread forever. */ private def assertAwaitThread(): Unit = { - if (microBatchThread eq Thread.currentThread) { + if (queryExecutionThread eq Thread.currentThread) { throw new IllegalStateException( "Cannot wait for a query state from the same thread that is running the query") } @@ -833,11 +480,11 @@ class StreamExecution( throw streamDeathCause } if (!isActive) return - awaitBatchLock.lock() + awaitProgressLock.lock() try { noNewData = false while (true) { - awaitBatchLockCondition.await(10000, TimeUnit.MILLISECONDS) + awaitProgressLockCondition.await(10000, TimeUnit.MILLISECONDS) if (streamDeathCause != null) { throw streamDeathCause } @@ -846,7 +493,7 @@ class StreamExecution( } } } finally { - awaitBatchLock.unlock() + awaitProgressLock.unlock() } } @@ -900,7 +547,7 @@ class StreamExecution( |Current Available Offsets: $availableOffsets | |Current State: $state - |Thread State: ${microBatchThread.getState}""".stripMargin + |Thread State: ${queryExecutionThread.getState}""".stripMargin if (includeLogicalPlan) { debugString + s"\n\nLogical Plan:\n$logicalPlan" } else { @@ -908,7 +555,7 @@ class StreamExecution( } } - private def getBatchDescriptionString: String = { + protected def getBatchDescriptionString: String = { val batchDescription = if (currentBatchId < 0) "init" else currentBatchId.toString Option(name).map(_ + "
").getOrElse("") + s"id = $id
runId = $runId
batch = $batchDescription" @@ -920,7 +567,7 @@ object StreamExecution { } /** - * A special thread to run the stream query. Some codes require to run in the StreamExecutionThread - * and will use `classOf[StreamExecutionThread]` to check. + * A special thread to run the stream query. Some codes require to run in the QueryExecutionThread + * and will use `classOf[QueryxecutionThread]` to check. */ -abstract class StreamExecutionThread(name: String) extends UninterruptibleThread(name) +abstract class QueryExecutionThread(name: String) extends UninterruptibleThread(name) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 48b0ea20e5da1..555d6e23f9385 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -237,7 +237,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo "is not supported in streaming DataFrames/Datasets and will be disabled.") } - new StreamingQueryWrapper(new StreamExecution( + new StreamingQueryWrapper(new MicroBatchExecution( sparkSession, userSpecifiedName.orNull, checkpointLocation, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala index 47bc452bda0d4..d6bef9ce07379 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/EventTimeWatermarkSuite.scala @@ -260,8 +260,8 @@ class EventTimeWatermarkSuite extends StreamTest with BeforeAndAfter with Matche CheckLastBatch((10, 5)), StopStream, AssertOnQuery { q => // purge commit and clear the sink - val commit = q.batchCommitLog.getLatest().map(_._1).getOrElse(-1L) + 1L - q.batchCommitLog.purge(commit) + val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L) + 1L + q.commitLog.purge(commit) q.sink.asInstanceOf[MemorySink].clear() true }, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index 7a2d9e3728208..c5b57bca18313 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -1024,7 +1024,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest { expectedCompactInterval: Int): Boolean = { import CompactibleFileStreamLog._ - val fileSource = (execution invokePrivate _sources()).head.asInstanceOf[FileStreamSource] + val fileSource = getSourcesFromStreamingQuery(execution).head val metadataLog = fileSource invokePrivate _metadataLog() if (isCompactionBatch(batchId, expectedCompactInterval)) { @@ -1100,8 +1100,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest { CheckAnswer("keep1", "keep2", "keep3"), AssertOnQuery("check getBatch") { execution: StreamExecution => val _sources = PrivateMethod[Seq[Source]]('sources) - val fileSource = - (execution invokePrivate _sources()).head.asInstanceOf[FileStreamSource] + val fileSource = getSourcesFromStreamingQuery(execution).head def verify(startId: Option[Int], endId: Int, expected: String*): Unit = { val start = startId.map(new FileStreamSourceOffset(_)) diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index 8163a1f91e1ca..9e696b2236b68 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -276,7 +276,7 @@ class StreamSuite extends StreamTest { // Check the latest batchid in the commit log def CheckCommitLogLatestBatchId(expectedId: Int): AssertOnQuery = - AssertOnQuery(_.batchCommitLog.getLatest().get._1 == expectedId, + AssertOnQuery(_.commitLog.getLatest().get._1 == expectedId, s"commitLog's latest should be $expectedId") // Ensure that there has not been an incremental execution after restart diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 7a1ff89f2f3b2..fb88c5d327043 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -300,12 +300,14 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be if (currentStream != null) currentStream.committedOffsets.toString else "not started" def threadState = - if (currentStream != null && currentStream.microBatchThread.isAlive) "alive" else "dead" - def threadStackTrace = if (currentStream != null && currentStream.microBatchThread.isAlive) { - s"Thread stack trace: ${currentStream.microBatchThread.getStackTrace.mkString("\n")}" - } else { - "" - } + if (currentStream != null && currentStream.queryExecutionThread.isAlive) "alive" else "dead" + + def threadStackTrace = + if (currentStream != null && currentStream.queryExecutionThread.isAlive) { + s"Thread stack trace: ${currentStream.queryExecutionThread.getStackTrace.mkString("\n")}" + } else { + "" + } def testState = s""" @@ -460,7 +462,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be verify(currentStream != null, "can not stop a stream that is not running") try failAfter(streamingTimeout) { currentStream.stop() - verify(!currentStream.microBatchThread.isAlive, + verify(!currentStream.queryExecutionThread.isAlive, s"microbatch thread not stopped") verify(!currentStream.isActive, "query.isActive() is false even after stopping") @@ -486,7 +488,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be currentStream.awaitTermination() } eventually("microbatch thread not stopped after termination with failure") { - assert(!currentStream.microBatchThread.isAlive) + assert(!currentStream.queryExecutionThread.isAlive) } verify(currentStream.exception === Some(thrownException), s"incorrect exception returned by query.exception()") @@ -614,7 +616,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be case e: org.scalatest.exceptions.TestFailedDueToTimeoutException => failTest("Timed out waiting for stream", e) } finally { - if (currentStream != null && currentStream.microBatchThread.isAlive) { + if (currentStream != null && currentStream.queryExecutionThread.isAlive) { currentStream.stop() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index fa0313592b8e7..38aa5171314f2 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -300,7 +300,7 @@ class StreamingAggregationSuite extends StateStoreMetricsTest StopStream, AssertOnQuery { q => // clear the sink q.sink.asInstanceOf[MemorySink].clear() - q.batchCommitLog.purge(3) + q.commitLog.purge(3) // advance by a minute i.e., 90 seconds total clock.advance(60 * 1000L) true @@ -352,7 +352,7 @@ class StreamingAggregationSuite extends StateStoreMetricsTest StopStream, AssertOnQuery { q => // clear the sink q.sink.asInstanceOf[MemorySink].clear() - q.batchCommitLog.purge(3) + q.commitLog.purge(3) // advance by 60 days i.e., 90 days total clock.advance(DateTimeUtils.MILLIS_PER_DAY * 60) true diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index f813b77e3ce6b..ad4d3abd01aa5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -173,12 +173,12 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi StopStream, // clears out StreamTest state AssertOnQuery { q => // both commit log and offset log contain the same (latest) batch id - q.batchCommitLog.getLatest().map(_._1).getOrElse(-1L) == + q.commitLog.getLatest().map(_._1).getOrElse(-1L) == q.offsetLog.getLatest().map(_._1).getOrElse(-2L) }, AssertOnQuery { q => // blow away commit log and sink result - q.batchCommitLog.purge(1) + q.commitLog.purge(1) q.sink.asInstanceOf[MemorySink].clear() true }, From 0ea2d8c12e49e30df6bbfa57d74134b25f96a196 Mon Sep 17 00:00:00 2001 From: zouchenjun Date: Thu, 14 Dec 2017 15:37:26 -0800 Subject: [PATCH 126/356] [SPARK-22496][SQL] thrift server adds operation logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? since hive 2.0+ upgrades log4j to log4j2,a lot of [changes](https://issues.apache.org/jira/browse/HIVE-11304) are made working on it. as spark is not to ready to update its inner hive version(1.2.1) , so I manage to make little changes. the function registerCurrentOperationLog is moved from SQLOperstion to its parent class ExecuteStatementOperation so spark can use it. ## How was this patch tested? manual test Closes #19721 from ChenjunZou/operation-log. Author: zouchenjun Closes #19961 from ChenjunZou/spark-22496. 
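For context on the mechanism: Hive's `OperationLog` tracks the current log per thread (that is what `OperationLog.setCurrentOperationLog(...)` sets), so the registration has to be done by the thread that actually executes the statement. That is why `registerCurrentOperationLog` is moved up to `ExecuteStatementOperation` and called from `SparkExecuteStatementOperation.run()` in the diff below. A minimal, self-contained sketch of that per-thread pattern follows; the `OpLog` object and `OperationLogSketch` driver are hypothetical stand-ins written only for illustration, not the Hive API.

```scala
// Hypothetical stand-in for Hive's OperationLog, written only to illustrate the
// per-thread registration pattern; it is not the real Hive API.
object OpLog {
  private val current = new ThreadLocal[String]
  def setCurrentOperationLog(handle: String): Unit = current.set(handle)
  def getCurrentOperationLog: Option[String] = Option(current.get())
}

object OperationLogSketch extends App {
  // The registration must happen on the thread that actually executes the statement,
  // which is why the helper needs to be callable from the Spark subclass's run() method
  // and not only from Hive's own SQLOperation.
  val worker = new Thread(new Runnable {
    override def run(): Unit = {
      OpLog.setCurrentOperationLog("operation-42")          // ~ registerCurrentOperationLog()
      assert(OpLog.getCurrentOperationLog.contains("operation-42"))
      // execute() would run here; its log output is captured under this handle
    }
  })
  worker.start()
  worker.join()
  // The registration is scoped to the worker thread; nothing leaks to this one.
  assert(OpLog.getCurrentOperationLog.isEmpty)
}
```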
--- .../cli/operation/ExecuteStatementOperation.java | 13 +++++++++++++ .../hive/service/cli/operation/SQLOperation.java | 12 ------------ .../SparkExecuteStatementOperation.scala | 1 + 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java index 3f2de108f069a..6740d3bb59dc3 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.ql.processors.CommandProcessor; import org.apache.hadoop.hive.ql.processors.CommandProcessorFactory; +import org.apache.hadoop.hive.ql.session.OperationLog; import org.apache.hive.service.cli.HiveSQLException; import org.apache.hive.service.cli.OperationType; import org.apache.hive.service.cli.session.HiveSession; @@ -67,4 +68,16 @@ protected void setConfOverlay(Map confOverlay) { this.confOverlay = confOverlay; } } + + protected void registerCurrentOperationLog() { + if (isOperationLogEnabled) { + if (operationLog == null) { + LOG.warn("Failed to get current OperationLog object of Operation: " + + getHandle().getHandleIdentifier()); + isOperationLogEnabled = false; + return; + } + OperationLog.setCurrentOperationLog(operationLog); + } + } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java index 5014cedd870b6..fd9108eb53ca9 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java @@ -274,18 +274,6 @@ private Hive getSessionHive() throws HiveSQLException { } } - private void registerCurrentOperationLog() { - if (isOperationLogEnabled) { - if (operationLog == null) { - LOG.warn("Failed to get current OperationLog object of Operation: " + - getHandle().getHandleIdentifier()); - isOperationLogEnabled = false; - return; - } - OperationLog.setCurrentOperationLog(operationLog); - } - } - private void cleanup(OperationState state) throws HiveSQLException { setState(state); if (shouldRunAsync()) { diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index f5191fa9132bd..664bc20601eaa 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -170,6 +170,7 @@ private[hive] class SparkExecuteStatementOperation( override def run(): Unit = { val doAsAction = new PrivilegedExceptionAction[Unit]() { override def run(): Unit = { + registerCurrentOperationLog() try { execute() } catch { From 3fea5c4f19cb5369ff8bbeca80768a8aadb463f5 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 14 Dec 2017 22:56:57 -0800 Subject: [PATCH 127/356] [SPARK-22787][TEST][SQL] Add a TPC-H query suite ## What changes were proposed in this pull request? 
Add a test suite to ensure all the TPC-H queries can be successfully analyzed, optimized and compiled without hitting the max iteration threshold. ## How was this patch tested? N/A Author: gatorsmile Closes #19982 from gatorsmile/testTPCH. --- .../datasources/text/TextFileFormat.scala | 7 +- sql/core/src/test/resources/tpch/q1.sql | 23 ++++ sql/core/src/test/resources/tpch/q10.sql | 34 ++++++ sql/core/src/test/resources/tpch/q11.sql | 29 +++++ sql/core/src/test/resources/tpch/q12.sql | 30 +++++ sql/core/src/test/resources/tpch/q13.sql | 22 ++++ sql/core/src/test/resources/tpch/q14.sql | 15 +++ sql/core/src/test/resources/tpch/q15.sql | 35 ++++++ sql/core/src/test/resources/tpch/q16.sql | 32 +++++ sql/core/src/test/resources/tpch/q17.sql | 19 +++ sql/core/src/test/resources/tpch/q18.sql | 35 ++++++ sql/core/src/test/resources/tpch/q19.sql | 37 ++++++ sql/core/src/test/resources/tpch/q2.sql | 46 ++++++++ sql/core/src/test/resources/tpch/q20.sql | 39 +++++++ sql/core/src/test/resources/tpch/q21.sql | 42 +++++++ sql/core/src/test/resources/tpch/q22.sql | 39 +++++++ sql/core/src/test/resources/tpch/q3.sql | 25 ++++ sql/core/src/test/resources/tpch/q4.sql | 23 ++++ sql/core/src/test/resources/tpch/q5.sql | 26 +++++ sql/core/src/test/resources/tpch/q6.sql | 11 ++ sql/core/src/test/resources/tpch/q7.sql | 41 +++++++ sql/core/src/test/resources/tpch/q8.sql | 39 +++++++ sql/core/src/test/resources/tpch/q9.sql | 34 ++++++ .../apache/spark/sql/BenchmarkQueryTest.scala | 78 +++++++++++++ .../apache/spark/sql/TPCDSQuerySuite.scala | 58 +-------- .../org/apache/spark/sql/TPCHQuerySuite.scala | 110 ++++++++++++++++++ 26 files changed, 872 insertions(+), 57 deletions(-) create mode 100644 sql/core/src/test/resources/tpch/q1.sql create mode 100644 sql/core/src/test/resources/tpch/q10.sql create mode 100644 sql/core/src/test/resources/tpch/q11.sql create mode 100644 sql/core/src/test/resources/tpch/q12.sql create mode 100644 sql/core/src/test/resources/tpch/q13.sql create mode 100644 sql/core/src/test/resources/tpch/q14.sql create mode 100644 sql/core/src/test/resources/tpch/q15.sql create mode 100644 sql/core/src/test/resources/tpch/q16.sql create mode 100644 sql/core/src/test/resources/tpch/q17.sql create mode 100644 sql/core/src/test/resources/tpch/q18.sql create mode 100644 sql/core/src/test/resources/tpch/q19.sql create mode 100644 sql/core/src/test/resources/tpch/q2.sql create mode 100644 sql/core/src/test/resources/tpch/q20.sql create mode 100644 sql/core/src/test/resources/tpch/q21.sql create mode 100644 sql/core/src/test/resources/tpch/q22.sql create mode 100644 sql/core/src/test/resources/tpch/q3.sql create mode 100644 sql/core/src/test/resources/tpch/q4.sql create mode 100644 sql/core/src/test/resources/tpch/q5.sql create mode 100644 sql/core/src/test/resources/tpch/q6.sql create mode 100644 sql/core/src/test/resources/tpch/q7.sql create mode 100644 sql/core/src/test/resources/tpch/q8.sql create mode 100644 sql/core/src/test/resources/tpch/q9.sql create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala index 8a6ab303fc0f0..c661e9bd3b94c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala @@ -116,9 +116,10 @@ class TextFileFormat extends TextBasedFileFormat with DataSourceRegister { readToUnsafeMem(broadcastedHadoopConf, requiredSchema, textOptions.wholeText) } - private def readToUnsafeMem(conf: Broadcast[SerializableConfiguration], - requiredSchema: StructType, wholeTextMode: Boolean): - (PartitionedFile) => Iterator[UnsafeRow] = { + private def readToUnsafeMem( + conf: Broadcast[SerializableConfiguration], + requiredSchema: StructType, + wholeTextMode: Boolean): (PartitionedFile) => Iterator[UnsafeRow] = { (file: PartitionedFile) => { val confValue = conf.value.value diff --git a/sql/core/src/test/resources/tpch/q1.sql b/sql/core/src/test/resources/tpch/q1.sql new file mode 100644 index 0000000000000..73eb8d8417155 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q1.sql @@ -0,0 +1,23 @@ +-- using default substitutions + +select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order +from + lineitem +where + l_shipdate <= date '1998-12-01' - interval '90' day +group by + l_returnflag, + l_linestatus +order by + l_returnflag, + l_linestatus diff --git a/sql/core/src/test/resources/tpch/q10.sql b/sql/core/src/test/resources/tpch/q10.sql new file mode 100644 index 0000000000000..3b2ae588dee63 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q10.sql @@ -0,0 +1,34 @@ +-- using default substitutions + +select + c_custkey, + c_name, + sum(l_extendedprice * (1 - l_discount)) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment +from + customer, + orders, + lineitem, + nation +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-10-01' + and o_orderdate < date '1993-10-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey +group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment +order by + revenue desc +limit 20 diff --git a/sql/core/src/test/resources/tpch/q11.sql b/sql/core/src/test/resources/tpch/q11.sql new file mode 100644 index 0000000000000..531e78c21b047 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q11.sql @@ -0,0 +1,29 @@ +-- using default substitutions + +select + ps_partkey, + sum(ps_supplycost * ps_availqty) as value +from + partsupp, + supplier, + nation +where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' +group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001000000 + from + partsupp, + supplier, + nation + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' + ) +order by + value desc diff --git a/sql/core/src/test/resources/tpch/q12.sql b/sql/core/src/test/resources/tpch/q12.sql new file mode 100644 index 0000000000000..d3e70eb481a9d --- /dev/null +++ b/sql/core/src/test/resources/tpch/q12.sql @@ -0,0 +1,30 @@ +-- using default substitutions + +select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as 
low_line_count +from + orders, + lineitem +where + o_orderkey = l_orderkey + and l_shipmode in ('MAIL', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1994-01-01' + and l_receiptdate < date '1994-01-01' + interval '1' year +group by + l_shipmode +order by + l_shipmode diff --git a/sql/core/src/test/resources/tpch/q13.sql b/sql/core/src/test/resources/tpch/q13.sql new file mode 100644 index 0000000000000..3375002c5fd44 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q13.sql @@ -0,0 +1,22 @@ +-- using default substitutions + +select + c_count, + count(*) as custdist +from + ( + select + c_custkey, + count(o_orderkey) as c_count + from + customer left outer join orders on + c_custkey = o_custkey + and o_comment not like '%special%requests%' + group by + c_custkey + ) as c_orders +group by + c_count +order by + custdist desc, + c_count desc diff --git a/sql/core/src/test/resources/tpch/q14.sql b/sql/core/src/test/resources/tpch/q14.sql new file mode 100644 index 0000000000000..753ea568914cb --- /dev/null +++ b/sql/core/src/test/resources/tpch/q14.sql @@ -0,0 +1,15 @@ +-- using default substitutions + +select + 100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue +from + lineitem, + part +where + l_partkey = p_partkey + and l_shipdate >= date '1995-09-01' + and l_shipdate < date '1995-09-01' + interval '1' month diff --git a/sql/core/src/test/resources/tpch/q15.sql b/sql/core/src/test/resources/tpch/q15.sql new file mode 100644 index 0000000000000..64d0b48ec09ae --- /dev/null +++ b/sql/core/src/test/resources/tpch/q15.sql @@ -0,0 +1,35 @@ +-- using default substitutions + +with revenue0 as + (select + l_suppkey as supplier_no, + sum(l_extendedprice * (1 - l_discount)) as total_revenue + from + lineitem + where + l_shipdate >= date '1996-01-01' + and l_shipdate < date '1996-01-01' + interval '3' month + group by + l_suppkey) + + +select + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue +from + supplier, + revenue0 +where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue0 + ) +order by + s_suppkey + diff --git a/sql/core/src/test/resources/tpch/q16.sql b/sql/core/src/test/resources/tpch/q16.sql new file mode 100644 index 0000000000000..a6ac68898ec8d --- /dev/null +++ b/sql/core/src/test/resources/tpch/q16.sql @@ -0,0 +1,32 @@ +-- using default substitutions + +select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt +from + partsupp, + part +where + p_partkey = ps_partkey + and p_brand <> 'Brand#45' + and p_type not like 'MEDIUM POLISHED%' + and p_size in (49, 14, 23, 45, 19, 3, 36, 9) + and ps_suppkey not in ( + select + s_suppkey + from + supplier + where + s_comment like '%Customer%Complaints%' + ) +group by + p_brand, + p_type, + p_size +order by + supplier_cnt desc, + p_brand, + p_type, + p_size diff --git a/sql/core/src/test/resources/tpch/q17.sql b/sql/core/src/test/resources/tpch/q17.sql new file mode 100644 index 0000000000000..74fb1f653a945 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q17.sql @@ -0,0 +1,19 @@ +-- using default substitutions + +select + sum(l_extendedprice) / 7.0 as avg_yearly +from + lineitem, + part +where + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container = 'MED BOX' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + lineitem + where + l_partkey = p_partkey + ) diff --git 
a/sql/core/src/test/resources/tpch/q18.sql b/sql/core/src/test/resources/tpch/q18.sql new file mode 100644 index 0000000000000..210fba19ec7f7 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q18.sql @@ -0,0 +1,35 @@ +-- using default substitutions + +select + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice, + sum(l_quantity) +from + customer, + orders, + lineitem +where + o_orderkey in ( + select + l_orderkey + from + lineitem + group by + l_orderkey having + sum(l_quantity) > 300 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey +group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice +order by + o_totalprice desc, + o_orderdate +limit 100 \ No newline at end of file diff --git a/sql/core/src/test/resources/tpch/q19.sql b/sql/core/src/test/resources/tpch/q19.sql new file mode 100644 index 0000000000000..c07327da3ac29 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q19.sql @@ -0,0 +1,37 @@ +-- using default substitutions + +select + sum(l_extendedprice* (1 - l_discount)) as revenue +from + lineitem, + part +where + ( + p_partkey = l_partkey + and p_brand = 'Brand#12' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 1 and l_quantity <= 1 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 10 and l_quantity <= 10 + 10 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#34' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 20 and l_quantity <= 20 + 10 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) diff --git a/sql/core/src/test/resources/tpch/q2.sql b/sql/core/src/test/resources/tpch/q2.sql new file mode 100644 index 0000000000000..d0e3b7e13e301 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q2.sql @@ -0,0 +1,46 @@ +-- using default substitutions + +select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment +from + part, + supplier, + partsupp, + nation, + region +where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 15 + and p_type like '%BRASS' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + and ps_supplycost = ( + select + min(ps_supplycost) + from + partsupp, + supplier, + nation, + region + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + ) +order by + s_acctbal desc, + n_name, + s_name, + p_partkey +limit 100 diff --git a/sql/core/src/test/resources/tpch/q20.sql b/sql/core/src/test/resources/tpch/q20.sql new file mode 100644 index 0000000000000..e161d340b9b6b --- /dev/null +++ b/sql/core/src/test/resources/tpch/q20.sql @@ -0,0 +1,39 @@ +-- using default substitutions + +select + s_name, + s_address +from + supplier, + nation +where + s_suppkey in ( + select + ps_suppkey + from + partsupp + where + ps_partkey in ( + select + p_partkey + from + part + where + p_name like 'forest%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + lineitem + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1994-01-01' + and 
l_shipdate < date '1994-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'CANADA' +order by + s_name diff --git a/sql/core/src/test/resources/tpch/q21.sql b/sql/core/src/test/resources/tpch/q21.sql new file mode 100644 index 0000000000000..fdcdfbcf7976e --- /dev/null +++ b/sql/core/src/test/resources/tpch/q21.sql @@ -0,0 +1,42 @@ +-- using default substitutions + +select + s_name, + count(*) as numwait +from + supplier, + lineitem l1, + orders, + nation +where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + lineitem l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + lineitem l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' +group by + s_name +order by + numwait desc, + s_name +limit 100 \ No newline at end of file diff --git a/sql/core/src/test/resources/tpch/q22.sql b/sql/core/src/test/resources/tpch/q22.sql new file mode 100644 index 0000000000000..1d7706e9a0bf4 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q22.sql @@ -0,0 +1,39 @@ +-- using default substitutions + +select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal +from + ( + select + substring(c_phone, 1, 2) as cntrycode, + c_acctbal + from + customer + where + substring(c_phone, 1, 2) in + ('13', '31', '23', '29', '30', '18', '17') + and c_acctbal > ( + select + avg(c_acctbal) + from + customer + where + c_acctbal > 0.00 + and substring(c_phone, 1, 2) in + ('13', '31', '23', '29', '30', '18', '17') + ) + and not exists ( + select + * + from + orders + where + o_custkey = c_custkey + ) + ) as custsale +group by + cntrycode +order by + cntrycode diff --git a/sql/core/src/test/resources/tpch/q3.sql b/sql/core/src/test/resources/tpch/q3.sql new file mode 100644 index 0000000000000..948d6bcf12f68 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q3.sql @@ -0,0 +1,25 @@ +-- using default substitutions + +select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority +from + customer, + orders, + lineitem +where + c_mktsegment = 'BUILDING' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < date '1995-03-15' + and l_shipdate > date '1995-03-15' +group by + l_orderkey, + o_orderdate, + o_shippriority +order by + revenue desc, + o_orderdate +limit 10 diff --git a/sql/core/src/test/resources/tpch/q4.sql b/sql/core/src/test/resources/tpch/q4.sql new file mode 100644 index 0000000000000..67330e36a066d --- /dev/null +++ b/sql/core/src/test/resources/tpch/q4.sql @@ -0,0 +1,23 @@ +-- using default substitutions + +select + o_orderpriority, + count(*) as order_count +from + orders +where + o_orderdate >= date '1993-07-01' + and o_orderdate < date '1993-07-01' + interval '3' month + and exists ( + select + * + from + lineitem + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) +group by + o_orderpriority +order by + o_orderpriority diff --git a/sql/core/src/test/resources/tpch/q5.sql b/sql/core/src/test/resources/tpch/q5.sql new file mode 100644 index 0000000000000..b973e9f0a0956 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q5.sql @@ -0,0 +1,26 @@ +-- using default substitutions + +select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue +from + customer, + 
orders, + lineitem, + supplier, + nation, + region +where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and o_orderdate >= date '1994-01-01' + and o_orderdate < date '1994-01-01' + interval '1' year +group by + n_name +order by + revenue desc diff --git a/sql/core/src/test/resources/tpch/q6.sql b/sql/core/src/test/resources/tpch/q6.sql new file mode 100644 index 0000000000000..22294579ee043 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q6.sql @@ -0,0 +1,11 @@ +-- using default substitutions + +select + sum(l_extendedprice * l_discount) as revenue +from + lineitem +where + l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + and l_discount between .06 - 0.01 and .06 + 0.01 + and l_quantity < 24 diff --git a/sql/core/src/test/resources/tpch/q7.sql b/sql/core/src/test/resources/tpch/q7.sql new file mode 100644 index 0000000000000..21105c0519e1b --- /dev/null +++ b/sql/core/src/test/resources/tpch/q7.sql @@ -0,0 +1,41 @@ +-- using default substitutions + +select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue +from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + year(l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') + or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') + ) + and l_shipdate between date '1995-01-01' and date '1996-12-31' + ) as shipping +group by + supp_nation, + cust_nation, + l_year +order by + supp_nation, + cust_nation, + l_year diff --git a/sql/core/src/test/resources/tpch/q8.sql b/sql/core/src/test/resources/tpch/q8.sql new file mode 100644 index 0000000000000..81d81871c4920 --- /dev/null +++ b/sql/core/src/test/resources/tpch/q8.sql @@ -0,0 +1,39 @@ +-- using default substitutions + +select + o_year, + sum(case + when nation = 'BRAZIL' then volume + else 0 + end) / sum(volume) as mkt_share +from + ( + select + year(o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + part, + supplier, + lineitem, + orders, + customer, + nation n1, + nation n2, + region + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between date '1995-01-01' and date '1996-12-31' + and p_type = 'ECONOMY ANODIZED STEEL' + ) as all_nations +group by + o_year +order by + o_year diff --git a/sql/core/src/test/resources/tpch/q9.sql b/sql/core/src/test/resources/tpch/q9.sql new file mode 100644 index 0000000000000..a4e8e8382be6a --- /dev/null +++ b/sql/core/src/test/resources/tpch/q9.sql @@ -0,0 +1,34 @@ +-- using default substitutions + +select + nation, + o_year, + sum(amount) as sum_profit +from + ( + select + n_name as nation, + year(o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + part, + supplier, + lineitem, + partsupp, + orders, + nation + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and 
p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%green%' + ) as profit +group by + nation, + o_year +order by + nation, + o_year desc diff --git a/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala new file mode 100644 index 0000000000000..7037749f14478 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/BenchmarkQueryTest.scala @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodeGenerator} +import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec} +import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.util.Utils + +abstract class BenchmarkQueryTest extends QueryTest with SharedSQLContext with BeforeAndAfterAll { + + // When Utils.isTesting is true, the RuleExecutor will issue an exception when hitting + // the max iteration of analyzer/optimizer batches. 
+ assert(Utils.isTesting, "spark.testing is not set to true") + + /** + * Drop all the tables + */ + protected override def afterAll(): Unit = { + try { + // For debugging dump some statistics about how much time was spent in various optimizer rules + logWarning(RuleExecutor.dumpTimeSpent()) + spark.sessionState.catalog.reset() + } finally { + super.afterAll() + } + } + + override def beforeAll() { + super.beforeAll() + RuleExecutor.resetTime() + } + + protected def checkGeneratedCode(plan: SparkPlan): Unit = { + val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() + plan foreach { + case s: WholeStageCodegenExec => + codegenSubtrees += s + case s => s + } + codegenSubtrees.toSeq.foreach { subtree => + val code = subtree.doCodeGen()._2 + try { + // Just check the generated code can be properly compiled + CodeGenerator.compile(code) + } catch { + case e: Exception => + val msg = + s""" + |failed to compile: + |Subtree: + |$subtree + |Generated code: + |${CodeFormatter.format(code)} + """.stripMargin + throw new Exception(msg, e) + } + } + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala index dd427a5c52edf..1a584187a06e5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCDSQuerySuite.scala @@ -17,41 +17,18 @@ package org.apache.spark.sql -import org.scalatest.BeforeAndAfterAll - -import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodeGenerator} -import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.util.resourceToString -import org.apache.spark.sql.execution.{SparkPlan, WholeStageCodegenExec} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.util.Utils /** - * This test suite ensures all the TPC-DS queries can be successfully analyzed and optimized - * without hitting the max iteration threshold. + * This test suite ensures all the TPC-DS queries can be successfully analyzed, optimized + * and compiled without hitting the max iteration threshold. */ -class TPCDSQuerySuite extends QueryTest with SharedSQLContext with BeforeAndAfterAll { - - // When Utils.isTesting is true, the RuleExecutor will issue an exception when hitting - // the max iteration of analyzer/optimizer batches. 
- assert(Utils.isTesting, "spark.testing is not set to true") - - /** - * Drop all the tables - */ - protected override def afterAll(): Unit = { - try { - // For debugging dump some statistics about how much time was spent in various optimizer rules - logWarning(RuleExecutor.dumpTimeSpent()) - spark.sessionState.catalog.reset() - } finally { - super.afterAll() - } - } +class TPCDSQuerySuite extends BenchmarkQueryTest { override def beforeAll() { super.beforeAll() + sql( """ |CREATE TABLE `catalog_page` ( @@ -350,33 +327,6 @@ class TPCDSQuerySuite extends QueryTest with SharedSQLContext with BeforeAndAfte "q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90", "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99") - private def checkGeneratedCode(plan: SparkPlan): Unit = { - val codegenSubtrees = new collection.mutable.HashSet[WholeStageCodegenExec]() - plan foreach { - case s: WholeStageCodegenExec => - codegenSubtrees += s - case s => s - } - codegenSubtrees.toSeq.foreach { subtree => - val code = subtree.doCodeGen()._2 - try { - // Just check the generated code can be properly compiled - CodeGenerator.compile(code) - } catch { - case e: Exception => - val msg = - s""" - |failed to compile: - |Subtree: - |$subtree - |Generated code: - |${CodeFormatter.format(code)} - """.stripMargin - throw new Exception(msg, e) - } - } - } - tpcdsQueries.foreach { name => val queryString = resourceToString(s"tpcds/$name.sql", classLoader = Thread.currentThread().getContextClassLoader) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala new file mode 100644 index 0000000000000..69ac92eaed7dc --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.rules.RuleExecutor +import org.apache.spark.sql.catalyst.util.resourceToString +import org.apache.spark.util.Utils + +/** + * This test suite ensures all the TPC-H queries can be successfully analyzed, optimized + * and compiled without hitting the max iteration threshold. 
+ */ +class TPCHQuerySuite extends BenchmarkQueryTest { + + override def beforeAll() { + super.beforeAll() + + sql( + """ + |CREATE TABLE `orders` ( + |`o_orderkey` BIGINT, `o_custkey` BIGINT, `o_orderstatus` STRING, + |`o_totalprice` DECIMAL(10,0), `o_orderdate` DATE, `o_orderpriority` STRING, + |`o_clerk` STRING, `o_shippriority` INT, `o_comment` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `nation` ( + |`n_nationkey` BIGINT, `n_name` STRING, `n_regionkey` BIGINT, `n_comment` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `region` ( + |`r_regionkey` BIGINT, `r_name` STRING, `r_comment` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `part` (`p_partkey` BIGINT, `p_name` STRING, `p_mfgr` STRING, + |`p_brand` STRING, `p_type` STRING, `p_size` INT, `p_container` STRING, + |`p_retailprice` DECIMAL(10,0), `p_comment` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `partsupp` (`ps_partkey` BIGINT, `ps_suppkey` BIGINT, + |`ps_availqty` INT, `ps_supplycost` DECIMAL(10,0), `ps_comment` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `customer` (`c_custkey` BIGINT, `c_name` STRING, `c_address` STRING, + |`c_nationkey` STRING, `c_phone` STRING, `c_acctbal` DECIMAL(10,0), + |`c_mktsegment` STRING, `c_comment` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `supplier` (`s_suppkey` BIGINT, `s_name` STRING, `s_address` STRING, + |`s_nationkey` BIGINT, `s_phone` STRING, `s_acctbal` DECIMAL(10,0), `s_comment` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `lineitem` (`l_orderkey` BIGINT, `l_partkey` BIGINT, `l_suppkey` BIGINT, + |`l_linenumber` INT, `l_quantity` DECIMAL(10,0), `l_extendedprice` DECIMAL(10,0), + |`l_discount` DECIMAL(10,0), `l_tax` DECIMAL(10,0), `l_returnflag` STRING, + |`l_linestatus` STRING, `l_shipdate` DATE, `l_commitdate` DATE, `l_receiptdate` DATE, + |`l_shipinstruct` STRING, `l_shipmode` STRING, `l_comment` STRING) + |USING parquet + """.stripMargin) + } + + val tpchQueries = Seq( + "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "q16", "q17", "q18", "q19", "q20", "q21", "q22") + + tpchQueries.foreach { name => + val queryString = resourceToString(s"tpch/$name.sql", + classLoader = Thread.currentThread().getContextClassLoader) + test(name) { + // check the plans can be properly generated + val plan = sql(queryString).queryExecution.executedPlan + checkGeneratedCode(plan) + } + } +} From 3775dd31ee86c32b6161ca99d8fd5cfd7c1a758e Mon Sep 17 00:00:00 2001 From: Yuanjian Li Date: Thu, 14 Dec 2017 23:11:13 -0800 Subject: [PATCH 128/356] [SPARK-22753][SQL] Get rid of dataSource.writeAndRead ## What changes were proposed in this pull request? As the discussion in https://github.com/apache/spark/pull/16481 and https://github.com/apache/spark/pull/18975#discussion_r155454606 Currently the BaseRelation returned by `dataSource.writeAndRead` only used in `CreateDataSourceTableAsSelect`, planForWriting and writeAndRead has some common code paths. In this patch I removed the writeAndRead function and added the getRelation function which only use in `CreateDataSourceTableAsSelectCommand` while saving data to non-existing table. ## How was this patch tested? Existing UT Author: Yuanjian Li Closes #19941 from xuanyuanking/SPARK-22753. 
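One detail worth spelling out for the one-line change below: `sessionState.executePlan(...)` only constructs a `QueryExecution`, whose members are lazy, so the old call without `.toRdd` never performed the write; the write actually ran through the now-removed `writeAndRead` path. Appending `.toRdd` is what forces the planned write to run. A small self-contained sketch of that laziness follows; `FakeQueryExecution` and `WriteAndReadSketch` are hypothetical stand-ins that mimic the behaviour with a plain `lazy val`, not Spark code.

```scala
// Hypothetical stand-in that mimics QueryExecution's laziness with a plain lazy val;
// it is not Spark code, only an illustration of why the trailing ".toRdd" matters.
class FakeQueryExecution(plan: String) {
  var executed = false
  lazy val toRdd: String = {      // the work happens only when toRdd is first accessed
    executed = true
    s"RDD for [$plan]"
  }
}

object WriteAndReadSketch extends App {
  val qe = new FakeQueryExecution("InsertIntoDataSourceDir")
  // Building the "QueryExecution" alone does nothing, which is why the old
  // executePlan(...) call without .toRdd never wrote any data.
  assert(!qe.executed)
  qe.toRdd
  // Forcing toRdd is what actually runs the planned write.
  assert(qe.executed)
}
```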
--- .../sql/execution/command/InsertIntoDataSourceDirCommand.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala index 9e3519073303c..1dc24b3d221cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/InsertIntoDataSourceDirCommand.scala @@ -67,8 +67,7 @@ case class InsertIntoDataSourceDirCommand( val saveMode = if (overwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { - sparkSession.sessionState.executePlan(dataSource.planForWriting(saveMode, query)) - dataSource.writeAndRead(saveMode, query) + sparkSession.sessionState.executePlan(dataSource.planForWriting(saveMode, query)).toRdd } catch { case ex: AnalysisException => logError(s"Failed to write to directory " + storage.locationUri.toString, ex) From e58f275678fb4f904124a4a2a1762f04c835eb0e Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Fri, 15 Dec 2017 09:46:15 -0800 Subject: [PATCH 129/356] Revert "[SPARK-22496][SQL] thrift server adds operation logs" This reverts commit 0ea2d8c12e49e30df6bbfa57d74134b25f96a196. --- .../cli/operation/ExecuteStatementOperation.java | 13 ------------- .../hive/service/cli/operation/SQLOperation.java | 12 ++++++++++++ .../SparkExecuteStatementOperation.scala | 1 - 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java index 6740d3bb59dc3..3f2de108f069a 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java @@ -23,7 +23,6 @@ import org.apache.hadoop.hive.ql.processors.CommandProcessor; import org.apache.hadoop.hive.ql.processors.CommandProcessorFactory; -import org.apache.hadoop.hive.ql.session.OperationLog; import org.apache.hive.service.cli.HiveSQLException; import org.apache.hive.service.cli.OperationType; import org.apache.hive.service.cli.session.HiveSession; @@ -68,16 +67,4 @@ protected void setConfOverlay(Map confOverlay) { this.confOverlay = confOverlay; } } - - protected void registerCurrentOperationLog() { - if (isOperationLogEnabled) { - if (operationLog == null) { - LOG.warn("Failed to get current OperationLog object of Operation: " + - getHandle().getHandleIdentifier()); - isOperationLogEnabled = false; - return; - } - OperationLog.setCurrentOperationLog(operationLog); - } - } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java index fd9108eb53ca9..5014cedd870b6 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java @@ -274,6 +274,18 @@ private Hive getSessionHive() throws HiveSQLException { } } + private void registerCurrentOperationLog() { + if (isOperationLogEnabled) { + if (operationLog == null) { + LOG.warn("Failed to get current OperationLog object of 
Operation: " + + getHandle().getHandleIdentifier()); + isOperationLogEnabled = false; + return; + } + OperationLog.setCurrentOperationLog(operationLog); + } + } + private void cleanup(OperationState state) throws HiveSQLException { setState(state); if (shouldRunAsync()) { diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index 664bc20601eaa..f5191fa9132bd 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -170,7 +170,6 @@ private[hive] class SparkExecuteStatementOperation( override def run(): Unit = { val doAsAction = new PrivilegedExceptionAction[Unit]() { override def run(): Unit = { - registerCurrentOperationLog() try { execute() } catch { From 9fafa8209c51adc2a22b89aedf9af7b5e29e0059 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Fri, 15 Dec 2017 09:56:22 -0800 Subject: [PATCH 130/356] [SPARK-22800][TEST][SQL] Add a SSB query suite ## What changes were proposed in this pull request? Add a test suite to ensure all the [SSB (Star Schema Benchmark)](https://www.cs.umb.edu/~poneil/StarSchemaB.PDF) queries can be successfully analyzed, optimized and compiled without hitting the max iteration threshold. ## How was this patch tested? Added `SSBQuerySuite`. Author: Takeshi Yamamuro Closes #19990 from maropu/SPARK-22800. --- sql/core/src/test/resources/ssb/1.1.sql | 6 ++ sql/core/src/test/resources/ssb/1.2.sql | 6 ++ sql/core/src/test/resources/ssb/1.3.sql | 6 ++ sql/core/src/test/resources/ssb/2.1.sql | 9 ++ sql/core/src/test/resources/ssb/2.2.sql | 9 ++ sql/core/src/test/resources/ssb/2.3.sql | 9 ++ sql/core/src/test/resources/ssb/3.1.sql | 10 +++ sql/core/src/test/resources/ssb/3.2.sql | 10 +++ sql/core/src/test/resources/ssb/3.3.sql | 12 +++ sql/core/src/test/resources/ssb/3.4.sql | 12 +++ sql/core/src/test/resources/ssb/4.1.sql | 11 +++ sql/core/src/test/resources/ssb/4.2.sql | 12 +++ sql/core/src/test/resources/ssb/4.3.sql | 12 +++ .../org/apache/spark/sql/SSBQuerySuite.scala | 87 +++++++++++++++++++ .../org/apache/spark/sql/TPCHQuerySuite.scala | 2 - 15 files changed, 211 insertions(+), 2 deletions(-) create mode 100644 sql/core/src/test/resources/ssb/1.1.sql create mode 100644 sql/core/src/test/resources/ssb/1.2.sql create mode 100644 sql/core/src/test/resources/ssb/1.3.sql create mode 100644 sql/core/src/test/resources/ssb/2.1.sql create mode 100644 sql/core/src/test/resources/ssb/2.2.sql create mode 100644 sql/core/src/test/resources/ssb/2.3.sql create mode 100644 sql/core/src/test/resources/ssb/3.1.sql create mode 100644 sql/core/src/test/resources/ssb/3.2.sql create mode 100644 sql/core/src/test/resources/ssb/3.3.sql create mode 100644 sql/core/src/test/resources/ssb/3.4.sql create mode 100644 sql/core/src/test/resources/ssb/4.1.sql create mode 100644 sql/core/src/test/resources/ssb/4.2.sql create mode 100644 sql/core/src/test/resources/ssb/4.3.sql create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/SSBQuerySuite.scala diff --git a/sql/core/src/test/resources/ssb/1.1.sql b/sql/core/src/test/resources/ssb/1.1.sql new file mode 100644 index 0000000000000..62da3029469e4 --- /dev/null +++ b/sql/core/src/test/resources/ssb/1.1.sql @@ -0,0 +1,6 @@ +select sum(lo_extendedprice*lo_discount) 
as revenue + from lineorder, date + where lo_orderdate = d_datekey + and d_year = 1993 + and lo_discount between 1 and 3 + and lo_quantity < 25 diff --git a/sql/core/src/test/resources/ssb/1.2.sql b/sql/core/src/test/resources/ssb/1.2.sql new file mode 100644 index 0000000000000..1657bfd4b2a32 --- /dev/null +++ b/sql/core/src/test/resources/ssb/1.2.sql @@ -0,0 +1,6 @@ +select sum(lo_extendedprice*lo_discount) as revenue + from lineorder, date + where lo_orderdate = d_datekey + and d_yearmonthnum = 199401 + and lo_discount between 4 and 6 + and lo_quantity between 26 and 35 diff --git a/sql/core/src/test/resources/ssb/1.3.sql b/sql/core/src/test/resources/ssb/1.3.sql new file mode 100644 index 0000000000000..e9bbf51f051dd --- /dev/null +++ b/sql/core/src/test/resources/ssb/1.3.sql @@ -0,0 +1,6 @@ +select sum(lo_extendedprice*lo_discount) as revenue + from lineorder, date + where lo_orderdate = d_datekey + and d_weeknuminyear = 6 and d_year = 1994 + and lo_discount between 5 and 7 + and lo_quantity between 36 and 40 diff --git a/sql/core/src/test/resources/ssb/2.1.sql b/sql/core/src/test/resources/ssb/2.1.sql new file mode 100644 index 0000000000000..00d402785320b --- /dev/null +++ b/sql/core/src/test/resources/ssb/2.1.sql @@ -0,0 +1,9 @@ +select sum(lo_revenue), d_year, p_brand1 + from lineorder, date, part, supplier + where lo_orderdate = d_datekey + and lo_partkey = p_partkey + and lo_suppkey = s_suppkey + and p_category = 'MFGR#12' + and s_region = 'AMERICA' + group by d_year, p_brand1 + order by d_year, p_brand1 diff --git a/sql/core/src/test/resources/ssb/2.2.sql b/sql/core/src/test/resources/ssb/2.2.sql new file mode 100644 index 0000000000000..c53a925f75ddb --- /dev/null +++ b/sql/core/src/test/resources/ssb/2.2.sql @@ -0,0 +1,9 @@ +select sum(lo_revenue), d_year, p_brand1 + from lineorder, date, part, supplier + where lo_orderdate = d_datekey + and lo_partkey = p_partkey + and lo_suppkey = s_suppkey + and p_brand1 between 'MFGR#2221' and 'MFGR#2228' + and s_region = 'ASIA' + group by d_year, p_brand1 + order by d_year, p_brand1 diff --git a/sql/core/src/test/resources/ssb/2.3.sql b/sql/core/src/test/resources/ssb/2.3.sql new file mode 100644 index 0000000000000..e36530c65e428 --- /dev/null +++ b/sql/core/src/test/resources/ssb/2.3.sql @@ -0,0 +1,9 @@ +select sum(lo_revenue), d_year, p_brand1 + from lineorder, date, part, supplier + where lo_orderdate = d_datekey + and lo_partkey = p_partkey + and lo_suppkey = s_suppkey + and p_brand1 = 'MFGR#2221' + and s_region = 'EUROPE' + group by d_year, p_brand1 + order by d_year, p_brand1 diff --git a/sql/core/src/test/resources/ssb/3.1.sql b/sql/core/src/test/resources/ssb/3.1.sql new file mode 100644 index 0000000000000..663ec3fb3d5fa --- /dev/null +++ b/sql/core/src/test/resources/ssb/3.1.sql @@ -0,0 +1,10 @@ +select c_nation, s_nation, d_year, sum(lo_revenue) as revenue + from customer, lineorder, supplier, date + where lo_custkey = c_custkey + and lo_suppkey = s_suppkey + and lo_orderdate = d_datekey + and c_region = 'ASIA' + and s_region = 'ASIA' + and d_year >= 1992 and d_year <= 1997 + group by c_nation, s_nation, d_year + order by d_year asc, revenue desc diff --git a/sql/core/src/test/resources/ssb/3.2.sql b/sql/core/src/test/resources/ssb/3.2.sql new file mode 100644 index 0000000000000..e1eaf10cded62 --- /dev/null +++ b/sql/core/src/test/resources/ssb/3.2.sql @@ -0,0 +1,10 @@ +select c_city, s_city, d_year, sum(lo_revenue) as revenue + from customer, lineorder, supplier, date + where lo_custkey = c_custkey + and lo_suppkey = 
s_suppkey + and lo_orderdate = d_datekey + and c_nation = 'UNITED STATES' + and s_nation = 'UNITED STATES' + and d_year >= 1992 and d_year <= 1997 + group by c_city, s_city, d_year + order by d_year asc, revenue desc diff --git a/sql/core/src/test/resources/ssb/3.3.sql b/sql/core/src/test/resources/ssb/3.3.sql new file mode 100644 index 0000000000000..f2cb44f15a761 --- /dev/null +++ b/sql/core/src/test/resources/ssb/3.3.sql @@ -0,0 +1,12 @@ +select c_city, s_city, d_year, sum(lo_revenue) as revenue + from customer, lineorder, supplier, date + where lo_custkey = c_custkey + and lo_suppkey = s_suppkey + and lo_orderdate = d_datekey + and c_nation = 'UNITED KINGDOM' + and (c_city='UNITED KI1' or c_city='UNITED KI5') + and (s_city='UNITED KI1' or s_city='UNITED KI5') + and s_nation = 'UNITED KINGDOM' + and d_year >= 1992 and d_year <= 1997 + group by c_city, s_city, d_year + order by d_year asc, revenue desc diff --git a/sql/core/src/test/resources/ssb/3.4.sql b/sql/core/src/test/resources/ssb/3.4.sql new file mode 100644 index 0000000000000..936f2464a55f1 --- /dev/null +++ b/sql/core/src/test/resources/ssb/3.4.sql @@ -0,0 +1,12 @@ +select c_city, s_city, d_year, sum(lo_revenue) as revenue + from customer, lineorder, supplier, date + where lo_custkey = c_custkey + and lo_suppkey = s_suppkey + and lo_orderdate = d_datekey + and c_nation = 'UNITED KINGDOM' + and (c_city='UNITED KI1' or c_city='UNITED KI5') + and (s_city='UNITED KI1' or s_city='UNITED KI5') + and s_nation = 'UNITED KINGDOM' + and d_yearmonth = 'Dec1997' + group by c_city, s_city, d_year + order by d_year asc, revenue desc diff --git a/sql/core/src/test/resources/ssb/4.1.sql b/sql/core/src/test/resources/ssb/4.1.sql new file mode 100644 index 0000000000000..af19ecdf58c1f --- /dev/null +++ b/sql/core/src/test/resources/ssb/4.1.sql @@ -0,0 +1,11 @@ +select d_year, c_nation, sum(lo_revenue-lo_supplycost) as profit1 + from date, customer, supplier, part, lineorder + where lo_custkey = c_custkey + and lo_suppkey = s_suppkey + and lo_partkey = p_partkey + and lo_orderdate = d_datekey + and c_region = 'AMERICA' + and s_region = 'AMERICA' + and (p_mfgr = 'MFGR#1' or p_mfgr = 'MFGR#2') + group by d_year, c_nation + order by d_year, c_nation diff --git a/sql/core/src/test/resources/ssb/4.2.sql b/sql/core/src/test/resources/ssb/4.2.sql new file mode 100644 index 0000000000000..2ffe6e2897522 --- /dev/null +++ b/sql/core/src/test/resources/ssb/4.2.sql @@ -0,0 +1,12 @@ +select d_year, s_nation, p_category, sum(lo_revenue-lo_supplycost) as profit1 + from date, customer, supplier, part, lineorder + where lo_custkey = c_custkey + and lo_suppkey = s_suppkey + and lo_partkey = p_partkey + and lo_orderdate = d_datekey + and c_region = 'AMERICA' + and s_region = 'AMERICA' + and (d_year = 1997 or d_year = 1998) + and (p_mfgr = 'MFGR#1' or p_mfgr = 'MFGR#2') + group by d_year, s_nation, p_category + order by d_year, s_nation, p_category diff --git a/sql/core/src/test/resources/ssb/4.3.sql b/sql/core/src/test/resources/ssb/4.3.sql new file mode 100644 index 0000000000000..e24e2cc78c914 --- /dev/null +++ b/sql/core/src/test/resources/ssb/4.3.sql @@ -0,0 +1,12 @@ +select d_year, s_city, p_brand1, sum(lo_revenue-lo_supplycost) as profit1 + from date, customer, supplier, part, lineorder + where lo_custkey = c_custkey + and lo_suppkey = s_suppkey + and lo_partkey = p_partkey + and lo_orderdate = d_datekey + and c_region = 'AMERICA' + and s_nation = 'UNITED STATES' + and (d_year = 1997 or d_year = 1998) + and p_category = 'MFGR#14' + group by d_year, s_city, 
p_brand1 + order by d_year, s_city, p_brand1 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SSBQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SSBQuerySuite.scala new file mode 100644 index 0000000000000..9a0c61b3304c5 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/SSBQuerySuite.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import org.apache.spark.sql.catalyst.util.resourceToString + +/** + * This test suite ensures all the Star Schema Benchmark queries can be successfully analyzed, + * optimized and compiled without hitting the max iteration threshold. + */ +class SSBQuerySuite extends BenchmarkQueryTest { + + override def beforeAll { + super.beforeAll + + sql( + """ + |CREATE TABLE `part` (`p_partkey` INT, `p_name` STRING, `p_mfgr` STRING, + |`p_category` STRING, `p_brand1` STRING, `p_color` STRING, `p_type` STRING, `p_size` INT, + |`p_container` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `supplier` (`s_suppkey` INT, `s_name` STRING, `s_address` STRING, + |`s_city` STRING, `s_nation` STRING, `s_region` STRING, `s_phone` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `customer` (`c_custkey` INT, `c_name` STRING, `c_address` STRING, + |`c_city` STRING, `c_nation` STRING, `c_region` STRING, `c_phone` STRING, + |`c_mktsegment` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `date` (`d_datekey` INT, `d_date` STRING, `d_dayofweek` STRING, + |`d_month` STRING, `d_year` INT, `d_yearmonthnum` INT, `d_yearmonth` STRING, + |`d_daynuminweek` INT, `d_daynuminmonth` INT, `d_daynuminyear` INT, `d_monthnuminyear` INT, + |`d_weeknuminyear` INT, `d_sellingseason` STRING, `d_lastdayinweekfl` STRING, + |`d_lastdayinmonthfl` STRING, `d_holidayfl` STRING, `d_weekdayfl` STRING) + |USING parquet + """.stripMargin) + + sql( + """ + |CREATE TABLE `lineorder` (`lo_orderkey` INT, `lo_linenumber` INT, `lo_custkey` INT, + |`lo_partkey` INT, `lo_suppkey` INT, `lo_orderdate` INT, `lo_orderpriority` STRING, + |`lo_shippriority` STRING, `lo_quantity` INT, `lo_extendedprice` INT, + |`lo_ordertotalprice` INT, `lo_discount` INT, `lo_revenue` INT, `lo_supplycost` INT, + |`lo_tax` INT, `lo_commitdate` INT, `lo_shipmode` STRING) + |USING parquet + """.stripMargin) + } + + val ssbQueries = Seq( + "1.1", "1.2", "1.3", "2.1", "2.2", "2.3", "3.1", "3.2", "3.3", "3.4", "4.1", "4.2", "4.3") + + ssbQueries.foreach { name => + val queryString = resourceToString(s"ssb/$name.sql", + classLoader = Thread.currentThread.getContextClassLoader) + test(name) { + // check the plans can be properly generated + val plan = sql(queryString).queryExecution.executedPlan + checkGeneratedCode(plan) + } + } +} diff 
--git a/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala index 69ac92eaed7dc..e3e700529bba7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/TPCHQuerySuite.scala @@ -17,9 +17,7 @@ package org.apache.spark.sql -import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.util.resourceToString -import org.apache.spark.util.Utils /** * This test suite ensures all the TPC-H queries can be successfully analyzed, optimized From 46776234a49742e94c64897322500582d7393d35 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Fri, 15 Dec 2017 09:58:31 -0800 Subject: [PATCH 131/356] [SPARK-22762][TEST] Basic tests for IfCoercion and CaseWhenCoercion ## What changes were proposed in this pull request? Basic tests for IfCoercion and CaseWhenCoercion ## How was this patch tested? N/A Author: Yuming Wang Closes #19949 from wangyum/SPARK-22762. --- .../typeCoercion/native/caseWhenCoercion.sql | 174 +++ .../inputs/typeCoercion/native/ifCoercion.sql | 174 +++ .../native/caseWhenCoercion.sql.out | 1232 +++++++++++++++++ .../typeCoercion/native/ifCoercion.sql.out | 1232 +++++++++++++++++ 4 files changed, 2812 insertions(+) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/caseWhenCoercion.sql create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/ifCoercion.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/caseWhenCoercion.sql.out create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/ifCoercion.sql.out diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/caseWhenCoercion.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/caseWhenCoercion.sql new file mode 100644 index 0000000000000..a780529fdca8c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/caseWhenCoercion.sql @@ -0,0 +1,174 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +CREATE TEMPORARY VIEW t AS SELECT 1; + +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true 
THEN cast(1 as bigint) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as boolean) END FROM t; +SELECT CASE 
WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as smallint) END FROM t; +SELECT CASE 
WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; + +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as tinyint) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as smallint) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as int) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as bigint) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as float) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as double) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as decimal(10, 0)) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as string) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast('2' as binary) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as boolean) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t; +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast('2017-12-11 09:30:00' as date) END FROM t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/ifCoercion.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/ifCoercion.sql new file mode 100644 index 0000000000000..42597f169daec --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/ifCoercion.sql @@ -0,0 +1,174 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +CREATE TEMPORARY VIEW t AS SELECT 1; + +SELECT IF(true, cast(1 as tinyint), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast(1 as tinyint), cast(2 as smallint)) FROM t; +SELECT IF(true, cast(1 as tinyint), cast(2 as int)) FROM t; +SELECT IF(true, cast(1 as tinyint), cast(2 as bigint)) FROM t; +SELECT IF(true, cast(1 as tinyint), cast(2 as float)) FROM t; +SELECT IF(true, cast(1 as tinyint), cast(2 as double)) FROM t; +SELECT IF(true, cast(1 as tinyint), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast(1 as tinyint), cast(2 as string)) FROM t; +SELECT IF(true, cast(1 as tinyint), cast('2' as binary)) FROM t; +SELECT IF(true, cast(1 as tinyint), cast(2 as boolean)) FROM t; +SELECT IF(true, cast(1 as tinyint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast(1 as tinyint), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast(1 as smallint), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast(1 as smallint), cast(2 as smallint)) FROM t; +SELECT IF(true, cast(1 as smallint), cast(2 as int)) FROM t; +SELECT IF(true, cast(1 as smallint), cast(2 as bigint)) FROM t; +SELECT IF(true, cast(1 as smallint), cast(2 as float)) FROM t; +SELECT IF(true, cast(1 as smallint), cast(2 as double)) FROM t; +SELECT IF(true, cast(1 as smallint), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast(1 as smallint), cast(2 as string)) FROM t; +SELECT IF(true, cast(1 as smallint), cast('2' as binary)) FROM t; +SELECT IF(true, cast(1 as smallint), cast(2 as boolean)) FROM t; +SELECT IF(true, cast(1 as smallint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast(1 as smallint), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast(1 as int), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast(1 as int), cast(2 as smallint)) FROM t; +SELECT IF(true, cast(1 as int), cast(2 as int)) FROM t; +SELECT IF(true, cast(1 as int), cast(2 as bigint)) FROM t; +SELECT IF(true, cast(1 as int), cast(2 as float)) FROM t; +SELECT IF(true, cast(1 as int), cast(2 as double)) FROM t; +SELECT IF(true, cast(1 as int), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast(1 as int), cast(2 as string)) FROM t; +SELECT IF(true, cast(1 as int), cast('2' as binary)) FROM t; +SELECT IF(true, cast(1 as int), cast(2 as boolean)) FROM t; +SELECT IF(true, cast(1 as int), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast(1 as int), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast(1 as bigint), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast(1 as bigint), cast(2 as smallint)) FROM t; +SELECT IF(true, cast(1 as bigint), cast(2 as int)) FROM t; +SELECT IF(true, cast(1 as bigint), cast(2 as bigint)) FROM t; +SELECT IF(true, cast(1 as bigint), cast(2 as float)) FROM t; +SELECT IF(true, cast(1 as bigint), cast(2 as double)) FROM t; +SELECT IF(true, cast(1 as bigint), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast(1 as bigint), cast(2 as string)) FROM t; +SELECT IF(true, cast(1 as bigint), cast('2' as binary)) FROM t; +SELECT IF(true, cast(1 as bigint), cast(2 as boolean)) FROM t; +SELECT IF(true, cast(1 as bigint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast(1 as bigint), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast(1 as float), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast(1 as float), cast(2 as smallint)) FROM t; +SELECT IF(true, cast(1 as float), cast(2 as int)) FROM t; +SELECT IF(true, cast(1 as float), cast(2 as bigint)) FROM t; +SELECT IF(true, 
cast(1 as float), cast(2 as float)) FROM t; +SELECT IF(true, cast(1 as float), cast(2 as double)) FROM t; +SELECT IF(true, cast(1 as float), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast(1 as float), cast(2 as string)) FROM t; +SELECT IF(true, cast(1 as float), cast('2' as binary)) FROM t; +SELECT IF(true, cast(1 as float), cast(2 as boolean)) FROM t; +SELECT IF(true, cast(1 as float), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast(1 as float), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast(1 as double), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast(1 as double), cast(2 as smallint)) FROM t; +SELECT IF(true, cast(1 as double), cast(2 as int)) FROM t; +SELECT IF(true, cast(1 as double), cast(2 as bigint)) FROM t; +SELECT IF(true, cast(1 as double), cast(2 as float)) FROM t; +SELECT IF(true, cast(1 as double), cast(2 as double)) FROM t; +SELECT IF(true, cast(1 as double), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast(1 as double), cast(2 as string)) FROM t; +SELECT IF(true, cast(1 as double), cast('2' as binary)) FROM t; +SELECT IF(true, cast(1 as double), cast(2 as boolean)) FROM t; +SELECT IF(true, cast(1 as double), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast(1 as double), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as smallint)) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as int)) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as bigint)) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as float)) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as double)) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as string)) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast('2' as binary)) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as boolean)) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast(1 as string), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast(1 as string), cast(2 as smallint)) FROM t; +SELECT IF(true, cast(1 as string), cast(2 as int)) FROM t; +SELECT IF(true, cast(1 as string), cast(2 as bigint)) FROM t; +SELECT IF(true, cast(1 as string), cast(2 as float)) FROM t; +SELECT IF(true, cast(1 as string), cast(2 as double)) FROM t; +SELECT IF(true, cast(1 as string), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast(1 as string), cast(2 as string)) FROM t; +SELECT IF(true, cast(1 as string), cast('2' as binary)) FROM t; +SELECT IF(true, cast(1 as string), cast(2 as boolean)) FROM t; +SELECT IF(true, cast(1 as string), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast(1 as string), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast('1' as binary), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast('1' as binary), cast(2 as smallint)) FROM t; +SELECT IF(true, cast('1' as binary), cast(2 as int)) FROM t; +SELECT IF(true, cast('1' as binary), cast(2 as bigint)) FROM t; +SELECT IF(true, cast('1' as binary), cast(2 as float)) FROM t; +SELECT IF(true, cast('1' as binary), cast(2 as double)) FROM t; +SELECT IF(true, cast('1' as binary), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast('1' as 
binary), cast(2 as string)) FROM t; +SELECT IF(true, cast('1' as binary), cast('2' as binary)) FROM t; +SELECT IF(true, cast('1' as binary), cast(2 as boolean)) FROM t; +SELECT IF(true, cast('1' as binary), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast('1' as binary), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast(1 as boolean), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast(1 as boolean), cast(2 as smallint)) FROM t; +SELECT IF(true, cast(1 as boolean), cast(2 as int)) FROM t; +SELECT IF(true, cast(1 as boolean), cast(2 as bigint)) FROM t; +SELECT IF(true, cast(1 as boolean), cast(2 as float)) FROM t; +SELECT IF(true, cast(1 as boolean), cast(2 as double)) FROM t; +SELECT IF(true, cast(1 as boolean), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast(1 as boolean), cast(2 as string)) FROM t; +SELECT IF(true, cast(1 as boolean), cast('2' as binary)) FROM t; +SELECT IF(true, cast(1 as boolean), cast(2 as boolean)) FROM t; +SELECT IF(true, cast(1 as boolean), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast(1 as boolean), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as smallint)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as int)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as bigint)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as float)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as double)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as string)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast('2' as binary)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as boolean)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as tinyint)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as smallint)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as int)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as bigint)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as float)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as double)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as decimal(10, 0))) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as string)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast('2' as binary)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as boolean)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00' as date)) FROM t; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/caseWhenCoercion.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/caseWhenCoercion.sql.out new file mode 100644 index 
0000000000000..a739f8d73181c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/caseWhenCoercion.sql.out @@ -0,0 +1,1232 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 145 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as tinyint) END FROM t +-- !query 1 schema +struct +-- !query 1 output +1 + + +-- !query 2 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as smallint) END FROM t +-- !query 2 schema +struct +-- !query 2 output +1 + + +-- !query 3 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as int) END FROM t +-- !query 3 schema +struct +-- !query 3 output +1 + + +-- !query 4 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as bigint) END FROM t +-- !query 4 schema +struct +-- !query 4 output +1 + + +-- !query 5 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as float) END FROM t +-- !query 5 schema +struct +-- !query 5 output +1.0 + + +-- !query 6 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as double) END FROM t +-- !query 6 schema +struct +-- !query 6 output +1.0 + + +-- !query 7 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 7 schema +struct +-- !query 7 output +1 + + +-- !query 8 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as string) END FROM t +-- !query 8 schema +struct +-- !query 8 output +1 + + +-- !query 9 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast('2' as binary) END FROM t +-- !query 9 schema +struct<> +-- !query 9 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS TINYINT) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 10 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast(2 as boolean) END FROM t +-- !query 10 schema +struct<> +-- !query 10 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS TINYINT) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 11 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 11 schema +struct<> +-- !query 11 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS TINYINT) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 12 +SELECT CASE WHEN true THEN cast(1 as tinyint) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 12 schema +struct<> +-- !query 12 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS TINYINT) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 13 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as tinyint) END FROM t +-- !query 13 schema +struct +-- !query 13 output +1 + + +-- !query 14 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as smallint) END FROM t +-- !query 14 schema +struct +-- !query 14 output +1 + + +-- !query 15 +SELECT 
CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as int) END FROM t +-- !query 15 schema +struct +-- !query 15 output +1 + + +-- !query 16 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as bigint) END FROM t +-- !query 16 schema +struct +-- !query 16 output +1 + + +-- !query 17 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as float) END FROM t +-- !query 17 schema +struct +-- !query 17 output +1.0 + + +-- !query 18 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as double) END FROM t +-- !query 18 schema +struct +-- !query 18 output +1.0 + + +-- !query 19 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 19 schema +struct +-- !query 19 output +1 + + +-- !query 20 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as string) END FROM t +-- !query 20 schema +struct +-- !query 20 output +1 + + +-- !query 21 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast('2' as binary) END FROM t +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS SMALLINT) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 22 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast(2 as boolean) END FROM t +-- !query 22 schema +struct<> +-- !query 22 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS SMALLINT) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 23 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 23 schema +struct<> +-- !query 23 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS SMALLINT) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 24 +SELECT CASE WHEN true THEN cast(1 as smallint) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 24 schema +struct<> +-- !query 24 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS SMALLINT) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 25 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as tinyint) END FROM t +-- !query 25 schema +struct +-- !query 25 output +1 + + +-- !query 26 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as smallint) END FROM t +-- !query 26 schema +struct +-- !query 26 output +1 + + +-- !query 27 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as int) END FROM t +-- !query 27 schema +struct +-- !query 27 output +1 + + +-- !query 28 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as bigint) END FROM t +-- !query 28 schema +struct +-- !query 28 output +1 + + +-- !query 29 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as float) END FROM t +-- !query 29 schema +struct +-- !query 29 output +1.0 + + +-- !query 30 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as double) END FROM t +-- !query 30 schema +struct +-- !query 30 output +1.0 + + +-- !query 31 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as 
decimal(10, 0)) END FROM t +-- !query 31 schema +struct +-- !query 31 output +1 + + +-- !query 32 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as string) END FROM t +-- !query 32 schema +struct +-- !query 32 output +1 + + +-- !query 33 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast('2' as binary) END FROM t +-- !query 33 schema +struct<> +-- !query 33 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS INT) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 34 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast(2 as boolean) END FROM t +-- !query 34 schema +struct<> +-- !query 34 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS INT) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 35 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 35 schema +struct<> +-- !query 35 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS INT) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 36 +SELECT CASE WHEN true THEN cast(1 as int) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 36 schema +struct<> +-- !query 36 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS INT) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 37 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as tinyint) END FROM t +-- !query 37 schema +struct +-- !query 37 output +1 + + +-- !query 38 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as smallint) END FROM t +-- !query 38 schema +struct +-- !query 38 output +1 + + +-- !query 39 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as int) END FROM t +-- !query 39 schema +struct +-- !query 39 output +1 + + +-- !query 40 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as bigint) END FROM t +-- !query 40 schema +struct +-- !query 40 output +1 + + +-- !query 41 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as float) END FROM t +-- !query 41 schema +struct +-- !query 41 output +1.0 + + +-- !query 42 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as double) END FROM t +-- !query 42 schema +struct +-- !query 42 output +1.0 + + +-- !query 43 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 43 schema +struct +-- !query 43 output +1 + + +-- !query 44 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast(2 as string) END FROM t +-- !query 44 schema +struct +-- !query 44 output +1 + + +-- !query 45 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast('2' as binary) END FROM t +-- !query 45 schema +struct<> +-- !query 45 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BIGINT) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 46 +SELECT CASE WHEN true THEN cast(1 as bigint) 
ELSE cast(2 as boolean) END FROM t +-- !query 46 schema +struct<> +-- !query 46 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BIGINT) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 47 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 47 schema +struct<> +-- !query 47 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BIGINT) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 48 +SELECT CASE WHEN true THEN cast(1 as bigint) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 48 schema +struct<> +-- !query 48 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BIGINT) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 49 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as tinyint) END FROM t +-- !query 49 schema +struct +-- !query 49 output +1.0 + + +-- !query 50 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as smallint) END FROM t +-- !query 50 schema +struct +-- !query 50 output +1.0 + + +-- !query 51 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as int) END FROM t +-- !query 51 schema +struct +-- !query 51 output +1.0 + + +-- !query 52 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as bigint) END FROM t +-- !query 52 schema +struct +-- !query 52 output +1.0 + + +-- !query 53 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as float) END FROM t +-- !query 53 schema +struct +-- !query 53 output +1.0 + + +-- !query 54 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as double) END FROM t +-- !query 54 schema +struct +-- !query 54 output +1.0 + + +-- !query 55 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 55 schema +struct +-- !query 55 output +1.0 + + +-- !query 56 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as string) END FROM t +-- !query 56 schema +struct +-- !query 56 output +1.0 + + +-- !query 57 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast('2' as binary) END FROM t +-- !query 57 schema +struct<> +-- !query 57 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS FLOAT) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 58 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast(2 as boolean) END FROM t +-- !query 58 schema +struct<> +-- !query 58 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS FLOAT) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 59 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 59 schema +struct<> +-- !query 59 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS FLOAT) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data 
type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 60 +SELECT CASE WHEN true THEN cast(1 as float) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 60 schema +struct<> +-- !query 60 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS FLOAT) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 61 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as tinyint) END FROM t +-- !query 61 schema +struct +-- !query 61 output +1.0 + + +-- !query 62 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as smallint) END FROM t +-- !query 62 schema +struct +-- !query 62 output +1.0 + + +-- !query 63 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as int) END FROM t +-- !query 63 schema +struct +-- !query 63 output +1.0 + + +-- !query 64 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as bigint) END FROM t +-- !query 64 schema +struct +-- !query 64 output +1.0 + + +-- !query 65 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as float) END FROM t +-- !query 65 schema +struct +-- !query 65 output +1.0 + + +-- !query 66 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as double) END FROM t +-- !query 66 schema +struct +-- !query 66 output +1.0 + + +-- !query 67 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 67 schema +struct +-- !query 67 output +1.0 + + +-- !query 68 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as string) END FROM t +-- !query 68 schema +struct +-- !query 68 output +1.0 + + +-- !query 69 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast('2' as binary) END FROM t +-- !query 69 schema +struct<> +-- !query 69 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS DOUBLE) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 70 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast(2 as boolean) END FROM t +-- !query 70 schema +struct<> +-- !query 70 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS DOUBLE) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 71 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 71 schema +struct<> +-- !query 71 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS DOUBLE) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 72 +SELECT CASE WHEN true THEN cast(1 as double) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 72 schema +struct<> +-- !query 72 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS DOUBLE) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 73 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as tinyint) END FROM t +-- !query 
73 schema +struct +-- !query 73 output +1 + + +-- !query 74 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as smallint) END FROM t +-- !query 74 schema +struct +-- !query 74 output +1 + + +-- !query 75 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as int) END FROM t +-- !query 75 schema +struct +-- !query 75 output +1 + + +-- !query 76 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as bigint) END FROM t +-- !query 76 schema +struct +-- !query 76 output +1 + + +-- !query 77 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as float) END FROM t +-- !query 77 schema +struct +-- !query 77 output +1.0 + + +-- !query 78 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as double) END FROM t +-- !query 78 schema +struct +-- !query 78 output +1.0 + + +-- !query 79 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 79 schema +struct +-- !query 79 output +1 + + +-- !query 80 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as string) END FROM t +-- !query 80 schema +struct +-- !query 80 output +1 + + +-- !query 81 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast('2' as binary) END FROM t +-- !query 81 schema +struct<> +-- !query 81 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS DECIMAL(10,0)) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 82 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast(2 as boolean) END FROM t +-- !query 82 schema +struct<> +-- !query 82 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS DECIMAL(10,0)) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 83 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 83 schema +struct<> +-- !query 83 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS DECIMAL(10,0)) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 84 +SELECT CASE WHEN true THEN cast(1 as decimal(10, 0)) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 84 schema +struct<> +-- !query 84 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS DECIMAL(10,0)) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 85 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as tinyint) END FROM t +-- !query 85 schema +struct +-- !query 85 output +1 + + +-- !query 86 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as smallint) END FROM t +-- !query 86 schema +struct +-- !query 86 output +1 + + +-- !query 87 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as int) END FROM t +-- !query 87 schema +struct +-- !query 87 output +1 + + +-- !query 88 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as bigint) END FROM t +-- !query 88 schema +struct +-- !query 88 output +1 + + +-- !query 89 +SELECT CASE WHEN true THEN cast(1 as 
string) ELSE cast(2 as float) END FROM t +-- !query 89 schema +struct +-- !query 89 output +1 + + +-- !query 90 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as double) END FROM t +-- !query 90 schema +struct +-- !query 90 output +1 + + +-- !query 91 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 91 schema +struct +-- !query 91 output +1 + + +-- !query 92 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as string) END FROM t +-- !query 92 schema +struct +-- !query 92 output +1 + + +-- !query 93 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast('2' as binary) END FROM t +-- !query 93 schema +struct<> +-- !query 93 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS STRING) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 94 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast(2 as boolean) END FROM t +-- !query 94 schema +struct<> +-- !query 94 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS STRING) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 95 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 95 schema +struct +-- !query 95 output +1 + + +-- !query 96 +SELECT CASE WHEN true THEN cast(1 as string) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 96 schema +struct +-- !query 96 output +1 + + +-- !query 97 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as tinyint) END FROM t +-- !query 97 schema +struct<> +-- !query 97 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS TINYINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 98 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as smallint) END FROM t +-- !query 98 schema +struct<> +-- !query 98 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS SMALLINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 99 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as int) END FROM t +-- !query 99 schema +struct<> +-- !query 99 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS INT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 100 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as bigint) END FROM t +-- !query 100 schema +struct<> +-- !query 100 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS BIGINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 101 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as float) END FROM t +-- !query 101 schema +struct<> +-- !query 101 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS 
FLOAT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 102 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as double) END FROM t +-- !query 102 schema +struct<> +-- !query 102 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS DOUBLE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 103 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 103 schema +struct<> +-- !query 103 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS DECIMAL(10,0)) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 104 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as string) END FROM t +-- !query 104 schema +struct<> +-- !query 104 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS STRING) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 105 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast('2' as binary) END FROM t +-- !query 105 schema +struct +-- !query 105 output +1 + + +-- !query 106 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast(2 as boolean) END FROM t +-- !query 106 schema +struct<> +-- !query 106 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 107 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 107 schema +struct<> +-- !query 107 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 108 +SELECT CASE WHEN true THEN cast('1' as binary) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 108 schema +struct<> +-- !query 108 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('1' AS BINARY) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 109 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as tinyint) END FROM t +-- !query 109 schema +struct<> +-- !query 109 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS TINYINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 110 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as smallint) END FROM t +-- !query 110 schema +struct<> +-- !query 110 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS SMALLINT) END' due to data type mismatch: THEN and ELSE expressions should all be same 
type or coercible to a common type; line 1 pos 7 + + +-- !query 111 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as int) END FROM t +-- !query 111 schema +struct<> +-- !query 111 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS INT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 112 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as bigint) END FROM t +-- !query 112 schema +struct<> +-- !query 112 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS BIGINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 113 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as float) END FROM t +-- !query 113 schema +struct<> +-- !query 113 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS FLOAT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 114 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as double) END FROM t +-- !query 114 schema +struct<> +-- !query 114 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS DOUBLE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 115 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 115 schema +struct<> +-- !query 115 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS DECIMAL(10,0)) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 116 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as string) END FROM t +-- !query 116 schema +struct<> +-- !query 116 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST(2 AS STRING) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 117 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast('2' as binary) END FROM t +-- !query 117 schema +struct<> +-- !query 117 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 118 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast(2 as boolean) END FROM t +-- !query 118 schema +struct +-- !query 118 output +true + + +-- !query 119 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 119 schema +struct<> +-- !query 119 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 120 +SELECT CASE WHEN true THEN cast(1 as boolean) ELSE cast('2017-12-11 
09:30:00' as date) END FROM t +-- !query 120 schema +struct<> +-- !query 120 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST(1 AS BOOLEAN) ELSE CAST('2017-12-11 09:30:00' AS DATE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 121 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as tinyint) END FROM t +-- !query 121 schema +struct<> +-- !query 121 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS TINYINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 122 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as smallint) END FROM t +-- !query 122 schema +struct<> +-- !query 122 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS SMALLINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 123 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as int) END FROM t +-- !query 123 schema +struct<> +-- !query 123 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS INT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 124 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as bigint) END FROM t +-- !query 124 schema +struct<> +-- !query 124 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS BIGINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 125 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as float) END FROM t +-- !query 125 schema +struct<> +-- !query 125 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS FLOAT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 126 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as double) END FROM t +-- !query 126 schema +struct<> +-- !query 126 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS DOUBLE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 127 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 127 schema +struct<> +-- !query 127 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS DECIMAL(10,0)) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 128 +SELECT CASE WHEN true THEN cast('2017-12-12 
09:30:00.0' as timestamp) ELSE cast(2 as string) END FROM t +-- !query 128 schema +struct +-- !query 128 output +2017-12-12 09:30:00 + + +-- !query 129 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast('2' as binary) END FROM t +-- !query 129 schema +struct<> +-- !query 129 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 130 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast(2 as boolean) END FROM t +-- !query 130 schema +struct<> +-- !query 130 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 131 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 131 schema +struct +-- !query 131 output +2017-12-12 09:30:00 + + +-- !query 132 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00.0' as timestamp) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 132 schema +struct +-- !query 132 output +2017-12-12 09:30:00 + + +-- !query 133 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as tinyint) END FROM t +-- !query 133 schema +struct<> +-- !query 133 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS TINYINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 134 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as smallint) END FROM t +-- !query 134 schema +struct<> +-- !query 134 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS SMALLINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 135 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as int) END FROM t +-- !query 135 schema +struct<> +-- !query 135 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS INT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 136 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as bigint) END FROM t +-- !query 136 schema +struct<> +-- !query 136 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS BIGINT) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 137 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as float) END FROM t +-- !query 137 schema +struct<> +-- !query 137 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS FLOAT) END' due to data type mismatch: THEN and ELSE 
expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 138 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as double) END FROM t +-- !query 138 schema +struct<> +-- !query 138 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS DOUBLE) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 139 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as decimal(10, 0)) END FROM t +-- !query 139 schema +struct<> +-- !query 139 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS DECIMAL(10,0)) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 140 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as string) END FROM t +-- !query 140 schema +struct +-- !query 140 output +2017-12-12 + + +-- !query 141 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast('2' as binary) END FROM t +-- !query 141 schema +struct<> +-- !query 141 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST('2' AS BINARY) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 142 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast(2 as boolean) END FROM t +-- !query 142 schema +struct<> +-- !query 142 output +org.apache.spark.sql.AnalysisException +cannot resolve 'CASE WHEN true THEN CAST('2017-12-12 09:30:00' AS DATE) ELSE CAST(2 AS BOOLEAN) END' due to data type mismatch: THEN and ELSE expressions should all be same type or coercible to a common type; line 1 pos 7 + + +-- !query 143 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast('2017-12-11 09:30:00.0' as timestamp) END FROM t +-- !query 143 schema +struct +-- !query 143 output +2017-12-12 00:00:00 + + +-- !query 144 +SELECT CASE WHEN true THEN cast('2017-12-12 09:30:00' as date) ELSE cast('2017-12-11 09:30:00' as date) END FROM t +-- !query 144 schema +struct +-- !query 144 output +2017-12-12 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/ifCoercion.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/ifCoercion.sql.out new file mode 100644 index 0000000000000..7097027872707 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/ifCoercion.sql.out @@ -0,0 +1,1232 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 145 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT IF(true, cast(1 as tinyint), cast(2 as tinyint)) FROM t +-- !query 1 schema +struct<(IF(true, CAST(1 AS TINYINT), CAST(2 AS TINYINT))):tinyint> +-- !query 1 output +1 + + +-- !query 2 +SELECT IF(true, cast(1 as tinyint), cast(2 as smallint)) FROM t +-- !query 2 schema +struct<(IF(true, CAST(CAST(1 AS TINYINT) AS SMALLINT), CAST(2 AS SMALLINT))):smallint> +-- !query 2 output +1 + + +-- !query 3 +SELECT IF(true, cast(1 as tinyint), cast(2 as int)) FROM t +-- !query 3 schema +struct<(IF(true, CAST(CAST(1 AS TINYINT) AS INT), CAST(2 AS INT))):int> +-- 
!query 3 output +1 + + +-- !query 4 +SELECT IF(true, cast(1 as tinyint), cast(2 as bigint)) FROM t +-- !query 4 schema +struct<(IF(true, CAST(CAST(1 AS TINYINT) AS BIGINT), CAST(2 AS BIGINT))):bigint> +-- !query 4 output +1 + + +-- !query 5 +SELECT IF(true, cast(1 as tinyint), cast(2 as float)) FROM t +-- !query 5 schema +struct<(IF(true, CAST(CAST(1 AS TINYINT) AS FLOAT), CAST(2 AS FLOAT))):float> +-- !query 5 output +1.0 + + +-- !query 6 +SELECT IF(true, cast(1 as tinyint), cast(2 as double)) FROM t +-- !query 6 schema +struct<(IF(true, CAST(CAST(1 AS TINYINT) AS DOUBLE), CAST(2 AS DOUBLE))):double> +-- !query 6 output +1.0 + + +-- !query 7 +SELECT IF(true, cast(1 as tinyint), cast(2 as decimal(10, 0))) FROM t +-- !query 7 schema +struct<(IF(true, CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)), CAST(2 AS DECIMAL(10,0)))):decimal(10,0)> +-- !query 7 output +1 + + +-- !query 8 +SELECT IF(true, cast(1 as tinyint), cast(2 as string)) FROM t +-- !query 8 schema +struct<(IF(true, CAST(CAST(1 AS TINYINT) AS STRING), CAST(2 AS STRING))):string> +-- !query 8 output +1 + + +-- !query 9 +SELECT IF(true, cast(1 as tinyint), cast('2' as binary)) FROM t +-- !query 9 schema +struct<> +-- !query 9 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS TINYINT), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS TINYINT), CAST('2' AS BINARY)))' (tinyint and binary).; line 1 pos 7 + + +-- !query 10 +SELECT IF(true, cast(1 as tinyint), cast(2 as boolean)) FROM t +-- !query 10 schema +struct<> +-- !query 10 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS TINYINT), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS TINYINT), CAST(2 AS BOOLEAN)))' (tinyint and boolean).; line 1 pos 7 + + +-- !query 11 +SELECT IF(true, cast(1 as tinyint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 11 schema +struct<> +-- !query 11 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (tinyint and timestamp).; line 1 pos 7 + + +-- !query 12 +SELECT IF(true, cast(1 as tinyint), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 12 schema +struct<> +-- !query 12 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00' AS DATE)))' (tinyint and date).; line 1 pos 7 + + +-- !query 13 +SELECT IF(true, cast(1 as smallint), cast(2 as tinyint)) FROM t +-- !query 13 schema +struct<(IF(true, CAST(1 AS SMALLINT), CAST(CAST(2 AS TINYINT) AS SMALLINT))):smallint> +-- !query 13 output +1 + + +-- !query 14 +SELECT IF(true, cast(1 as smallint), cast(2 as smallint)) FROM t +-- !query 14 schema +struct<(IF(true, CAST(1 AS SMALLINT), CAST(2 AS SMALLINT))):smallint> +-- !query 14 output +1 + + +-- !query 15 +SELECT IF(true, cast(1 as smallint), cast(2 as int)) FROM t +-- !query 15 schema +struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS INT), CAST(2 AS INT))):int> +-- !query 15 output +1 + + +-- !query 16 +SELECT IF(true, cast(1 as smallint), cast(2 as bigint)) FROM t +-- !query 16 schema +struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS BIGINT), CAST(2 AS BIGINT))):bigint> +-- !query 16 output +1 + + +-- !query 17 
+SELECT IF(true, cast(1 as smallint), cast(2 as float)) FROM t +-- !query 17 schema +struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS FLOAT), CAST(2 AS FLOAT))):float> +-- !query 17 output +1.0 + + +-- !query 18 +SELECT IF(true, cast(1 as smallint), cast(2 as double)) FROM t +-- !query 18 schema +struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS DOUBLE), CAST(2 AS DOUBLE))):double> +-- !query 18 output +1.0 + + +-- !query 19 +SELECT IF(true, cast(1 as smallint), cast(2 as decimal(10, 0))) FROM t +-- !query 19 schema +struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)), CAST(2 AS DECIMAL(10,0)))):decimal(10,0)> +-- !query 19 output +1 + + +-- !query 20 +SELECT IF(true, cast(1 as smallint), cast(2 as string)) FROM t +-- !query 20 schema +struct<(IF(true, CAST(CAST(1 AS SMALLINT) AS STRING), CAST(2 AS STRING))):string> +-- !query 20 output +1 + + +-- !query 21 +SELECT IF(true, cast(1 as smallint), cast('2' as binary)) FROM t +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS SMALLINT), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS SMALLINT), CAST('2' AS BINARY)))' (smallint and binary).; line 1 pos 7 + + +-- !query 22 +SELECT IF(true, cast(1 as smallint), cast(2 as boolean)) FROM t +-- !query 22 schema +struct<> +-- !query 22 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS SMALLINT), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS SMALLINT), CAST(2 AS BOOLEAN)))' (smallint and boolean).; line 1 pos 7 + + +-- !query 23 +SELECT IF(true, cast(1 as smallint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 23 schema +struct<> +-- !query 23 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (smallint and timestamp).; line 1 pos 7 + + +-- !query 24 +SELECT IF(true, cast(1 as smallint), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 24 schema +struct<> +-- !query 24 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00' AS DATE)))' (smallint and date).; line 1 pos 7 + + +-- !query 25 +SELECT IF(true, cast(1 as int), cast(2 as tinyint)) FROM t +-- !query 25 schema +struct<(IF(true, CAST(1 AS INT), CAST(CAST(2 AS TINYINT) AS INT))):int> +-- !query 25 output +1 + + +-- !query 26 +SELECT IF(true, cast(1 as int), cast(2 as smallint)) FROM t +-- !query 26 schema +struct<(IF(true, CAST(1 AS INT), CAST(CAST(2 AS SMALLINT) AS INT))):int> +-- !query 26 output +1 + + +-- !query 27 +SELECT IF(true, cast(1 as int), cast(2 as int)) FROM t +-- !query 27 schema +struct<(IF(true, CAST(1 AS INT), CAST(2 AS INT))):int> +-- !query 27 output +1 + + +-- !query 28 +SELECT IF(true, cast(1 as int), cast(2 as bigint)) FROM t +-- !query 28 schema +struct<(IF(true, CAST(CAST(1 AS INT) AS BIGINT), CAST(2 AS BIGINT))):bigint> +-- !query 28 output +1 + + +-- !query 29 +SELECT IF(true, cast(1 as int), cast(2 as float)) FROM t +-- !query 29 schema +struct<(IF(true, CAST(CAST(1 AS INT) AS FLOAT), CAST(2 AS FLOAT))):float> +-- !query 29 output +1.0 + + +-- !query 30 +SELECT IF(true, cast(1 as int), cast(2 as double)) FROM t +-- 
!query 30 schema +struct<(IF(true, CAST(CAST(1 AS INT) AS DOUBLE), CAST(2 AS DOUBLE))):double> +-- !query 30 output +1.0 + + +-- !query 31 +SELECT IF(true, cast(1 as int), cast(2 as decimal(10, 0))) FROM t +-- !query 31 schema +struct<(IF(true, CAST(CAST(1 AS INT) AS DECIMAL(10,0)), CAST(2 AS DECIMAL(10,0)))):decimal(10,0)> +-- !query 31 output +1 + + +-- !query 32 +SELECT IF(true, cast(1 as int), cast(2 as string)) FROM t +-- !query 32 schema +struct<(IF(true, CAST(CAST(1 AS INT) AS STRING), CAST(2 AS STRING))):string> +-- !query 32 output +1 + + +-- !query 33 +SELECT IF(true, cast(1 as int), cast('2' as binary)) FROM t +-- !query 33 schema +struct<> +-- !query 33 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS INT), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS INT), CAST('2' AS BINARY)))' (int and binary).; line 1 pos 7 + + +-- !query 34 +SELECT IF(true, cast(1 as int), cast(2 as boolean)) FROM t +-- !query 34 schema +struct<> +-- !query 34 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS INT), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS INT), CAST(2 AS BOOLEAN)))' (int and boolean).; line 1 pos 7 + + +-- !query 35 +SELECT IF(true, cast(1 as int), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 35 schema +struct<> +-- !query 35 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS INT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS INT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (int and timestamp).; line 1 pos 7 + + +-- !query 36 +SELECT IF(true, cast(1 as int), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 36 schema +struct<> +-- !query 36 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS INT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS INT), CAST('2017-12-11 09:30:00' AS DATE)))' (int and date).; line 1 pos 7 + + +-- !query 37 +SELECT IF(true, cast(1 as bigint), cast(2 as tinyint)) FROM t +-- !query 37 schema +struct<(IF(true, CAST(1 AS BIGINT), CAST(CAST(2 AS TINYINT) AS BIGINT))):bigint> +-- !query 37 output +1 + + +-- !query 38 +SELECT IF(true, cast(1 as bigint), cast(2 as smallint)) FROM t +-- !query 38 schema +struct<(IF(true, CAST(1 AS BIGINT), CAST(CAST(2 AS SMALLINT) AS BIGINT))):bigint> +-- !query 38 output +1 + + +-- !query 39 +SELECT IF(true, cast(1 as bigint), cast(2 as int)) FROM t +-- !query 39 schema +struct<(IF(true, CAST(1 AS BIGINT), CAST(CAST(2 AS INT) AS BIGINT))):bigint> +-- !query 39 output +1 + + +-- !query 40 +SELECT IF(true, cast(1 as bigint), cast(2 as bigint)) FROM t +-- !query 40 schema +struct<(IF(true, CAST(1 AS BIGINT), CAST(2 AS BIGINT))):bigint> +-- !query 40 output +1 + + +-- !query 41 +SELECT IF(true, cast(1 as bigint), cast(2 as float)) FROM t +-- !query 41 schema +struct<(IF(true, CAST(CAST(1 AS BIGINT) AS FLOAT), CAST(2 AS FLOAT))):float> +-- !query 41 output +1.0 + + +-- !query 42 +SELECT IF(true, cast(1 as bigint), cast(2 as double)) FROM t +-- !query 42 schema +struct<(IF(true, CAST(CAST(1 AS BIGINT) AS DOUBLE), CAST(2 AS DOUBLE))):double> +-- !query 42 output +1.0 + + +-- !query 43 +SELECT IF(true, cast(1 as bigint), cast(2 as decimal(10, 0))) FROM t +-- !query 43 schema +struct<(IF(true, CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)), CAST(CAST(2 AS DECIMAL(10,0)) AS 
DECIMAL(20,0)))):decimal(20,0)> +-- !query 43 output +1 + + +-- !query 44 +SELECT IF(true, cast(1 as bigint), cast(2 as string)) FROM t +-- !query 44 schema +struct<(IF(true, CAST(CAST(1 AS BIGINT) AS STRING), CAST(2 AS STRING))):string> +-- !query 44 output +1 + + +-- !query 45 +SELECT IF(true, cast(1 as bigint), cast('2' as binary)) FROM t +-- !query 45 schema +struct<> +-- !query 45 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BIGINT), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BIGINT), CAST('2' AS BINARY)))' (bigint and binary).; line 1 pos 7 + + +-- !query 46 +SELECT IF(true, cast(1 as bigint), cast(2 as boolean)) FROM t +-- !query 46 schema +struct<> +-- !query 46 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BIGINT), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BIGINT), CAST(2 AS BOOLEAN)))' (bigint and boolean).; line 1 pos 7 + + +-- !query 47 +SELECT IF(true, cast(1 as bigint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 47 schema +struct<> +-- !query 47 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (bigint and timestamp).; line 1 pos 7 + + +-- !query 48 +SELECT IF(true, cast(1 as bigint), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 48 schema +struct<> +-- !query 48 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00' AS DATE)))' (bigint and date).; line 1 pos 7 + + +-- !query 49 +SELECT IF(true, cast(1 as float), cast(2 as tinyint)) FROM t +-- !query 49 schema +struct<(IF(true, CAST(1 AS FLOAT), CAST(CAST(2 AS TINYINT) AS FLOAT))):float> +-- !query 49 output +1.0 + + +-- !query 50 +SELECT IF(true, cast(1 as float), cast(2 as smallint)) FROM t +-- !query 50 schema +struct<(IF(true, CAST(1 AS FLOAT), CAST(CAST(2 AS SMALLINT) AS FLOAT))):float> +-- !query 50 output +1.0 + + +-- !query 51 +SELECT IF(true, cast(1 as float), cast(2 as int)) FROM t +-- !query 51 schema +struct<(IF(true, CAST(1 AS FLOAT), CAST(CAST(2 AS INT) AS FLOAT))):float> +-- !query 51 output +1.0 + + +-- !query 52 +SELECT IF(true, cast(1 as float), cast(2 as bigint)) FROM t +-- !query 52 schema +struct<(IF(true, CAST(1 AS FLOAT), CAST(CAST(2 AS BIGINT) AS FLOAT))):float> +-- !query 52 output +1.0 + + +-- !query 53 +SELECT IF(true, cast(1 as float), cast(2 as float)) FROM t +-- !query 53 schema +struct<(IF(true, CAST(1 AS FLOAT), CAST(2 AS FLOAT))):float> +-- !query 53 output +1.0 + + +-- !query 54 +SELECT IF(true, cast(1 as float), cast(2 as double)) FROM t +-- !query 54 schema +struct<(IF(true, CAST(CAST(1 AS FLOAT) AS DOUBLE), CAST(2 AS DOUBLE))):double> +-- !query 54 output +1.0 + + +-- !query 55 +SELECT IF(true, cast(1 as float), cast(2 as decimal(10, 0))) FROM t +-- !query 55 schema +struct<(IF(true, CAST(CAST(1 AS FLOAT) AS DOUBLE), CAST(CAST(2 AS DECIMAL(10,0)) AS DOUBLE))):double> +-- !query 55 output +1.0 + + +-- !query 56 +SELECT IF(true, cast(1 as float), cast(2 as string)) FROM t +-- !query 56 schema +struct<(IF(true, CAST(CAST(1 AS FLOAT) AS STRING), CAST(2 AS STRING))):string> +-- !query 56 output +1.0 + + +-- !query 
57 +SELECT IF(true, cast(1 as float), cast('2' as binary)) FROM t +-- !query 57 schema +struct<> +-- !query 57 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS FLOAT), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS FLOAT), CAST('2' AS BINARY)))' (float and binary).; line 1 pos 7 + + +-- !query 58 +SELECT IF(true, cast(1 as float), cast(2 as boolean)) FROM t +-- !query 58 schema +struct<> +-- !query 58 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS FLOAT), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS FLOAT), CAST(2 AS BOOLEAN)))' (float and boolean).; line 1 pos 7 + + +-- !query 59 +SELECT IF(true, cast(1 as float), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 59 schema +struct<> +-- !query 59 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (float and timestamp).; line 1 pos 7 + + +-- !query 60 +SELECT IF(true, cast(1 as float), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 60 schema +struct<> +-- !query 60 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00' AS DATE)))' (float and date).; line 1 pos 7 + + +-- !query 61 +SELECT IF(true, cast(1 as double), cast(2 as tinyint)) FROM t +-- !query 61 schema +struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS TINYINT) AS DOUBLE))):double> +-- !query 61 output +1.0 + + +-- !query 62 +SELECT IF(true, cast(1 as double), cast(2 as smallint)) FROM t +-- !query 62 schema +struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS SMALLINT) AS DOUBLE))):double> +-- !query 62 output +1.0 + + +-- !query 63 +SELECT IF(true, cast(1 as double), cast(2 as int)) FROM t +-- !query 63 schema +struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS INT) AS DOUBLE))):double> +-- !query 63 output +1.0 + + +-- !query 64 +SELECT IF(true, cast(1 as double), cast(2 as bigint)) FROM t +-- !query 64 schema +struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS BIGINT) AS DOUBLE))):double> +-- !query 64 output +1.0 + + +-- !query 65 +SELECT IF(true, cast(1 as double), cast(2 as float)) FROM t +-- !query 65 schema +struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS FLOAT) AS DOUBLE))):double> +-- !query 65 output +1.0 + + +-- !query 66 +SELECT IF(true, cast(1 as double), cast(2 as double)) FROM t +-- !query 66 schema +struct<(IF(true, CAST(1 AS DOUBLE), CAST(2 AS DOUBLE))):double> +-- !query 66 output +1.0 + + +-- !query 67 +SELECT IF(true, cast(1 as double), cast(2 as decimal(10, 0))) FROM t +-- !query 67 schema +struct<(IF(true, CAST(1 AS DOUBLE), CAST(CAST(2 AS DECIMAL(10,0)) AS DOUBLE))):double> +-- !query 67 output +1.0 + + +-- !query 68 +SELECT IF(true, cast(1 as double), cast(2 as string)) FROM t +-- !query 68 schema +struct<(IF(true, CAST(CAST(1 AS DOUBLE) AS STRING), CAST(2 AS STRING))):string> +-- !query 68 output +1.0 + + +-- !query 69 +SELECT IF(true, cast(1 as double), cast('2' as binary)) FROM t +-- !query 69 schema +struct<> +-- !query 69 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS DOUBLE), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, 
CAST(1 AS DOUBLE), CAST('2' AS BINARY)))' (double and binary).; line 1 pos 7 + + +-- !query 70 +SELECT IF(true, cast(1 as double), cast(2 as boolean)) FROM t +-- !query 70 schema +struct<> +-- !query 70 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS DOUBLE), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DOUBLE), CAST(2 AS BOOLEAN)))' (double and boolean).; line 1 pos 7 + + +-- !query 71 +SELECT IF(true, cast(1 as double), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 71 schema +struct<> +-- !query 71 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (double and timestamp).; line 1 pos 7 + + +-- !query 72 +SELECT IF(true, cast(1 as double), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 72 schema +struct<> +-- !query 72 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE)))' (double and date).; line 1 pos 7 + + +-- !query 73 +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as tinyint)) FROM t +-- !query 73 schema +struct<(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(CAST(2 AS TINYINT) AS DECIMAL(10,0)))):decimal(10,0)> +-- !query 73 output +1 + + +-- !query 74 +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as smallint)) FROM t +-- !query 74 schema +struct<(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(CAST(2 AS SMALLINT) AS DECIMAL(10,0)))):decimal(10,0)> +-- !query 74 output +1 + + +-- !query 75 +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as int)) FROM t +-- !query 75 schema +struct<(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(CAST(2 AS INT) AS DECIMAL(10,0)))):decimal(10,0)> +-- !query 75 output +1 + + +-- !query 76 +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as bigint)) FROM t +-- !query 76 schema +struct<(IF(true, CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)), CAST(CAST(2 AS BIGINT) AS DECIMAL(20,0)))):decimal(20,0)> +-- !query 76 output +1 + + +-- !query 77 +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as float)) FROM t +-- !query 77 schema +struct<(IF(true, CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE), CAST(CAST(2 AS FLOAT) AS DOUBLE))):double> +-- !query 77 output +1.0 + + +-- !query 78 +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as double)) FROM t +-- !query 78 schema +struct<(IF(true, CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE), CAST(2 AS DOUBLE))):double> +-- !query 78 output +1.0 + + +-- !query 79 +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as decimal(10, 0))) FROM t +-- !query 79 schema +struct<(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(2 AS DECIMAL(10,0)))):decimal(10,0)> +-- !query 79 output +1 + + +-- !query 80 +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as string)) FROM t +-- !query 80 schema +struct<(IF(true, CAST(CAST(1 AS DECIMAL(10,0)) AS STRING), CAST(2 AS STRING))):string> +-- !query 80 output +1 + + +-- !query 81 +SELECT IF(true, cast(1 as decimal(10, 0)), cast('2' as binary)) FROM t +-- !query 81 schema +struct<> +-- !query 81 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2' AS 
BINARY)))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 82 +SELECT IF(true, cast(1 as decimal(10, 0)), cast(2 as boolean)) FROM t +-- !query 82 schema +struct<> +-- !query 82 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST(2 AS BOOLEAN)))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 83 +SELECT IF(true, cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 83 schema +struct<> +-- !query 83 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 84 +SELECT IF(true, cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 84 schema +struct<> +-- !query 84 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE)))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 85 +SELECT IF(true, cast(1 as string), cast(2 as tinyint)) FROM t +-- !query 85 schema +struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS TINYINT) AS STRING))):string> +-- !query 85 output +1 + + +-- !query 86 +SELECT IF(true, cast(1 as string), cast(2 as smallint)) FROM t +-- !query 86 schema +struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS SMALLINT) AS STRING))):string> +-- !query 86 output +1 + + +-- !query 87 +SELECT IF(true, cast(1 as string), cast(2 as int)) FROM t +-- !query 87 schema +struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS INT) AS STRING))):string> +-- !query 87 output +1 + + +-- !query 88 +SELECT IF(true, cast(1 as string), cast(2 as bigint)) FROM t +-- !query 88 schema +struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS BIGINT) AS STRING))):string> +-- !query 88 output +1 + + +-- !query 89 +SELECT IF(true, cast(1 as string), cast(2 as float)) FROM t +-- !query 89 schema +struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS FLOAT) AS STRING))):string> +-- !query 89 output +1 + + +-- !query 90 +SELECT IF(true, cast(1 as string), cast(2 as double)) FROM t +-- !query 90 schema +struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS DOUBLE) AS STRING))):string> +-- !query 90 output +1 + + +-- !query 91 +SELECT IF(true, cast(1 as string), cast(2 as decimal(10, 0))) FROM t +-- !query 91 schema +struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2 AS DECIMAL(10,0)) AS STRING))):string> +-- !query 91 output +1 + + +-- !query 92 +SELECT IF(true, cast(1 as string), cast(2 as string)) FROM t +-- !query 92 schema +struct<(IF(true, CAST(1 AS STRING), CAST(2 AS STRING))):string> +-- !query 92 output +1 + + +-- !query 93 +SELECT IF(true, cast(1 as string), cast('2' as binary)) FROM t +-- !query 93 schema +struct<> +-- !query 93 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS STRING), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS STRING), CAST('2' AS BINARY)))' (string and binary).; line 1 pos 7 + + +-- !query 94 +SELECT IF(true, cast(1 as string), cast(2 as boolean)) FROM t +-- !query 94 schema +struct<> +-- !query 94 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS STRING), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS STRING), CAST(2 AS BOOLEAN)))' (string and boolean).; line 1 pos 7 + + +-- !query 95 +SELECT IF(true, cast(1 as string), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 95 schema +struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING))):string> +-- !query 95 output +1 + + +-- !query 96 +SELECT IF(true, cast(1 as string), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 96 schema +struct<(IF(true, CAST(1 AS STRING), CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING))):string> +-- !query 96 output +1 + + +-- !query 97 +SELECT IF(true, cast('1' as binary), cast(2 as tinyint)) FROM t +-- !query 97 schema +struct<> +-- !query 97 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS TINYINT)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS TINYINT)))' (binary and tinyint).; line 1 pos 7 + + +-- !query 98 +SELECT IF(true, cast('1' as binary), cast(2 as smallint)) FROM t +-- !query 98 schema +struct<> +-- !query 98 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS SMALLINT)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS SMALLINT)))' (binary and smallint).; line 1 pos 7 + + +-- !query 99 +SELECT IF(true, cast('1' as binary), cast(2 as int)) FROM t +-- !query 99 schema +struct<> +-- !query 99 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS INT)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS INT)))' (binary and int).; line 1 pos 7 + + +-- !query 100 +SELECT IF(true, cast('1' as binary), cast(2 as bigint)) FROM t +-- !query 100 schema +struct<> +-- !query 100 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS BIGINT)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS BIGINT)))' (binary and bigint).; line 1 pos 7 + + +-- !query 101 +SELECT IF(true, cast('1' as binary), cast(2 as float)) FROM t +-- !query 101 schema +struct<> +-- !query 101 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS FLOAT)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS FLOAT)))' (binary and float).; line 1 pos 7 + + +-- !query 102 +SELECT IF(true, cast('1' as binary), cast(2 as double)) FROM t +-- !query 102 schema +struct<> +-- !query 102 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS DOUBLE)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS DOUBLE)))' (binary and double).; line 1 pos 7 + + +-- !query 103 +SELECT IF(true, cast('1' as binary), cast(2 as decimal(10, 0))) FROM t +-- !query 103 schema +struct<> +-- !query 103 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS DECIMAL(10,0))))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 104 +SELECT IF(true, cast('1' as binary), cast(2 as string)) FROM t +-- !query 104 schema +struct<> +-- !query 104 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS STRING)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS STRING)))' (binary and string).; line 1 pos 7 + + +-- !query 105 +SELECT IF(true, cast('1' as binary), cast('2' as binary)) FROM t +-- !query 105 schema +struct<(IF(true, CAST(1 AS BINARY), CAST(2 AS BINARY))):binary> +-- !query 105 output +1 + + +-- !query 106 +SELECT IF(true, cast('1' as binary), cast(2 as boolean)) FROM t +-- !query 106 schema +struct<> +-- !query 106 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST(2 AS BOOLEAN)))' (binary and boolean).; line 1 pos 7 + + +-- !query 107 +SELECT IF(true, cast('1' as binary), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 107 schema +struct<> +-- !query 107 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (binary and timestamp).; line 1 pos 7 + + +-- !query 108 +SELECT IF(true, cast('1' as binary), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 108 schema +struct<> +-- !query 108 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('1' AS BINARY), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST('1' AS BINARY), CAST('2017-12-11 09:30:00' AS DATE)))' (binary and date).; line 1 pos 7 + + +-- !query 109 +SELECT IF(true, cast(1 as boolean), cast(2 as tinyint)) FROM t +-- !query 109 schema +struct<> +-- !query 109 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS TINYINT)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS TINYINT)))' (boolean and tinyint).; line 1 pos 7 + + +-- !query 110 +SELECT IF(true, cast(1 as boolean), cast(2 as smallint)) FROM t +-- !query 110 schema +struct<> +-- !query 110 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS SMALLINT)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS SMALLINT)))' (boolean and smallint).; line 1 pos 7 + + +-- !query 111 +SELECT IF(true, cast(1 as boolean), cast(2 as int)) FROM t +-- !query 111 schema +struct<> +-- !query 111 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS INT)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS INT)))' (boolean and int).; line 1 pos 7 + + +-- !query 112 +SELECT IF(true, cast(1 as boolean), cast(2 as bigint)) FROM t +-- !query 112 schema +struct<> +-- !query 112 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS BIGINT)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS BIGINT)))' (boolean and bigint).; line 1 pos 7 + + +-- !query 113 +SELECT IF(true, cast(1 as boolean), cast(2 as float)) FROM t +-- !query 113 schema +struct<> +-- !query 113 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS FLOAT)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS 
FLOAT)))' (boolean and float).; line 1 pos 7 + + +-- !query 114 +SELECT IF(true, cast(1 as boolean), cast(2 as double)) FROM t +-- !query 114 schema +struct<> +-- !query 114 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS DOUBLE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS DOUBLE)))' (boolean and double).; line 1 pos 7 + + +-- !query 115 +SELECT IF(true, cast(1 as boolean), cast(2 as decimal(10, 0))) FROM t +-- !query 115 schema +struct<> +-- !query 115 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS DECIMAL(10,0))))' (boolean and decimal(10,0)).; line 1 pos 7 + + +-- !query 116 +SELECT IF(true, cast(1 as boolean), cast(2 as string)) FROM t +-- !query 116 schema +struct<> +-- !query 116 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS STRING)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS STRING)))' (boolean and string).; line 1 pos 7 + + +-- !query 117 +SELECT IF(true, cast(1 as boolean), cast('2' as binary)) FROM t +-- !query 117 schema +struct<> +-- !query 117 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST('2' AS BINARY)))' (boolean and binary).; line 1 pos 7 + + +-- !query 118 +SELECT IF(true, cast(1 as boolean), cast(2 as boolean)) FROM t +-- !query 118 schema +struct<(IF(true, CAST(1 AS BOOLEAN), CAST(2 AS BOOLEAN))):boolean> +-- !query 118 output +true + + +-- !query 119 +SELECT IF(true, cast(1 as boolean), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 119 schema +struct<> +-- !query 119 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' (boolean and timestamp).; line 1 pos 7 + + +-- !query 120 +SELECT IF(true, cast(1 as boolean), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 120 schema +struct<> +-- !query 120 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST(1 AS BOOLEAN), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: differing types in '(IF(true, CAST(1 AS BOOLEAN), CAST('2017-12-11 09:30:00' AS DATE)))' (boolean and date).; line 1 pos 7 + + +-- !query 121 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as tinyint)) FROM t +-- !query 121 schema +struct<> +-- !query 121 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS TINYINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS TINYINT)))' (timestamp and tinyint).; line 1 pos 7 + + +-- !query 122 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as smallint)) FROM t +-- !query 122 schema +struct<> +-- !query 122 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS SMALLINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS SMALLINT)))' 
(timestamp and smallint).; line 1 pos 7 + + +-- !query 123 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as int)) FROM t +-- !query 123 schema +struct<> +-- !query 123 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS INT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS INT)))' (timestamp and int).; line 1 pos 7 + + +-- !query 124 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as bigint)) FROM t +-- !query 124 schema +struct<> +-- !query 124 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS BIGINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS BIGINT)))' (timestamp and bigint).; line 1 pos 7 + + +-- !query 125 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as float)) FROM t +-- !query 125 schema +struct<> +-- !query 125 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS FLOAT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS FLOAT)))' (timestamp and float).; line 1 pos 7 + + +-- !query 126 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as double)) FROM t +-- !query 126 schema +struct<> +-- !query 126 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS DOUBLE)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS DOUBLE)))' (timestamp and double).; line 1 pos 7 + + +-- !query 127 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as decimal(10, 0))) FROM t +-- !query 127 schema +struct<> +-- !query 127 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS DECIMAL(10,0))))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 128 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as string)) FROM t +-- !query 128 schema +struct<(IF(true, CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS STRING), CAST(2 AS STRING))):string> +-- !query 128 output +2017-12-12 09:30:00 + + +-- !query 129 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast('2' as binary)) FROM t +-- !query 129 schema +struct<> +-- !query 129 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST('2' AS BINARY)))' (timestamp and binary).; line 1 pos 7 + + +-- !query 130 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast(2 as boolean)) FROM t +-- !query 130 schema +struct<> +-- !query 130 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(2 AS BOOLEAN)))' (timestamp and boolean).; line 1 pos 7 + + +-- !query 131 +SELECT IF(true, cast('2017-12-12 
09:30:00.0' as timestamp), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 131 schema +struct<(IF(true, CAST(2017-12-12 09:30:00.0 AS TIMESTAMP), CAST(2017-12-11 09:30:00.0 AS TIMESTAMP))):timestamp> +-- !query 131 output +2017-12-12 09:30:00 + + +-- !query 132 +SELECT IF(true, cast('2017-12-12 09:30:00.0' as timestamp), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 132 schema +struct<(IF(true, CAST(2017-12-12 09:30:00.0 AS TIMESTAMP), CAST(CAST(2017-12-11 09:30:00 AS DATE) AS TIMESTAMP))):timestamp> +-- !query 132 output +2017-12-12 09:30:00 + + +-- !query 133 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as tinyint)) FROM t +-- !query 133 schema +struct<> +-- !query 133 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS TINYINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS TINYINT)))' (date and tinyint).; line 1 pos 7 + + +-- !query 134 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as smallint)) FROM t +-- !query 134 schema +struct<> +-- !query 134 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS SMALLINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS SMALLINT)))' (date and smallint).; line 1 pos 7 + + +-- !query 135 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as int)) FROM t +-- !query 135 schema +struct<> +-- !query 135 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS INT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS INT)))' (date and int).; line 1 pos 7 + + +-- !query 136 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as bigint)) FROM t +-- !query 136 schema +struct<> +-- !query 136 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS BIGINT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS BIGINT)))' (date and bigint).; line 1 pos 7 + + +-- !query 137 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as float)) FROM t +-- !query 137 schema +struct<> +-- !query 137 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS FLOAT)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS FLOAT)))' (date and float).; line 1 pos 7 + + +-- !query 138 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as double)) FROM t +-- !query 138 schema +struct<> +-- !query 138 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS DOUBLE)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS DOUBLE)))' (date and double).; line 1 pos 7 + + +-- !query 139 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as decimal(10, 0))) FROM t +-- !query 139 schema +struct<> +-- !query 139 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS DECIMAL(10,0))))' (date and 
decimal(10,0)).; line 1 pos 7 + + +-- !query 140 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as string)) FROM t +-- !query 140 schema +struct<(IF(true, CAST(CAST(2017-12-12 09:30:00 AS DATE) AS STRING), CAST(2 AS STRING))):string> +-- !query 140 output +2017-12-12 + + +-- !query 141 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast('2' as binary)) FROM t +-- !query 141 schema +struct<> +-- !query 141 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST('2' AS BINARY)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST('2' AS BINARY)))' (date and binary).; line 1 pos 7 + + +-- !query 142 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast(2 as boolean)) FROM t +-- !query 142 schema +struct<> +-- !query 142 output +org.apache.spark.sql.AnalysisException +cannot resolve '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS BOOLEAN)))' due to data type mismatch: differing types in '(IF(true, CAST('2017-12-12 09:30:00' AS DATE), CAST(2 AS BOOLEAN)))' (date and boolean).; line 1 pos 7 + + +-- !query 143 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 143 schema +struct<(IF(true, CAST(CAST(2017-12-12 09:30:00 AS DATE) AS TIMESTAMP), CAST(2017-12-11 09:30:00.0 AS TIMESTAMP))):timestamp> +-- !query 143 output +2017-12-12 00:00:00 + + +-- !query 144 +SELECT IF(true, cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 144 schema +struct<(IF(true, CAST(2017-12-12 09:30:00 AS DATE), CAST(2017-12-11 09:30:00 AS DATE))):date> +-- !query 144 output +2017-12-12 From 0c8fca4608643ed9e1eb3ae8620e6f4f6a017a87 Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Sat, 16 Dec 2017 10:57:35 +0900 Subject: [PATCH 132/356] [SPARK-22811][PYSPARK][ML] Fix pyspark.ml.tests failure when Hive is not available. ## What changes were proposed in this pull request? pyspark.ml.tests is missing a py4j import. I've added the import and fixed the test that uses it. This test was only failing when testing without Hive. ## How was this patch tested? Existing tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Bago Amirbekian Closes #19997 from MrBago/fix-ImageReaderTest2. --- python/pyspark/ml/tests.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 3a0b816c367ec..be1521154f042 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -44,6 +44,7 @@ import numpy as np from numpy import abs, all, arange, array, array_equal, inf, ones, tile, zeros import inspect +import py4j from pyspark import keyword_only, SparkContext from pyspark.ml import Estimator, Model, Pipeline, PipelineModel, Transformer, UnaryTransformer @@ -1859,8 +1860,9 @@ class ImageReaderTest2(PySparkTestCase): @classmethod def setUpClass(cls): - PySparkTestCase.setUpClass() + super(ImageReaderTest2, cls).setUpClass() # Note that here we enable Hive's support. 
+ cls.spark = None try: cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf() except py4j.protocol.Py4JError: @@ -1873,8 +1875,10 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - PySparkTestCase.tearDownClass() - cls.spark.sparkSession.stop() + super(ImageReaderTest2, cls).tearDownClass() + if cls.spark is not None: + cls.spark.sparkSession.stop() + cls.spark = None def test_read_images_multiple_times(self): # This test case is to check if `ImageSchema.readImages` tries to From c2aeddf9eae2f8f72c244a4b16af264362d6cf5d Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 17 Dec 2017 14:40:41 +0900 Subject: [PATCH 133/356] [SPARK-22817][R] Use fixed testthat version for SparkR tests in AppVeyor ## What changes were proposed in this pull request? `testthat` 2.0.0 has been released and AppVeyor has started to use it instead of 1.0.2. As a result, the SparkR tests started failing in AppVeyor. See - https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark/build/1967-master ``` Error in get(name, envir = asNamespace(pkg), inherits = FALSE) : object 'run_tests' not found Calls: ::: -> get ``` This seems to be because we rely on the internal `testthat:::run_tests` here: https://github.com/r-lib/testthat/blob/v1.0.2/R/test-package.R#L62-L75 https://github.com/apache/spark/blob/dc4c351837879dab26ad8fb471dc51c06832a9e4/R/pkg/tests/run-all.R#L49-L52 However, it appears to have been removed in 2.0.0. I tried a few other exposed APIs such as `test_dir`, but could not come up with a good compatible fix. It seems better to pin the `testthat` version first so that the build passes. ## How was this patch tested? Manually tested and AppVeyor tests. Author: hyukjinkwon Closes #20003 from HyukjinKwon/SPARK-22817. --- appveyor.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 48740920cd09b..aee94c59612d2 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -42,7 +42,9 @@ install: # Install maven and dependencies - ps: .\dev\appveyor-install-dependencies.ps1 # Required package for R unit tests - - cmd: R -e "install.packages(c('knitr', 'rmarkdown', 'testthat', 'e1071', 'survival'), repos='http://cran.us.r-project.org')" + - cmd: R -e "install.packages(c('knitr', 'rmarkdown', 'devtools', 'e1071', 'survival'), repos='http://cran.us.r-project.org')" + # Here, we use the fixed version of testthat. For more details, please see SPARK-22817. + - cmd: R -e "devtools::install_version('testthat', version = '1.0.2', repos='http://cran.us.r-project.org')" - cmd: R -e "packageVersion('knitr'); packageVersion('rmarkdown'); packageVersion('testthat'); packageVersion('e1071'); packageVersion('survival')" build_script: From 77988a9d0d553f26034f7206e5e6314acab2dec5 Mon Sep 17 00:00:00 2001 From: Mahmut CAVDAR Date: Sun, 17 Dec 2017 10:52:01 -0600 Subject: [PATCH 134/356] [MINOR][DOC] Fix the link of 'Getting Started' ## What changes were proposed in this pull request? An easy fix to the links. ## How was this patch tested? Tested manually Author: Mahmut CAVDAR Closes #19996 from mcavdar/master.
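For readers following the ImageReaderTest2 change in SPARK-22811 above: the core of the fix is a guard pattern for optional test dependencies: initialize the handle to None before probing for the backend, skip the whole suite if the probe fails, and make tearDownClass tolerant of the handle never having been created. The sketch below reduces that pattern to plain `unittest`; it is illustrative only and not part of any patch here, and `optional_dependency`, its `connect()` call, and `OptionalBackendTest` are hypothetical stand-ins for the py4j `HiveConf` probe and the Hive-backed `SparkSession`.

```python
import unittest


class OptionalBackendTest(unittest.TestCase):
    """Skips itself cleanly when an optional backend cannot be set up."""

    @classmethod
    def setUpClass(cls):
        super(OptionalBackendTest, cls).setUpClass()
        # Set the handle first so tearDownClass is safe even if we skip below.
        cls.backend = None
        try:
            # Probe for the optional dependency (hypothetical; stands in for
            # cls.sc._jvm.org.apache.hadoop.hive.conf.HiveConf() in the patch).
            import optional_dependency
        except ImportError:
            cls.tearDownClass()
            raise unittest.SkipTest("optional backend is not available")
        cls.backend = optional_dependency.connect()

    @classmethod
    def tearDownClass(cls):
        super(OptionalBackendTest, cls).tearDownClass()
        if cls.backend is not None:  # tolerate the skip path above
            cls.backend.close()
            cls.backend = None

    def test_backend_is_usable(self):
        self.assertIsNotNone(self.backend)


if __name__ == "__main__":
    unittest.main()
```

Keeping tearDownClass both super-calling and None-safe is what lets the skip path in setUpClass invoke it early without tripping over an attribute that was never set, which is exactly what the `if cls.spark is not None:` check in the patch achieves.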
--- docs/mllib-decision-tree.md | 2 +- docs/running-on-mesos.md | 4 ++-- docs/spark-standalone.md | 2 +- docs/sql-programming-guide.md | 1 + docs/tuning.md | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md index 0e753b8dd04a2..ec13b81f85557 100644 --- a/docs/mllib-decision-tree.md +++ b/docs/mllib-decision-tree.md @@ -91,7 +91,7 @@ For a categorical feature with `$M$` possible values (categories), one could com `$2^{M-1}-1$` split candidates. For binary (0/1) classification and regression, we can reduce the number of split candidates to `$M-1$` by ordering the categorical feature values by the average label. (See Section 9.2.4 in -[Elements of Statistical Machine Learning](http://statweb.stanford.edu/~tibs/ElemStatLearn/) for +[Elements of Statistical Machine Learning](https://web.stanford.edu/~hastie/ElemStatLearn/) for details.) For example, for a binary classification problem with one categorical feature with three categories A, B and C whose corresponding proportions of label 1 are 0.2, 0.6 and 0.4, the categorical features are ordered as A, C, B. The two split candidates are A \| C, B diff --git a/docs/running-on-mesos.md b/docs/running-on-mesos.md index 19ec7c1e0aeee..382cbfd5301b0 100644 --- a/docs/running-on-mesos.md +++ b/docs/running-on-mesos.md @@ -47,7 +47,7 @@ To install Apache Mesos from source, follow these steps: 1. Download a Mesos release from a [mirror](http://www.apache.org/dyn/closer.lua/mesos/{{site.MESOS_VERSION}}/) -2. Follow the Mesos [Getting Started](http://mesos.apache.org/gettingstarted) page for compiling and +2. Follow the Mesos [Getting Started](http://mesos.apache.org/getting-started) page for compiling and installing Mesos **Note:** If you want to run Mesos without installing it into the default paths on your system @@ -159,7 +159,7 @@ By setting the Mesos proxy config property (requires mesos version >= 1.4), `--c If you like to run the `MesosClusterDispatcher` with Marathon, you need to run the `MesosClusterDispatcher` in the foreground (i.e: `bin/spark-class org.apache.spark.deploy.mesos.MesosClusterDispatcher`). Note that the `MesosClusterDispatcher` not yet supports multiple instances for HA. The `MesosClusterDispatcher` also supports writing recovery state into Zookeeper. This will allow the `MesosClusterDispatcher` to be able to recover all submitted and running containers on relaunch. In order to enable this recovery mode, you can set SPARK_DAEMON_JAVA_OPTS in spark-env by configuring `spark.deploy.recoveryMode` and related spark.deploy.zookeeper.* configurations. -For more information about these configurations please refer to the configurations [doc](configurations.html#deploy). +For more information about these configurations please refer to the configurations [doc](configuration.html#deploy). You can also specify any additional jars required by the `MesosClusterDispatcher` in the classpath by setting the environment variable SPARK_DAEMON_CLASSPATH in spark-env. diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index f51c5cc38f4de..8fa643abf1373 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -364,7 +364,7 @@ By default, standalone scheduling clusters are resilient to Worker failures (ins Utilizing ZooKeeper to provide leader election and some state storage, you can launch multiple Masters in your cluster connected to the same ZooKeeper instance. One will be elected "leader" and the others will remain in standby mode. 
If the current leader dies, another Master will be elected, recover the old Master's state, and then resume scheduling. The entire recovery process (from the time the first leader goes down) should take between 1 and 2 minutes. Note that this delay only affects scheduling _new_ applications -- applications that were already running during Master failover are unaffected. -Learn more about getting started with ZooKeeper [here](http://zookeeper.apache.org/doc/trunk/zookeeperStarted.html). +Learn more about getting started with ZooKeeper [here](http://zookeeper.apache.org/doc/current/zookeeperStarted.html). **Configuration** diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index b76be9132dd03..f02f46236e2b0 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -501,6 +501,7 @@ To load a CSV file you can use: + ### Run SQL on files directly Instead of using read API to load a file into DataFrame and query it, you can also query that diff --git a/docs/tuning.md b/docs/tuning.md index 7d5f97a02fe6e..fc27713f28d46 100644 --- a/docs/tuning.md +++ b/docs/tuning.md @@ -219,7 +219,7 @@ temporary objects created during task execution. Some steps which may be useful * Try the G1GC garbage collector with `-XX:+UseG1GC`. It can improve performance in some situations where garbage collection is a bottleneck. Note that with large executor heap sizes, it may be important to - increase the [G1 region size](https://blogs.oracle.com/g1gc/entry/g1_gc_tuning_a_case) + increase the [G1 region size](http://www.oracle.com/technetwork/articles/java/g1gc-1984535.html) with `-XX:G1HeapRegionSize` * As an example, if your task is reading data from HDFS, the amount of memory used by the task can be estimated using From 7f6d10a7376594cf7e5225a05fa9e58b2011ad3d Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sun, 17 Dec 2017 09:15:10 -0800 Subject: [PATCH 135/356] [SPARK-22816][TEST] Basic tests for PromoteStrings and InConversion ## What changes were proposed in this pull request? Test Coverage for `PromoteStrings` and `InConversion`, this is a Sub-tasks for [SPARK-22722](https://issues.apache.org/jira/browse/SPARK-22722). ## How was this patch tested? N/A Author: Yuming Wang Closes #20001 from wangyum/SPARK-22816. --- .../typeCoercion/native/inConversion.sql | 330 +++ .../typeCoercion/native/promoteStrings.sql | 364 +++ .../typeCoercion/native/inConversion.sql.out | 2454 ++++++++++++++++ .../native/promoteStrings.sql.out | 2578 +++++++++++++++++ 4 files changed, 5726 insertions(+) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/inConversion.sql create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/promoteStrings.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/inConversion.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/inConversion.sql new file mode 100644 index 0000000000000..39dbe7268fba0 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/inConversion.sql @@ -0,0 +1,330 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. 
+-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +CREATE TEMPORARY VIEW t AS SELECT 1; + +SELECT cast(1 as tinyint) in (cast(1 as tinyint)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as smallint)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as int)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as bigint)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as float)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as double)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as string)) FROM t; +SELECT cast(1 as tinyint) in (cast('1' as binary)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as boolean)) FROM t; +SELECT cast(1 as tinyint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as tinyint) in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as smallint) in (cast(1 as tinyint)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as int)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as bigint)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as float)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as double)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as smallint) in (cast(1 as string)) FROM t; +SELECT cast(1 as smallint) in (cast('1' as binary)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as boolean)) FROM t; +SELECT cast(1 as smallint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as smallint) in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as int) in (cast(1 as tinyint)) FROM t; +SELECT cast(1 as int) in (cast(1 as smallint)) FROM t; +SELECT cast(1 as int) in (cast(1 as int)) FROM t; +SELECT cast(1 as int) in (cast(1 as bigint)) FROM t; +SELECT cast(1 as int) in (cast(1 as float)) FROM t; +SELECT cast(1 as int) in (cast(1 as double)) FROM t; +SELECT cast(1 as int) in (cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as int) in (cast(1 as string)) FROM t; +SELECT cast(1 as int) in (cast('1' as binary)) FROM t; +SELECT cast(1 as int) in (cast(1 as boolean)) FROM t; +SELECT cast(1 as int) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as int) in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as bigint) in (cast(1 as tinyint)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as smallint)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as int)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as float)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as double)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as bigint) in (cast(1 as string)) FROM t; +SELECT cast(1 as bigint) in (cast('1' as binary)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as boolean)) FROM t; +SELECT cast(1 as bigint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as bigint) in 
(cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as float) in (cast(1 as tinyint)) FROM t; +SELECT cast(1 as float) in (cast(1 as smallint)) FROM t; +SELECT cast(1 as float) in (cast(1 as int)) FROM t; +SELECT cast(1 as float) in (cast(1 as bigint)) FROM t; +SELECT cast(1 as float) in (cast(1 as float)) FROM t; +SELECT cast(1 as float) in (cast(1 as double)) FROM t; +SELECT cast(1 as float) in (cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as float) in (cast(1 as string)) FROM t; +SELECT cast(1 as float) in (cast('1' as binary)) FROM t; +SELECT cast(1 as float) in (cast(1 as boolean)) FROM t; +SELECT cast(1 as float) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as float) in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as double) in (cast(1 as tinyint)) FROM t; +SELECT cast(1 as double) in (cast(1 as smallint)) FROM t; +SELECT cast(1 as double) in (cast(1 as int)) FROM t; +SELECT cast(1 as double) in (cast(1 as bigint)) FROM t; +SELECT cast(1 as double) in (cast(1 as float)) FROM t; +SELECT cast(1 as double) in (cast(1 as double)) FROM t; +SELECT cast(1 as double) in (cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as double) in (cast(1 as string)) FROM t; +SELECT cast(1 as double) in (cast('1' as binary)) FROM t; +SELECT cast(1 as double) in (cast(1 as boolean)) FROM t; +SELECT cast(1 as double) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as double) in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as decimal(10, 0)) in (cast(1 as tinyint)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as smallint)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as int)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as bigint)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as float)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as double)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as string)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast('1' as binary)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as boolean)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as string) in (cast(1 as tinyint)) FROM t; +SELECT cast(1 as string) in (cast(1 as smallint)) FROM t; +SELECT cast(1 as string) in (cast(1 as int)) FROM t; +SELECT cast(1 as string) in (cast(1 as bigint)) FROM t; +SELECT cast(1 as string) in (cast(1 as float)) FROM t; +SELECT cast(1 as string) in (cast(1 as double)) FROM t; +SELECT cast(1 as string) in (cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as string) in (cast(1 as string)) FROM t; +SELECT cast(1 as string) in (cast('1' as binary)) FROM t; +SELECT cast(1 as string) in (cast(1 as boolean)) FROM t; +SELECT cast(1 as string) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as string) in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast('1' as binary) in (cast(1 as tinyint)) FROM t; +SELECT cast('1' as binary) in (cast(1 as smallint)) FROM t; +SELECT cast('1' as binary) in (cast(1 as int)) FROM t; +SELECT cast('1' as binary) in (cast(1 as bigint)) FROM t; +SELECT cast('1' as binary) in (cast(1 as float)) FROM t; +SELECT cast('1' as binary) in (cast(1 as double)) FROM t; +SELECT cast('1' as binary) in (cast(1 as decimal(10, 0))) FROM t; +SELECT cast('1' as binary) in (cast(1 as 
string)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary)) FROM t; +SELECT cast('1' as binary) in (cast(1 as boolean)) FROM t; +SELECT cast('1' as binary) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast('1' as binary) in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT true in (cast(1 as tinyint)) FROM t; +SELECT true in (cast(1 as smallint)) FROM t; +SELECT true in (cast(1 as int)) FROM t; +SELECT true in (cast(1 as bigint)) FROM t; +SELECT true in (cast(1 as float)) FROM t; +SELECT true in (cast(1 as double)) FROM t; +SELECT true in (cast(1 as decimal(10, 0))) FROM t; +SELECT true in (cast(1 as string)) FROM t; +SELECT true in (cast('1' as binary)) FROM t; +SELECT true in (cast(1 as boolean)) FROM t; +SELECT true in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT true in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as tinyint)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as smallint)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as int)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as bigint)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as float)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as double)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as decimal(10, 0))) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as string)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2' as binary)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as boolean)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as tinyint)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as smallint)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as int)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as bigint)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as float)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as double)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as decimal(10, 0))) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as string)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2' as binary)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as boolean)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as tinyint)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as smallint)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as int)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as bigint)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as float)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as double)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as string)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as 
tinyint), cast('1' as binary)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as boolean)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as tinyint)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as smallint)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as int)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as bigint)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as float)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as double)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as string)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast('1' as binary)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as boolean)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as smallint) in (cast(1 as smallint), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as int) in (cast(1 as int), cast(1 as tinyint)) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast(1 as smallint)) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast(1 as int)) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast(1 as bigint)) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast(1 as float)) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast(1 as double)) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast(1 as string)) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast('1' as binary)) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast(1 as boolean)) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as int) in (cast(1 as int), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as tinyint)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as smallint)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as int)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as bigint)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as float)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as double)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as string)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast('1' as binary)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as boolean)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as bigint) in (cast(1 as bigint), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as float) in (cast(1 as float), cast(1 as tinyint)) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast(1 as smallint)) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast(1 as int)) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast(1 as bigint)) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast(1 as 
float)) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast(1 as double)) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast(1 as string)) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast('1' as binary)) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast(1 as boolean)) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as float) in (cast(1 as float), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as double) in (cast(1 as double), cast(1 as tinyint)) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast(1 as smallint)) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast(1 as int)) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast(1 as bigint)) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast(1 as float)) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast(1 as double)) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast(1 as string)) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast('1' as binary)) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast(1 as boolean)) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as double) in (cast(1 as double), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as tinyint)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as smallint)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as int)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as bigint)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as float)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as double)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as string)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('1' as binary)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as boolean)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as string) in (cast(1 as string), cast(1 as tinyint)) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast(1 as smallint)) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast(1 as int)) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast(1 as bigint)) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast(1 as float)) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast(1 as double)) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast(1 as decimal(10, 0))) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast(1 as string)) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast('1' as binary)) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast(1 as boolean)) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast('2017-12-11 
09:30:00.0' as timestamp)) FROM t; +SELECT cast(1 as string) in (cast(1 as string), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as tinyint)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as smallint)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as int)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as bigint)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as float)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as double)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as decimal(10, 0))) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as string)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast('1' as binary)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as boolean)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast('1' as binary) in (cast('1' as binary), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as tinyint)) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as smallint)) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as int)) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as bigint)) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as float)) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as double)) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as decimal(10, 0))) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as string)) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast('1' as binary)) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as boolean)) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast('1' as boolean) in (cast('1' as boolean), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as tinyint)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as smallint)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as int)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as bigint)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as float)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as double)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as decimal(10, 0))) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as string)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast('1' as binary)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as boolean)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), 
cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as tinyint)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as smallint)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as int)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as bigint)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as float)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as double)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as decimal(10, 0))) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as string)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast('1' as binary)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as boolean)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00' as date)) FROM t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/promoteStrings.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/promoteStrings.sql new file mode 100644 index 0000000000000..a5603a184578d --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/promoteStrings.sql @@ -0,0 +1,364 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +CREATE TEMPORARY VIEW t AS SELECT 1; + +-- Binary arithmetic +SELECT '1' + cast(1 as tinyint) FROM t; +SELECT '1' + cast(1 as smallint) FROM t; +SELECT '1' + cast(1 as int) FROM t; +SELECT '1' + cast(1 as bigint) FROM t; +SELECT '1' + cast(1 as float) FROM t; +SELECT '1' + cast(1 as double) FROM t; +SELECT '1' + cast(1 as decimal(10, 0)) FROM t; +SELECT '1' + '1' FROM t; +SELECT '1' + cast('1' as binary) FROM t; +SELECT '1' + cast(1 as boolean) FROM t; +SELECT '1' + cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' + cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT '1' - cast(1 as tinyint) FROM t; +SELECT '1' - cast(1 as smallint) FROM t; +SELECT '1' - cast(1 as int) FROM t; +SELECT '1' - cast(1 as bigint) FROM t; +SELECT '1' - cast(1 as float) FROM t; +SELECT '1' - cast(1 as double) FROM t; +SELECT '1' - cast(1 as decimal(10, 0)) FROM t; +SELECT '1' - '1' FROM t; +SELECT '1' - cast('1' as binary) FROM t; +SELECT '1' - cast(1 as boolean) FROM t; +SELECT '1' - cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' - cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT '1' * cast(1 as tinyint) FROM t; +SELECT '1' * cast(1 as smallint) FROM t; +SELECT '1' * cast(1 as int) FROM t; +SELECT '1' * cast(1 as bigint) FROM t; +SELECT '1' * cast(1 as float) FROM t; +SELECT '1' * cast(1 as double) FROM t; +SELECT '1' * cast(1 as decimal(10, 0)) FROM t; +SELECT '1' * '1' FROM t; +SELECT '1' * cast('1' as binary) FROM t; +SELECT '1' * cast(1 as boolean) FROM t; +SELECT '1' * cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' * cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT '1' / cast(1 as tinyint) FROM t; +SELECT '1' / cast(1 as smallint) FROM t; +SELECT '1' / cast(1 as int) FROM t; +SELECT '1' / cast(1 as bigint) FROM t; +SELECT '1' / cast(1 as float) FROM t; +SELECT '1' / cast(1 as double) FROM t; +SELECT '1' / cast(1 as decimal(10, 0)) FROM t; +SELECT '1' / '1' FROM t; +SELECT '1' / cast('1' as binary) FROM t; +SELECT '1' / cast(1 as boolean) FROM t; +SELECT '1' / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT '1' % cast(1 as tinyint) FROM t; +SELECT '1' % cast(1 as smallint) FROM t; +SELECT '1' % cast(1 as int) FROM t; +SELECT '1' % cast(1 as bigint) FROM t; +SELECT '1' % cast(1 as float) FROM t; +SELECT '1' % cast(1 as double) FROM t; +SELECT '1' % cast(1 as decimal(10, 0)) FROM t; +SELECT '1' % '1' FROM t; +SELECT '1' % cast('1' as binary) FROM t; +SELECT '1' % cast(1 as boolean) FROM t; +SELECT '1' % cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' % cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT pmod('1', cast(1 as tinyint)) FROM t; +SELECT pmod('1', cast(1 as smallint)) FROM t; +SELECT pmod('1', cast(1 as int)) FROM t; +SELECT pmod('1', cast(1 as bigint)) FROM t; +SELECT pmod('1', cast(1 as float)) FROM t; +SELECT pmod('1', cast(1 as double)) FROM t; +SELECT pmod('1', cast(1 as decimal(10, 0))) FROM t; +SELECT pmod('1', '1') FROM t; +SELECT pmod('1', cast('1' as binary)) FROM t; +SELECT pmod('1', cast(1 as boolean)) FROM t; +SELECT pmod('1', cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT pmod('1', cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as tinyint) + '1' FROM t; +SELECT cast(1 as smallint) + '1' FROM t; +SELECT cast(1 as int) + '1' FROM t; +SELECT cast(1 as bigint) + '1' FROM t; +SELECT cast(1 as float) + '1' FROM t; +SELECT cast(1 as double) + '1' FROM t; +SELECT cast(1 as decimal(10, 0)) + '1' FROM t; +SELECT 
cast('1' as binary) + '1' FROM t; +SELECT cast(1 as boolean) + '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) + '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) + '1' FROM t; + +SELECT cast(1 as tinyint) - '1' FROM t; +SELECT cast(1 as smallint) - '1' FROM t; +SELECT cast(1 as int) - '1' FROM t; +SELECT cast(1 as bigint) - '1' FROM t; +SELECT cast(1 as float) - '1' FROM t; +SELECT cast(1 as double) - '1' FROM t; +SELECT cast(1 as decimal(10, 0)) - '1' FROM t; +SELECT cast('1' as binary) - '1' FROM t; +SELECT cast(1 as boolean) - '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) - '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) - '1' FROM t; + +SELECT cast(1 as tinyint) * '1' FROM t; +SELECT cast(1 as smallint) * '1' FROM t; +SELECT cast(1 as int) * '1' FROM t; +SELECT cast(1 as bigint) * '1' FROM t; +SELECT cast(1 as float) * '1' FROM t; +SELECT cast(1 as double) * '1' FROM t; +SELECT cast(1 as decimal(10, 0)) * '1' FROM t; +SELECT cast('1' as binary) * '1' FROM t; +SELECT cast(1 as boolean) * '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) * '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) * '1' FROM t; + +SELECT cast(1 as tinyint) / '1' FROM t; +SELECT cast(1 as smallint) / '1' FROM t; +SELECT cast(1 as int) / '1' FROM t; +SELECT cast(1 as bigint) / '1' FROM t; +SELECT cast(1 as float) / '1' FROM t; +SELECT cast(1 as double) / '1' FROM t; +SELECT cast(1 as decimal(10, 0)) / '1' FROM t; +SELECT cast('1' as binary) / '1' FROM t; +SELECT cast(1 as boolean) / '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / '1' FROM t; + +SELECT cast(1 as tinyint) % '1' FROM t; +SELECT cast(1 as smallint) % '1' FROM t; +SELECT cast(1 as int) % '1' FROM t; +SELECT cast(1 as bigint) % '1' FROM t; +SELECT cast(1 as float) % '1' FROM t; +SELECT cast(1 as double) % '1' FROM t; +SELECT cast(1 as decimal(10, 0)) % '1' FROM t; +SELECT cast('1' as binary) % '1' FROM t; +SELECT cast(1 as boolean) % '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) % '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) % '1' FROM t; + +SELECT pmod(cast(1 as tinyint), '1') FROM t; +SELECT pmod(cast(1 as smallint), '1') FROM t; +SELECT pmod(cast(1 as int), '1') FROM t; +SELECT pmod(cast(1 as bigint), '1') FROM t; +SELECT pmod(cast(1 as float), '1') FROM t; +SELECT pmod(cast(1 as double), '1') FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), '1') FROM t; +SELECT pmod(cast('1' as binary), '1') FROM t; +SELECT pmod(cast(1 as boolean), '1') FROM t; +SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), '1') FROM t; +SELECT pmod(cast('2017-12-11 09:30:00' as date), '1') FROM t; + +-- Equality +SELECT '1' = cast(1 as tinyint) FROM t; +SELECT '1' = cast(1 as smallint) FROM t; +SELECT '1' = cast(1 as int) FROM t; +SELECT '1' = cast(1 as bigint) FROM t; +SELECT '1' = cast(1 as float) FROM t; +SELECT '1' = cast(1 as double) FROM t; +SELECT '1' = cast(1 as decimal(10, 0)) FROM t; +SELECT '1' = '1' FROM t; +SELECT '1' = cast('1' as binary) FROM t; +SELECT '1' = cast(1 as boolean) FROM t; +SELECT '1' = cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' = cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) = '1' FROM t; +SELECT cast(1 as smallint) = '1' FROM t; +SELECT cast(1 as int) = '1' FROM t; +SELECT cast(1 as bigint) = '1' FROM t; +SELECT cast(1 as float) = '1' FROM t; +SELECT cast(1 as double) = '1' FROM t; +SELECT cast(1 as decimal(10, 0)) = '1' FROM t; 
+SELECT cast('1' as binary) = '1' FROM t; +SELECT cast(1 as boolean) = '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) = '1' FROM t; + +SELECT '1' <=> cast(1 as tinyint) FROM t; +SELECT '1' <=> cast(1 as smallint) FROM t; +SELECT '1' <=> cast(1 as int) FROM t; +SELECT '1' <=> cast(1 as bigint) FROM t; +SELECT '1' <=> cast(1 as float) FROM t; +SELECT '1' <=> cast(1 as double) FROM t; +SELECT '1' <=> cast(1 as decimal(10, 0)) FROM t; +SELECT '1' <=> '1' FROM t; +SELECT '1' <=> cast('1' as binary) FROM t; +SELECT '1' <=> cast(1 as boolean) FROM t; +SELECT '1' <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' <=> cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) <=> '1' FROM t; +SELECT cast(1 as smallint) <=> '1' FROM t; +SELECT cast(1 as int) <=> '1' FROM t; +SELECT cast(1 as bigint) <=> '1' FROM t; +SELECT cast(1 as float) <=> '1' FROM t; +SELECT cast(1 as double) <=> '1' FROM t; +SELECT cast(1 as decimal(10, 0)) <=> '1' FROM t; +SELECT cast('1' as binary) <=> '1' FROM t; +SELECT cast(1 as boolean) <=> '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <=> '1' FROM t; + +-- Binary comparison +SELECT '1' < cast(1 as tinyint) FROM t; +SELECT '1' < cast(1 as smallint) FROM t; +SELECT '1' < cast(1 as int) FROM t; +SELECT '1' < cast(1 as bigint) FROM t; +SELECT '1' < cast(1 as float) FROM t; +SELECT '1' < cast(1 as double) FROM t; +SELECT '1' < cast(1 as decimal(10, 0)) FROM t; +SELECT '1' < '1' FROM t; +SELECT '1' < cast('1' as binary) FROM t; +SELECT '1' < cast(1 as boolean) FROM t; +SELECT '1' < cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' < cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT '1' <= cast(1 as tinyint) FROM t; +SELECT '1' <= cast(1 as smallint) FROM t; +SELECT '1' <= cast(1 as int) FROM t; +SELECT '1' <= cast(1 as bigint) FROM t; +SELECT '1' <= cast(1 as float) FROM t; +SELECT '1' <= cast(1 as double) FROM t; +SELECT '1' <= cast(1 as decimal(10, 0)) FROM t; +SELECT '1' <= '1' FROM t; +SELECT '1' <= cast('1' as binary) FROM t; +SELECT '1' <= cast(1 as boolean) FROM t; +SELECT '1' <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' <= cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT '1' > cast(1 as tinyint) FROM t; +SELECT '1' > cast(1 as smallint) FROM t; +SELECT '1' > cast(1 as int) FROM t; +SELECT '1' > cast(1 as bigint) FROM t; +SELECT '1' > cast(1 as float) FROM t; +SELECT '1' > cast(1 as double) FROM t; +SELECT '1' > cast(1 as decimal(10, 0)) FROM t; +SELECT '1' > '1' FROM t; +SELECT '1' > cast('1' as binary) FROM t; +SELECT '1' > cast(1 as boolean) FROM t; +SELECT '1' > cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' > cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT '1' >= cast(1 as tinyint) FROM t; +SELECT '1' >= cast(1 as smallint) FROM t; +SELECT '1' >= cast(1 as int) FROM t; +SELECT '1' >= cast(1 as bigint) FROM t; +SELECT '1' >= cast(1 as float) FROM t; +SELECT '1' >= cast(1 as double) FROM t; +SELECT '1' >= cast(1 as decimal(10, 0)) FROM t; +SELECT '1' >= '1' FROM t; +SELECT '1' >= cast('1' as binary) FROM t; +SELECT '1' >= cast(1 as boolean) FROM t; +SELECT '1' >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' >= cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT '1' <> cast(1 as tinyint) FROM t; +SELECT '1' <> cast(1 as smallint) FROM t; +SELECT '1' <> cast(1 as int) FROM t; +SELECT '1' <> cast(1 as 
bigint) FROM t; +SELECT '1' <> cast(1 as float) FROM t; +SELECT '1' <> cast(1 as double) FROM t; +SELECT '1' <> cast(1 as decimal(10, 0)) FROM t; +SELECT '1' <> '1' FROM t; +SELECT '1' <> cast('1' as binary) FROM t; +SELECT '1' <> cast(1 as boolean) FROM t; +SELECT '1' <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT '1' <> cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) < '1' FROM t; +SELECT cast(1 as smallint) < '1' FROM t; +SELECT cast(1 as int) < '1' FROM t; +SELECT cast(1 as bigint) < '1' FROM t; +SELECT cast(1 as float) < '1' FROM t; +SELECT cast(1 as double) < '1' FROM t; +SELECT cast(1 as decimal(10, 0)) < '1' FROM t; +SELECT '1' < '1' FROM t; +SELECT cast('1' as binary) < '1' FROM t; +SELECT cast(1 as boolean) < '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) < '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) < '1' FROM t; + +SELECT cast(1 as tinyint) <= '1' FROM t; +SELECT cast(1 as smallint) <= '1' FROM t; +SELECT cast(1 as int) <= '1' FROM t; +SELECT cast(1 as bigint) <= '1' FROM t; +SELECT cast(1 as float) <= '1' FROM t; +SELECT cast(1 as double) <= '1' FROM t; +SELECT cast(1 as decimal(10, 0)) <= '1' FROM t; +SELECT '1' <= '1' FROM t; +SELECT cast('1' as binary) <= '1' FROM t; +SELECT cast(1 as boolean) <= '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <= '1' FROM t; + +SELECT cast(1 as tinyint) > '1' FROM t; +SELECT cast(1 as smallint) > '1' FROM t; +SELECT cast(1 as int) > '1' FROM t; +SELECT cast(1 as bigint) > '1' FROM t; +SELECT cast(1 as float) > '1' FROM t; +SELECT cast(1 as double) > '1' FROM t; +SELECT cast(1 as decimal(10, 0)) > '1' FROM t; +SELECT '1' > '1' FROM t; +SELECT cast('1' as binary) > '1' FROM t; +SELECT cast(1 as boolean) > '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) > '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) > '1' FROM t; + +SELECT cast(1 as tinyint) >= '1' FROM t; +SELECT cast(1 as smallint) >= '1' FROM t; +SELECT cast(1 as int) >= '1' FROM t; +SELECT cast(1 as bigint) >= '1' FROM t; +SELECT cast(1 as float) >= '1' FROM t; +SELECT cast(1 as double) >= '1' FROM t; +SELECT cast(1 as decimal(10, 0)) >= '1' FROM t; +SELECT '1' >= '1' FROM t; +SELECT cast('1' as binary) >= '1' FROM t; +SELECT cast(1 as boolean) >= '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) >= '1' FROM t; + +SELECT cast(1 as tinyint) <> '1' FROM t; +SELECT cast(1 as smallint) <> '1' FROM t; +SELECT cast(1 as int) <> '1' FROM t; +SELECT cast(1 as bigint) <> '1' FROM t; +SELECT cast(1 as float) <> '1' FROM t; +SELECT cast(1 as double) <> '1' FROM t; +SELECT cast(1 as decimal(10, 0)) <> '1' FROM t; +SELECT '1' <> '1' FROM t; +SELECT cast('1' as binary) <> '1' FROM t; +SELECT cast(1 as boolean) <> '1' FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> '1' FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <> '1' FROM t; + +-- Functions +SELECT abs('1') FROM t; +SELECT sum('1') FROM t; +SELECT avg('1') FROM t; +SELECT stddev_pop('1') FROM t; +SELECT stddev_samp('1') FROM t; +SELECT - '1' FROM t; +SELECT + '1' FROM t; +SELECT var_pop('1') FROM t; +SELECT var_samp('1') FROM t; +SELECT skewness('1') FROM t; +SELECT kurtosis('1') FROM t; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out new file mode 100644 
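(Editorial aside.) The new inConversion.sql.out file that follows is the golden file produced by SQLQueryTestSuite: each case is recorded as a `-- !query N` block holding the statement, the analyzed result schema, and either the output row or the AnalysisException message when the IN expression cannot be resolved. The sketch below paraphrases the two patterns the file records, using cases drawn from queries 2 and 9 further down; it is an illustration, not additional test input.

```sql
-- Widening: both sides of IN are cast to the common type (SMALLINT here), so the
-- predicate resolves and evaluates to true.
SELECT cast(1 as tinyint) in (cast(1 as smallint));
-- recorded output: true

-- No implicit cast exists between ByteType and BinaryType, so analysis fails with
-- "Arguments must be same type but were: ByteType != BinaryType".
SELECT cast(1 as tinyint) in (cast('1' as binary));
-- recorded output: org.apache.spark.sql.AnalysisException
```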
index 0000000000000..bf8ddee89b798 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out @@ -0,0 +1,2454 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 289 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT cast(1 as tinyint) in (cast(1 as tinyint)) FROM t +-- !query 1 schema +struct<(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT))):boolean> +-- !query 1 output +true + + +-- !query 2 +SELECT cast(1 as tinyint) in (cast(1 as smallint)) FROM t +-- !query 2 schema +struct<(CAST(CAST(1 AS TINYINT) AS SMALLINT) IN (CAST(CAST(1 AS SMALLINT) AS SMALLINT))):boolean> +-- !query 2 output +true + + +-- !query 3 +SELECT cast(1 as tinyint) in (cast(1 as int)) FROM t +-- !query 3 schema +struct<(CAST(CAST(1 AS TINYINT) AS INT) IN (CAST(CAST(1 AS INT) AS INT))):boolean> +-- !query 3 output +true + + +-- !query 4 +SELECT cast(1 as tinyint) in (cast(1 as bigint)) FROM t +-- !query 4 schema +struct<(CAST(CAST(1 AS TINYINT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> +-- !query 4 output +true + + +-- !query 5 +SELECT cast(1 as tinyint) in (cast(1 as float)) FROM t +-- !query 5 schema +struct<(CAST(CAST(1 AS TINYINT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> +-- !query 5 output +true + + +-- !query 6 +SELECT cast(1 as tinyint) in (cast(1 as double)) FROM t +-- !query 6 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 6 output +true + + +-- !query 7 +SELECT cast(1 as tinyint) in (cast(1 as decimal(10, 0))) FROM t +-- !query 7 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 7 output +true + + +-- !query 8 +SELECT cast(1 as tinyint) in (cast(1 as string)) FROM t +-- !query 8 schema +struct<(CAST(CAST(1 AS TINYINT) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 8 output +true + + +-- !query 9 +SELECT cast(1 as tinyint) in (cast('1' as binary)) FROM t +-- !query 9 schema +struct<> +-- !query 9 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: ByteType != BinaryType; line 1 pos 26 + + +-- !query 10 +SELECT cast(1 as tinyint) in (cast(1 as boolean)) FROM t +-- !query 10 schema +struct<> +-- !query 10 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: ByteType != BooleanType; line 1 pos 26 + + +-- !query 11 +SELECT cast(1 as tinyint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 11 schema +struct<> +-- !query 11 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: ByteType != TimestampType; line 1 pos 26 + + +-- !query 12 +SELECT cast(1 as tinyint) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 12 schema +struct<> +-- !query 12 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: ByteType != DateType; line 1 pos 26 + + +-- !query 13 +SELECT cast(1 as smallint) in (cast(1 as tinyint)) FROM t +-- !query 13 schema 
+struct<(CAST(CAST(1 AS SMALLINT) AS SMALLINT) IN (CAST(CAST(1 AS TINYINT) AS SMALLINT))):boolean> +-- !query 13 output +true + + +-- !query 14 +SELECT cast(1 as smallint) in (cast(1 as smallint)) FROM t +-- !query 14 schema +struct<(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT))):boolean> +-- !query 14 output +true + + +-- !query 15 +SELECT cast(1 as smallint) in (cast(1 as int)) FROM t +-- !query 15 schema +struct<(CAST(CAST(1 AS SMALLINT) AS INT) IN (CAST(CAST(1 AS INT) AS INT))):boolean> +-- !query 15 output +true + + +-- !query 16 +SELECT cast(1 as smallint) in (cast(1 as bigint)) FROM t +-- !query 16 schema +struct<(CAST(CAST(1 AS SMALLINT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> +-- !query 16 output +true + + +-- !query 17 +SELECT cast(1 as smallint) in (cast(1 as float)) FROM t +-- !query 17 schema +struct<(CAST(CAST(1 AS SMALLINT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> +-- !query 17 output +true + + +-- !query 18 +SELECT cast(1 as smallint) in (cast(1 as double)) FROM t +-- !query 18 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 18 output +true + + +-- !query 19 +SELECT cast(1 as smallint) in (cast(1 as decimal(10, 0))) FROM t +-- !query 19 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 19 output +true + + +-- !query 20 +SELECT cast(1 as smallint) in (cast(1 as string)) FROM t +-- !query 20 schema +struct<(CAST(CAST(1 AS SMALLINT) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 20 output +true + + +-- !query 21 +SELECT cast(1 as smallint) in (cast('1' as binary)) FROM t +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: ShortType != BinaryType; line 1 pos 27 + + +-- !query 22 +SELECT cast(1 as smallint) in (cast(1 as boolean)) FROM t +-- !query 22 schema +struct<> +-- !query 22 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: ShortType != BooleanType; line 1 pos 27 + + +-- !query 23 +SELECT cast(1 as smallint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 23 schema +struct<> +-- !query 23 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: ShortType != TimestampType; line 1 pos 27 + + +-- !query 24 +SELECT cast(1 as smallint) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 24 schema +struct<> +-- !query 24 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: ShortType != DateType; line 1 pos 27 + + +-- !query 25 +SELECT cast(1 as int) in (cast(1 as tinyint)) FROM t +-- !query 25 schema +struct<(CAST(CAST(1 AS INT) AS INT) IN (CAST(CAST(1 AS TINYINT) AS INT))):boolean> +-- !query 25 output +true + + +-- !query 26 +SELECT cast(1 as int) in (cast(1 as smallint)) FROM t +-- !query 26 schema +struct<(CAST(CAST(1 AS INT) AS INT) IN (CAST(CAST(1 AS SMALLINT) AS INT))):boolean> +-- !query 26 output +true + + +-- !query 27 +SELECT cast(1 as int) in (cast(1 as int)) FROM t +-- 
!query 27 schema +struct<(CAST(1 AS INT) IN (CAST(1 AS INT))):boolean> +-- !query 27 output +true + + +-- !query 28 +SELECT cast(1 as int) in (cast(1 as bigint)) FROM t +-- !query 28 schema +struct<(CAST(CAST(1 AS INT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> +-- !query 28 output +true + + +-- !query 29 +SELECT cast(1 as int) in (cast(1 as float)) FROM t +-- !query 29 schema +struct<(CAST(CAST(1 AS INT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> +-- !query 29 output +true + + +-- !query 30 +SELECT cast(1 as int) in (cast(1 as double)) FROM t +-- !query 30 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 30 output +true + + +-- !query 31 +SELECT cast(1 as int) in (cast(1 as decimal(10, 0))) FROM t +-- !query 31 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 31 output +true + + +-- !query 32 +SELECT cast(1 as int) in (cast(1 as string)) FROM t +-- !query 32 schema +struct<(CAST(CAST(1 AS INT) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 32 output +true + + +-- !query 33 +SELECT cast(1 as int) in (cast('1' as binary)) FROM t +-- !query 33 schema +struct<> +-- !query 33 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: IntegerType != BinaryType; line 1 pos 22 + + +-- !query 34 +SELECT cast(1 as int) in (cast(1 as boolean)) FROM t +-- !query 34 schema +struct<> +-- !query 34 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: IntegerType != BooleanType; line 1 pos 22 + + +-- !query 35 +SELECT cast(1 as int) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 35 schema +struct<> +-- !query 35 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: IntegerType != TimestampType; line 1 pos 22 + + +-- !query 36 +SELECT cast(1 as int) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 36 schema +struct<> +-- !query 36 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: IntegerType != DateType; line 1 pos 22 + + +-- !query 37 +SELECT cast(1 as bigint) in (cast(1 as tinyint)) FROM t +-- !query 37 schema +struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS TINYINT) AS BIGINT))):boolean> +-- !query 37 output +true + + +-- !query 38 +SELECT cast(1 as bigint) in (cast(1 as smallint)) FROM t +-- !query 38 schema +struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS SMALLINT) AS BIGINT))):boolean> +-- !query 38 output +true + + +-- !query 39 +SELECT cast(1 as bigint) in (cast(1 as int)) FROM t +-- !query 39 schema +struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS INT) AS BIGINT))):boolean> +-- !query 39 output +true + + +-- !query 40 +SELECT cast(1 as bigint) in (cast(1 as bigint)) FROM t +-- !query 40 schema +struct<(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT))):boolean> +-- !query 40 output +true + + +-- !query 41 +SELECT cast(1 as bigint) in (cast(1 as float)) FROM t +-- !query 41 schema +struct<(CAST(CAST(1 AS BIGINT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS 
FLOAT))):boolean> +-- !query 41 output +true + + +-- !query 42 +SELECT cast(1 as bigint) in (cast(1 as double)) FROM t +-- !query 42 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 42 output +true + + +-- !query 43 +SELECT cast(1 as bigint) in (cast(1 as decimal(10, 0))) FROM t +-- !query 43 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)))):boolean> +-- !query 43 output +true + + +-- !query 44 +SELECT cast(1 as bigint) in (cast(1 as string)) FROM t +-- !query 44 schema +struct<(CAST(CAST(1 AS BIGINT) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 44 output +true + + +-- !query 45 +SELECT cast(1 as bigint) in (cast('1' as binary)) FROM t +-- !query 45 schema +struct<> +-- !query 45 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: LongType != BinaryType; line 1 pos 25 + + +-- !query 46 +SELECT cast(1 as bigint) in (cast(1 as boolean)) FROM t +-- !query 46 schema +struct<> +-- !query 46 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: LongType != BooleanType; line 1 pos 25 + + +-- !query 47 +SELECT cast(1 as bigint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 47 schema +struct<> +-- !query 47 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: LongType != TimestampType; line 1 pos 25 + + +-- !query 48 +SELECT cast(1 as bigint) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 48 schema +struct<> +-- !query 48 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: LongType != DateType; line 1 pos 25 + + +-- !query 49 +SELECT cast(1 as float) in (cast(1 as tinyint)) FROM t +-- !query 49 schema +struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS TINYINT) AS FLOAT))):boolean> +-- !query 49 output +true + + +-- !query 50 +SELECT cast(1 as float) in (cast(1 as smallint)) FROM t +-- !query 50 schema +struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS SMALLINT) AS FLOAT))):boolean> +-- !query 50 output +true + + +-- !query 51 +SELECT cast(1 as float) in (cast(1 as int)) FROM t +-- !query 51 schema +struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS INT) AS FLOAT))):boolean> +-- !query 51 output +true + + +-- !query 52 +SELECT cast(1 as float) in (cast(1 as bigint)) FROM t +-- !query 52 schema +struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS BIGINT) AS FLOAT))):boolean> +-- !query 52 output +true + + +-- !query 53 +SELECT cast(1 as float) in (cast(1 as float)) FROM t +-- !query 53 schema +struct<(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT))):boolean> +-- !query 53 output +true + + +-- !query 54 +SELECT cast(1 as float) in (cast(1 as double)) FROM t +-- !query 54 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 54 output +true + + +-- !query 55 +SELECT cast(1 as float) in (cast(1 as decimal(10, 0))) FROM t +-- !query 55 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS 
DOUBLE))):boolean> +-- !query 55 output +true + + +-- !query 56 +SELECT cast(1 as float) in (cast(1 as string)) FROM t +-- !query 56 schema +struct<(CAST(CAST(1 AS FLOAT) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 56 output +false + + +-- !query 57 +SELECT cast(1 as float) in (cast('1' as binary)) FROM t +-- !query 57 schema +struct<> +-- !query 57 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: FloatType != BinaryType; line 1 pos 24 + + +-- !query 58 +SELECT cast(1 as float) in (cast(1 as boolean)) FROM t +-- !query 58 schema +struct<> +-- !query 58 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: FloatType != BooleanType; line 1 pos 24 + + +-- !query 59 +SELECT cast(1 as float) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 59 schema +struct<> +-- !query 59 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: FloatType != TimestampType; line 1 pos 24 + + +-- !query 60 +SELECT cast(1 as float) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 60 schema +struct<> +-- !query 60 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: FloatType != DateType; line 1 pos 24 + + +-- !query 61 +SELECT cast(1 as double) in (cast(1 as tinyint)) FROM t +-- !query 61 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS TINYINT) AS DOUBLE))):boolean> +-- !query 61 output +true + + +-- !query 62 +SELECT cast(1 as double) in (cast(1 as smallint)) FROM t +-- !query 62 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS SMALLINT) AS DOUBLE))):boolean> +-- !query 62 output +true + + +-- !query 63 +SELECT cast(1 as double) in (cast(1 as int)) FROM t +-- !query 63 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS INT) AS DOUBLE))):boolean> +-- !query 63 output +true + + +-- !query 64 +SELECT cast(1 as double) in (cast(1 as bigint)) FROM t +-- !query 64 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS BIGINT) AS DOUBLE))):boolean> +-- !query 64 output +true + + +-- !query 65 +SELECT cast(1 as double) in (cast(1 as float)) FROM t +-- !query 65 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> +-- !query 65 output +true + + +-- !query 66 +SELECT cast(1 as double) in (cast(1 as double)) FROM t +-- !query 66 schema +struct<(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE))):boolean> +-- !query 66 output +true + + +-- !query 67 +SELECT cast(1 as double) in (cast(1 as decimal(10, 0))) FROM t +-- !query 67 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> +-- !query 67 output +true + + +-- !query 68 +SELECT cast(1 as double) in (cast(1 as string)) FROM t +-- !query 68 schema +struct<(CAST(CAST(1 AS DOUBLE) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 68 output +false + + +-- !query 69 +SELECT cast(1 as double) in (cast('1' as binary)) FROM t +-- !query 69 schema +struct<> +-- !query 69 output +org.apache.spark.sql.AnalysisException +cannot resolve 
'(CAST(1 AS DOUBLE) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DoubleType != BinaryType; line 1 pos 25 + + +-- !query 70 +SELECT cast(1 as double) in (cast(1 as boolean)) FROM t +-- !query 70 schema +struct<> +-- !query 70 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DoubleType != BooleanType; line 1 pos 25 + + +-- !query 71 +SELECT cast(1 as double) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 71 schema +struct<> +-- !query 71 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: DoubleType != TimestampType; line 1 pos 25 + + +-- !query 72 +SELECT cast(1 as double) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 72 schema +struct<> +-- !query 72 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: DoubleType != DateType; line 1 pos 25 + + +-- !query 73 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as tinyint)) FROM t +-- !query 73 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)))):boolean> +-- !query 73 output +true + + +-- !query 74 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as smallint)) FROM t +-- !query 74 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)))):boolean> +-- !query 74 output +true + + +-- !query 75 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as int)) FROM t +-- !query 75 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS INT) AS DECIMAL(10,0)))):boolean> +-- !query 75 output +true + + +-- !query 76 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as bigint)) FROM t +-- !query 76 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) IN (CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)))):boolean> +-- !query 76 output +true + + +-- !query 77 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as float)) FROM t +-- !query 77 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) IN (CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> +-- !query 77 output +true + + +-- !query 78 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as double)) FROM t +-- !query 78 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 78 output +true + + +-- !query 79 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0))) FROM t +-- !query 79 schema +struct<(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)))):boolean> +-- !query 79 output +true + + +-- !query 80 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as string)) FROM t +-- !query 80 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 80 output +true + + +-- !query 81 +SELECT cast(1 as decimal(10, 0)) in (cast('1' as binary)) FROM t +-- !query 81 schema +struct<> +-- !query 81 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != BinaryType; line 1 pos 33 + + +-- !query 82 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as boolean)) FROM t +-- 
!query 82 schema +struct<> +-- !query 82 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != BooleanType; line 1 pos 33 + + +-- !query 83 +SELECT cast(1 as decimal(10, 0)) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 83 schema +struct<> +-- !query 83 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != TimestampType; line 1 pos 33 + + +-- !query 84 +SELECT cast(1 as decimal(10, 0)) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 84 schema +struct<> +-- !query 84 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != DateType; line 1 pos 33 + + +-- !query 85 +SELECT cast(1 as string) in (cast(1 as tinyint)) FROM t +-- !query 85 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS TINYINT) AS STRING))):boolean> +-- !query 85 output +true + + +-- !query 86 +SELECT cast(1 as string) in (cast(1 as smallint)) FROM t +-- !query 86 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS SMALLINT) AS STRING))):boolean> +-- !query 86 output +true + + +-- !query 87 +SELECT cast(1 as string) in (cast(1 as int)) FROM t +-- !query 87 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS INT) AS STRING))):boolean> +-- !query 87 output +true + + +-- !query 88 +SELECT cast(1 as string) in (cast(1 as bigint)) FROM t +-- !query 88 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS BIGINT) AS STRING))):boolean> +-- !query 88 output +true + + +-- !query 89 +SELECT cast(1 as string) in (cast(1 as float)) FROM t +-- !query 89 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS FLOAT) AS STRING))):boolean> +-- !query 89 output +false + + +-- !query 90 +SELECT cast(1 as string) in (cast(1 as double)) FROM t +-- !query 90 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS DOUBLE) AS STRING))):boolean> +-- !query 90 output +false + + +-- !query 91 +SELECT cast(1 as string) in (cast(1 as decimal(10, 0))) FROM t +-- !query 91 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS STRING))):boolean> +-- !query 91 output +true + + +-- !query 92 +SELECT cast(1 as string) in (cast(1 as string)) FROM t +-- !query 92 schema +struct<(CAST(1 AS STRING) IN (CAST(1 AS STRING))):boolean> +-- !query 92 output +true + + +-- !query 93 +SELECT cast(1 as string) in (cast('1' as binary)) FROM t +-- !query 93 schema +struct<> +-- !query 93 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS STRING) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: StringType != BinaryType; line 1 pos 25 + + +-- !query 94 +SELECT cast(1 as string) in (cast(1 as boolean)) FROM t +-- !query 94 schema +struct<> +-- !query 94 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: StringType != BooleanType; line 1 pos 25 + + +-- !query 95 +SELECT cast(1 as string) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 95 schema 
+struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING))):boolean> +-- !query 95 output +false + + +-- !query 96 +SELECT cast(1 as string) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 96 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING))):boolean> +-- !query 96 output +false + + +-- !query 97 +SELECT cast('1' as binary) in (cast(1 as tinyint)) FROM t +-- !query 97 schema +struct<> +-- !query 97 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != ByteType; line 1 pos 27 + + +-- !query 98 +SELECT cast('1' as binary) in (cast(1 as smallint)) FROM t +-- !query 98 schema +struct<> +-- !query 98 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != ShortType; line 1 pos 27 + + +-- !query 99 +SELECT cast('1' as binary) in (cast(1 as int)) FROM t +-- !query 99 schema +struct<> +-- !query 99 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != IntegerType; line 1 pos 27 + + +-- !query 100 +SELECT cast('1' as binary) in (cast(1 as bigint)) FROM t +-- !query 100 schema +struct<> +-- !query 100 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != LongType; line 1 pos 27 + + +-- !query 101 +SELECT cast('1' as binary) in (cast(1 as float)) FROM t +-- !query 101 schema +struct<> +-- !query 101 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != FloatType; line 1 pos 27 + + +-- !query 102 +SELECT cast('1' as binary) in (cast(1 as double)) FROM t +-- !query 102 schema +struct<> +-- !query 102 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: BinaryType != DoubleType; line 1 pos 27 + + +-- !query 103 +SELECT cast('1' as binary) in (cast(1 as decimal(10, 0))) FROM t +-- !query 103 schema +struct<> +-- !query 103 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: BinaryType != DecimalType(10,0); line 1 pos 27 + + +-- !query 104 +SELECT cast('1' as binary) in (cast(1 as string)) FROM t +-- !query 104 schema +struct<> +-- !query 104 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: BinaryType != StringType; line 1 pos 27 + + +-- !query 105 +SELECT cast('1' as binary) in (cast('1' as binary)) FROM t +-- !query 105 schema +struct<(CAST(1 AS BINARY) IN (CAST(1 AS BINARY))):boolean> +-- !query 105 output +true + + +-- !query 106 +SELECT cast('1' as binary) in (cast(1 as boolean)) FROM t +-- !query 106 schema +struct<> +-- !query 106 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS BOOLEAN)))' due to data type 
mismatch: Arguments must be same type but were: BinaryType != BooleanType; line 1 pos 27 + + +-- !query 107 +SELECT cast('1' as binary) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 107 schema +struct<> +-- !query 107 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: BinaryType != TimestampType; line 1 pos 27 + + +-- !query 108 +SELECT cast('1' as binary) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 108 schema +struct<> +-- !query 108 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: BinaryType != DateType; line 1 pos 27 + + +-- !query 109 +SELECT true in (cast(1 as tinyint)) FROM t +-- !query 109 schema +struct<> +-- !query 109 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != ByteType; line 1 pos 12 + + +-- !query 110 +SELECT true in (cast(1 as smallint)) FROM t +-- !query 110 schema +struct<> +-- !query 110 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != ShortType; line 1 pos 12 + + +-- !query 111 +SELECT true in (cast(1 as int)) FROM t +-- !query 111 schema +struct<> +-- !query 111 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != IntegerType; line 1 pos 12 + + +-- !query 112 +SELECT true in (cast(1 as bigint)) FROM t +-- !query 112 schema +struct<> +-- !query 112 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != LongType; line 1 pos 12 + + +-- !query 113 +SELECT true in (cast(1 as float)) FROM t +-- !query 113 schema +struct<> +-- !query 113 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != FloatType; line 1 pos 12 + + +-- !query 114 +SELECT true in (cast(1 as double)) FROM t +-- !query 114 schema +struct<> +-- !query 114 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: BooleanType != DoubleType; line 1 pos 12 + + +-- !query 115 +SELECT true in (cast(1 as decimal(10, 0))) FROM t +-- !query 115 schema +struct<> +-- !query 115 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: BooleanType != DecimalType(10,0); line 1 pos 12 + + +-- !query 116 +SELECT true in (cast(1 as string)) FROM t +-- !query 116 schema +struct<> +-- !query 116 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: BooleanType != StringType; line 1 pos 12 + + +-- !query 117 +SELECT true in (cast('1' as binary)) FROM t +-- !query 117 schema +struct<> +-- !query 117 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST('1' AS 
BINARY)))' due to data type mismatch: Arguments must be same type but were: BooleanType != BinaryType; line 1 pos 12 + + +-- !query 118 +SELECT true in (cast(1 as boolean)) FROM t +-- !query 118 schema +struct<(true IN (CAST(1 AS BOOLEAN))):boolean> +-- !query 118 output +true + + +-- !query 119 +SELECT true in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 119 schema +struct<> +-- !query 119 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: BooleanType != TimestampType; line 1 pos 12 + + +-- !query 120 +SELECT true in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 120 schema +struct<> +-- !query 120 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: BooleanType != DateType; line 1 pos 12 + + +-- !query 121 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as tinyint)) FROM t +-- !query 121 schema +struct<> +-- !query 121 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != ByteType; line 1 pos 50 + + +-- !query 122 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as smallint)) FROM t +-- !query 122 schema +struct<> +-- !query 122 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != ShortType; line 1 pos 50 + + +-- !query 123 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as int)) FROM t +-- !query 123 schema +struct<> +-- !query 123 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS INT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != IntegerType; line 1 pos 50 + + +-- !query 124 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as bigint)) FROM t +-- !query 124 schema +struct<> +-- !query 124 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != LongType; line 1 pos 50 + + +-- !query 125 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as float)) FROM t +-- !query 125 schema +struct<> +-- !query 125 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != FloatType; line 1 pos 50 + + +-- !query 126 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as double)) FROM t +-- !query 126 schema +struct<> +-- !query 126 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: TimestampType != DoubleType; line 1 pos 50 + + +-- !query 127 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as decimal(10, 0))) FROM t +-- !query 127 schema +struct<> +-- !query 127 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' 
AS TIMESTAMP) IN (CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: TimestampType != DecimalType(10,0); line 1 pos 50 + + +-- !query 128 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as string)) FROM t +-- !query 128 schema +struct<(CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS STRING) IN (CAST(CAST(2 AS STRING) AS STRING))):boolean> +-- !query 128 output +false + + +-- !query 129 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2' as binary)) FROM t +-- !query 129 schema +struct<> +-- !query 129 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: TimestampType != BinaryType; line 1 pos 50 + + +-- !query 130 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as boolean)) FROM t +-- !query 130 schema +struct<> +-- !query 130 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: TimestampType != BooleanType; line 1 pos 50 + + +-- !query 131 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 131 schema +struct<(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) IN (CAST(2017-12-11 09:30:00.0 AS TIMESTAMP))):boolean> +-- !query 131 output +false + + +-- !query 132 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 132 schema +struct<(CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS TIMESTAMP) IN (CAST(CAST(2017-12-11 09:30:00 AS DATE) AS TIMESTAMP))):boolean> +-- !query 132 output +false + + +-- !query 133 +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as tinyint)) FROM t +-- !query 133 schema +struct<> +-- !query 133 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: DateType != ByteType; line 1 pos 43 + + +-- !query 134 +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as smallint)) FROM t +-- !query 134 schema +struct<> +-- !query 134 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: DateType != ShortType; line 1 pos 43 + + +-- !query 135 +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as int)) FROM t +-- !query 135 schema +struct<> +-- !query 135 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS INT)))' due to data type mismatch: Arguments must be same type but were: DateType != IntegerType; line 1 pos 43 + + +-- !query 136 +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as bigint)) FROM t +-- !query 136 schema +struct<> +-- !query 136 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: DateType != LongType; line 1 pos 43 + + +-- !query 137 +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as float)) FROM t +-- !query 137 schema +struct<> +-- !query 137 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS FLOAT)))' due to data type 
mismatch: Arguments must be same type but were: DateType != FloatType; line 1 pos 43 + + +-- !query 138 +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as double)) FROM t +-- !query 138 schema +struct<> +-- !query 138 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: DateType != DoubleType; line 1 pos 43 + + +-- !query 139 +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as decimal(10, 0))) FROM t +-- !query 139 schema +struct<> +-- !query 139 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: DateType != DecimalType(10,0); line 1 pos 43 + + +-- !query 140 +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as string)) FROM t +-- !query 140 schema +struct<(CAST(CAST(2017-12-12 09:30:00 AS DATE) AS STRING) IN (CAST(CAST(2 AS STRING) AS STRING))):boolean> +-- !query 140 output +false + + +-- !query 141 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2' as binary)) FROM t +-- !query 141 schema +struct<> +-- !query 141 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DateType != BinaryType; line 1 pos 43 + + +-- !query 142 +SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as boolean)) FROM t +-- !query 142 schema +struct<> +-- !query 142 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DateType != BooleanType; line 1 pos 43 + + +-- !query 143 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 143 schema +struct<(CAST(CAST(2017-12-12 09:30:00 AS DATE) AS TIMESTAMP) IN (CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS TIMESTAMP))):boolean> +-- !query 143 output +false + + +-- !query 144 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 144 schema +struct<(CAST(2017-12-12 09:30:00 AS DATE) IN (CAST(2017-12-11 09:30:00 AS DATE))):boolean> +-- !query 144 output +false + + +-- !query 145 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as tinyint)) FROM t +-- !query 145 schema +struct<(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST(1 AS TINYINT))):boolean> +-- !query 145 output +true + + +-- !query 146 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as smallint)) FROM t +-- !query 146 schema +struct<(CAST(CAST(1 AS TINYINT) AS SMALLINT) IN (CAST(CAST(1 AS TINYINT) AS SMALLINT), CAST(CAST(1 AS SMALLINT) AS SMALLINT))):boolean> +-- !query 146 output +true + + +-- !query 147 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as int)) FROM t +-- !query 147 schema +struct<(CAST(CAST(1 AS TINYINT) AS INT) IN (CAST(CAST(1 AS TINYINT) AS INT), CAST(CAST(1 AS INT) AS INT))):boolean> +-- !query 147 output +true + + +-- !query 148 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as bigint)) FROM t +-- !query 148 schema +struct<(CAST(CAST(1 AS TINYINT) AS BIGINT) IN (CAST(CAST(1 AS TINYINT) AS BIGINT), CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> +-- !query 148 output +true + + +-- !query 149 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as float)) FROM t +-- 
!query 149 schema +struct<(CAST(CAST(1 AS TINYINT) AS FLOAT) IN (CAST(CAST(1 AS TINYINT) AS FLOAT), CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> +-- !query 149 output +true + + +-- !query 150 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as double)) FROM t +-- !query 150 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) IN (CAST(CAST(1 AS TINYINT) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 150 output +true + + +-- !query 151 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as decimal(10, 0))) FROM t +-- !query 151 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)), CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 151 output +true + + +-- !query 152 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as string)) FROM t +-- !query 152 schema +struct<(CAST(CAST(1 AS TINYINT) AS STRING) IN (CAST(CAST(1 AS TINYINT) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 152 output +true + + +-- !query 153 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('1' as binary)) FROM t +-- !query 153 schema +struct<> +-- !query 153 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: ByteType != BinaryType; line 1 pos 26 + + +-- !query 154 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as boolean)) FROM t +-- !query 154 schema +struct<> +-- !query 154 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: ByteType != BooleanType; line 1 pos 26 + + +-- !query 155 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 155 schema +struct<> +-- !query 155 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: ByteType != TimestampType; line 1 pos 26 + + +-- !query 156 +SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 156 schema +struct<> +-- !query 156 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: ByteType != DateType; line 1 pos 26 + + +-- !query 157 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as tinyint)) FROM t +-- !query 157 schema +struct<(CAST(CAST(1 AS SMALLINT) AS SMALLINT) IN (CAST(CAST(1 AS SMALLINT) AS SMALLINT), CAST(CAST(1 AS TINYINT) AS SMALLINT))):boolean> +-- !query 157 output +true + + +-- !query 158 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as smallint)) FROM t +-- !query 158 schema +struct<(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST(1 AS SMALLINT))):boolean> +-- !query 158 output +true + + +-- !query 159 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as int)) FROM t +-- !query 159 schema +struct<(CAST(CAST(1 AS SMALLINT) AS INT) IN (CAST(CAST(1 AS SMALLINT) AS INT), CAST(CAST(1 AS INT) AS INT))):boolean> +-- !query 159 output +true + + +-- !query 160 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as bigint)) FROM t +-- !query 160 schema +struct<(CAST(CAST(1 
AS SMALLINT) AS BIGINT) IN (CAST(CAST(1 AS SMALLINT) AS BIGINT), CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> +-- !query 160 output +true + + +-- !query 161 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as float)) FROM t +-- !query 161 schema +struct<(CAST(CAST(1 AS SMALLINT) AS FLOAT) IN (CAST(CAST(1 AS SMALLINT) AS FLOAT), CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> +-- !query 161 output +true + + +-- !query 162 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as double)) FROM t +-- !query 162 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) IN (CAST(CAST(1 AS SMALLINT) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 162 output +true + + +-- !query 163 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as decimal(10, 0))) FROM t +-- !query 163 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)), CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 163 output +true + + +-- !query 164 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as string)) FROM t +-- !query 164 schema +struct<(CAST(CAST(1 AS SMALLINT) AS STRING) IN (CAST(CAST(1 AS SMALLINT) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 164 output +true + + +-- !query 165 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast('1' as binary)) FROM t +-- !query 165 schema +struct<> +-- !query 165 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: ShortType != BinaryType; line 1 pos 27 + + +-- !query 166 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as boolean)) FROM t +-- !query 166 schema +struct<> +-- !query 166 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: ShortType != BooleanType; line 1 pos 27 + + +-- !query 167 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 167 schema +struct<> +-- !query 167 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: ShortType != TimestampType; line 1 pos 27 + + +-- !query 168 +SELECT cast(1 as smallint) in (cast(1 as smallint), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 168 schema +struct<> +-- !query 168 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: ShortType != DateType; line 1 pos 27 + + +-- !query 169 +SELECT cast(1 as int) in (cast(1 as int), cast(1 as tinyint)) FROM t +-- !query 169 schema +struct<(CAST(CAST(1 AS INT) AS INT) IN (CAST(CAST(1 AS INT) AS INT), CAST(CAST(1 AS TINYINT) AS INT))):boolean> +-- !query 169 output +true + + +-- !query 170 +SELECT cast(1 as int) in (cast(1 as int), cast(1 as smallint)) FROM t +-- !query 170 schema +struct<(CAST(CAST(1 AS INT) AS INT) IN (CAST(CAST(1 AS INT) AS INT), CAST(CAST(1 AS SMALLINT) AS INT))):boolean> +-- !query 170 output +true + + +-- !query 171 +SELECT cast(1 as int) in (cast(1 as int), cast(1 as int)) FROM t +-- !query 171 schema +struct<(CAST(1 AS INT) IN (CAST(1 AS 
INT), CAST(1 AS INT))):boolean> +-- !query 171 output +true + + +-- !query 172 +SELECT cast(1 as int) in (cast(1 as int), cast(1 as bigint)) FROM t +-- !query 172 schema +struct<(CAST(CAST(1 AS INT) AS BIGINT) IN (CAST(CAST(1 AS INT) AS BIGINT), CAST(CAST(1 AS BIGINT) AS BIGINT))):boolean> +-- !query 172 output +true + + +-- !query 173 +SELECT cast(1 as int) in (cast(1 as int), cast(1 as float)) FROM t +-- !query 173 schema +struct<(CAST(CAST(1 AS INT) AS FLOAT) IN (CAST(CAST(1 AS INT) AS FLOAT), CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> +-- !query 173 output +true + + +-- !query 174 +SELECT cast(1 as int) in (cast(1 as int), cast(1 as double)) FROM t +-- !query 174 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) IN (CAST(CAST(1 AS INT) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 174 output +true + + +-- !query 175 +SELECT cast(1 as int) in (cast(1 as int), cast(1 as decimal(10, 0))) FROM t +-- !query 175 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS INT) AS DECIMAL(10,0)), CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 175 output +true + + +-- !query 176 +SELECT cast(1 as int) in (cast(1 as int), cast(1 as string)) FROM t +-- !query 176 schema +struct<(CAST(CAST(1 AS INT) AS STRING) IN (CAST(CAST(1 AS INT) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 176 output +true + + +-- !query 177 +SELECT cast(1 as int) in (cast(1 as int), cast('1' as binary)) FROM t +-- !query 177 schema +struct<> +-- !query 177 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: IntegerType != BinaryType; line 1 pos 22 + + +-- !query 178 +SELECT cast(1 as int) in (cast(1 as int), cast(1 as boolean)) FROM t +-- !query 178 schema +struct<> +-- !query 178 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: IntegerType != BooleanType; line 1 pos 22 + + +-- !query 179 +SELECT cast(1 as int) in (cast(1 as int), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 179 schema +struct<> +-- !query 179 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: IntegerType != TimestampType; line 1 pos 22 + + +-- !query 180 +SELECT cast(1 as int) in (cast(1 as int), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 180 schema +struct<> +-- !query 180 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: IntegerType != DateType; line 1 pos 22 + + +-- !query 181 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as tinyint)) FROM t +-- !query 181 schema +struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT), CAST(CAST(1 AS TINYINT) AS BIGINT))):boolean> +-- !query 181 output +true + + +-- !query 182 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as smallint)) FROM t +-- !query 182 schema +struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT), CAST(CAST(1 AS SMALLINT) AS BIGINT))):boolean> +-- !query 182 output +true + + +-- !query 183 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as 
int)) FROM t +-- !query 183 schema +struct<(CAST(CAST(1 AS BIGINT) AS BIGINT) IN (CAST(CAST(1 AS BIGINT) AS BIGINT), CAST(CAST(1 AS INT) AS BIGINT))):boolean> +-- !query 183 output +true + + +-- !query 184 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as bigint)) FROM t +-- !query 184 schema +struct<(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST(1 AS BIGINT))):boolean> +-- !query 184 output +true + + +-- !query 185 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as float)) FROM t +-- !query 185 schema +struct<(CAST(CAST(1 AS BIGINT) AS FLOAT) IN (CAST(CAST(1 AS BIGINT) AS FLOAT), CAST(CAST(1 AS FLOAT) AS FLOAT))):boolean> +-- !query 185 output +true + + +-- !query 186 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as double)) FROM t +-- !query 186 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) IN (CAST(CAST(1 AS BIGINT) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 186 output +true + + +-- !query 187 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as decimal(10, 0))) FROM t +-- !query 187 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) IN (CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)), CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)))):boolean> +-- !query 187 output +true + + +-- !query 188 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as string)) FROM t +-- !query 188 schema +struct<(CAST(CAST(1 AS BIGINT) AS STRING) IN (CAST(CAST(1 AS BIGINT) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 188 output +true + + +-- !query 189 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast('1' as binary)) FROM t +-- !query 189 schema +struct<> +-- !query 189 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: LongType != BinaryType; line 1 pos 25 + + +-- !query 190 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as boolean)) FROM t +-- !query 190 schema +struct<> +-- !query 190 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: LongType != BooleanType; line 1 pos 25 + + +-- !query 191 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 191 schema +struct<> +-- !query 191 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: LongType != TimestampType; line 1 pos 25 + + +-- !query 192 +SELECT cast(1 as bigint) in (cast(1 as bigint), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 192 schema +struct<> +-- !query 192 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: LongType != DateType; line 1 pos 25 + + +-- !query 193 +SELECT cast(1 as float) in (cast(1 as float), cast(1 as tinyint)) FROM t +-- !query 193 schema +struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT), CAST(CAST(1 AS TINYINT) AS FLOAT))):boolean> +-- !query 193 output +true + + +-- !query 194 +SELECT cast(1 as float) in (cast(1 as float), cast(1 as smallint)) FROM t +-- !query 194 schema +struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS 
FLOAT), CAST(CAST(1 AS SMALLINT) AS FLOAT))):boolean> +-- !query 194 output +true + + +-- !query 195 +SELECT cast(1 as float) in (cast(1 as float), cast(1 as int)) FROM t +-- !query 195 schema +struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT), CAST(CAST(1 AS INT) AS FLOAT))):boolean> +-- !query 195 output +true + + +-- !query 196 +SELECT cast(1 as float) in (cast(1 as float), cast(1 as bigint)) FROM t +-- !query 196 schema +struct<(CAST(CAST(1 AS FLOAT) AS FLOAT) IN (CAST(CAST(1 AS FLOAT) AS FLOAT), CAST(CAST(1 AS BIGINT) AS FLOAT))):boolean> +-- !query 196 output +true + + +-- !query 197 +SELECT cast(1 as float) in (cast(1 as float), cast(1 as float)) FROM t +-- !query 197 schema +struct<(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST(1 AS FLOAT))):boolean> +-- !query 197 output +true + + +-- !query 198 +SELECT cast(1 as float) in (cast(1 as float), cast(1 as double)) FROM t +-- !query 198 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) IN (CAST(CAST(1 AS FLOAT) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 198 output +true + + +-- !query 199 +SELECT cast(1 as float) in (cast(1 as float), cast(1 as decimal(10, 0))) FROM t +-- !query 199 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) IN (CAST(CAST(1 AS FLOAT) AS DOUBLE), CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> +-- !query 199 output +true + + +-- !query 200 +SELECT cast(1 as float) in (cast(1 as float), cast(1 as string)) FROM t +-- !query 200 schema +struct<(CAST(CAST(1 AS FLOAT) AS STRING) IN (CAST(CAST(1 AS FLOAT) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 200 output +true + + +-- !query 201 +SELECT cast(1 as float) in (cast(1 as float), cast('1' as binary)) FROM t +-- !query 201 schema +struct<> +-- !query 201 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: FloatType != BinaryType; line 1 pos 24 + + +-- !query 202 +SELECT cast(1 as float) in (cast(1 as float), cast(1 as boolean)) FROM t +-- !query 202 schema +struct<> +-- !query 202 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: FloatType != BooleanType; line 1 pos 24 + + +-- !query 203 +SELECT cast(1 as float) in (cast(1 as float), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 203 schema +struct<> +-- !query 203 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: FloatType != TimestampType; line 1 pos 24 + + +-- !query 204 +SELECT cast(1 as float) in (cast(1 as float), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 204 schema +struct<> +-- !query 204 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: FloatType != DateType; line 1 pos 24 + + +-- !query 205 +SELECT cast(1 as double) in (cast(1 as double), cast(1 as tinyint)) FROM t +-- !query 205 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS TINYINT) AS DOUBLE))):boolean> +-- !query 205 output +true + + +-- !query 206 +SELECT cast(1 as double) in (cast(1 as double), cast(1 as 
smallint)) FROM t +-- !query 206 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS SMALLINT) AS DOUBLE))):boolean> +-- !query 206 output +true + + +-- !query 207 +SELECT cast(1 as double) in (cast(1 as double), cast(1 as int)) FROM t +-- !query 207 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS INT) AS DOUBLE))):boolean> +-- !query 207 output +true + + +-- !query 208 +SELECT cast(1 as double) in (cast(1 as double), cast(1 as bigint)) FROM t +-- !query 208 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS BIGINT) AS DOUBLE))):boolean> +-- !query 208 output +true + + +-- !query 209 +SELECT cast(1 as double) in (cast(1 as double), cast(1 as float)) FROM t +-- !query 209 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> +-- !query 209 output +true + + +-- !query 210 +SELECT cast(1 as double) in (cast(1 as double), cast(1 as double)) FROM t +-- !query 210 schema +struct<(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST(1 AS DOUBLE))):boolean> +-- !query 210 output +true + + +-- !query 211 +SELECT cast(1 as double) in (cast(1 as double), cast(1 as decimal(10, 0))) FROM t +-- !query 211 schema +struct<(CAST(CAST(1 AS DOUBLE) AS DOUBLE) IN (CAST(CAST(1 AS DOUBLE) AS DOUBLE), CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> +-- !query 211 output +true + + +-- !query 212 +SELECT cast(1 as double) in (cast(1 as double), cast(1 as string)) FROM t +-- !query 212 schema +struct<(CAST(CAST(1 AS DOUBLE) AS STRING) IN (CAST(CAST(1 AS DOUBLE) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 212 output +true + + +-- !query 213 +SELECT cast(1 as double) in (cast(1 as double), cast('1' as binary)) FROM t +-- !query 213 schema +struct<> +-- !query 213 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DoubleType != BinaryType; line 1 pos 25 + + +-- !query 214 +SELECT cast(1 as double) in (cast(1 as double), cast(1 as boolean)) FROM t +-- !query 214 schema +struct<> +-- !query 214 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DoubleType != BooleanType; line 1 pos 25 + + +-- !query 215 +SELECT cast(1 as double) in (cast(1 as double), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 215 schema +struct<> +-- !query 215 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: DoubleType != TimestampType; line 1 pos 25 + + +-- !query 216 +SELECT cast(1 as double) in (cast(1 as double), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 216 schema +struct<> +-- !query 216 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: DoubleType != DateType; line 1 pos 25 + + +-- !query 217 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as tinyint)) FROM t +-- !query 217 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS 
DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)), CAST(CAST(1 AS TINYINT) AS DECIMAL(10,0)))):boolean> +-- !query 217 output +true + + +-- !query 218 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as smallint)) FROM t +-- !query 218 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)), CAST(CAST(1 AS SMALLINT) AS DECIMAL(10,0)))):boolean> +-- !query 218 output +true + + +-- !query 219 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as int)) FROM t +-- !query 219 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)), CAST(CAST(1 AS INT) AS DECIMAL(10,0)))):boolean> +-- !query 219 output +true + + +-- !query 220 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as bigint)) FROM t +-- !query 220 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)), CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)))):boolean> +-- !query 220 output +true + + +-- !query 221 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as float)) FROM t +-- !query 221 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE), CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> +-- !query 221 output +true + + +-- !query 222 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as double)) FROM t +-- !query 222 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE), CAST(CAST(1 AS DOUBLE) AS DOUBLE))):boolean> +-- !query 222 output +true + + +-- !query 223 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as decimal(10, 0))) FROM t +-- !query 223 schema +struct<(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST(1 AS DECIMAL(10,0)))):boolean> +-- !query 223 output +true + + +-- !query 224 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as string)) FROM t +-- !query 224 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS STRING) IN (CAST(CAST(1 AS DECIMAL(10,0)) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 224 output +true + + +-- !query 225 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('1' as binary)) FROM t +-- !query 225 schema +struct<> +-- !query 225 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != BinaryType; line 1 pos 33 + + +-- !query 226 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as boolean)) FROM t +-- !query 226 schema +struct<> +-- !query 226 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != BooleanType; line 1 pos 33 + + +-- !query 227 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 227 schema +struct<> +-- !query 227 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != 
TimestampType; line 1 pos 33 + + +-- !query 228 +SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 228 schema +struct<> +-- !query 228 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != DateType; line 1 pos 33 + + +-- !query 229 +SELECT cast(1 as string) in (cast(1 as string), cast(1 as tinyint)) FROM t +-- !query 229 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS TINYINT) AS STRING))):boolean> +-- !query 229 output +true + + +-- !query 230 +SELECT cast(1 as string) in (cast(1 as string), cast(1 as smallint)) FROM t +-- !query 230 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS SMALLINT) AS STRING))):boolean> +-- !query 230 output +true + + +-- !query 231 +SELECT cast(1 as string) in (cast(1 as string), cast(1 as int)) FROM t +-- !query 231 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS INT) AS STRING))):boolean> +-- !query 231 output +true + + +-- !query 232 +SELECT cast(1 as string) in (cast(1 as string), cast(1 as bigint)) FROM t +-- !query 232 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS BIGINT) AS STRING))):boolean> +-- !query 232 output +true + + +-- !query 233 +SELECT cast(1 as string) in (cast(1 as string), cast(1 as float)) FROM t +-- !query 233 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS FLOAT) AS STRING))):boolean> +-- !query 233 output +true + + +-- !query 234 +SELECT cast(1 as string) in (cast(1 as string), cast(1 as double)) FROM t +-- !query 234 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS DOUBLE) AS STRING))):boolean> +-- !query 234 output +true + + +-- !query 235 +SELECT cast(1 as string) in (cast(1 as string), cast(1 as decimal(10, 0))) FROM t +-- !query 235 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(1 AS DECIMAL(10,0)) AS STRING))):boolean> +-- !query 235 output +true + + +-- !query 236 +SELECT cast(1 as string) in (cast(1 as string), cast(1 as string)) FROM t +-- !query 236 schema +struct<(CAST(1 AS STRING) IN (CAST(1 AS STRING), CAST(1 AS STRING))):boolean> +-- !query 236 output +true + + +-- !query 237 +SELECT cast(1 as string) in (cast(1 as string), cast('1' as binary)) FROM t +-- !query 237 schema +struct<> +-- !query 237 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS STRING), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: StringType != BinaryType; line 1 pos 25 + + +-- !query 238 +SELECT cast(1 as string) in (cast(1 as string), cast(1 as boolean)) FROM t +-- !query 238 schema +struct<> +-- !query 238 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS STRING), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: StringType != BooleanType; line 1 pos 25 + + +-- !query 239 +SELECT cast(1 as string) in (cast(1 as string), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 239 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN 
(CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING))):boolean> +-- !query 239 output +true + + +-- !query 240 +SELECT cast(1 as string) in (cast(1 as string), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 240 schema +struct<(CAST(CAST(1 AS STRING) AS STRING) IN (CAST(CAST(1 AS STRING) AS STRING), CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING))):boolean> +-- !query 240 output +true + + +-- !query 241 +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as tinyint)) FROM t +-- !query 241 schema +struct<> +-- !query 241 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != ByteType; line 1 pos 27 + + +-- !query 242 +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as smallint)) FROM t +-- !query 242 schema +struct<> +-- !query 242 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != ShortType; line 1 pos 27 + + +-- !query 243 +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as int)) FROM t +-- !query 243 schema +struct<> +-- !query 243 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != IntegerType; line 1 pos 27 + + +-- !query 244 +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as bigint)) FROM t +-- !query 244 schema +struct<> +-- !query 244 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != LongType; line 1 pos 27 + + +-- !query 245 +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as float)) FROM t +-- !query 245 schema +struct<> +-- !query 245 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != FloatType; line 1 pos 27 + + +-- !query 246 +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as double)) FROM t +-- !query 246 schema +struct<> +-- !query 246 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: BinaryType != DoubleType; line 1 pos 27 + + +-- !query 247 +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as decimal(10, 0))) FROM t +-- !query 247 schema +struct<> +-- !query 247 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: BinaryType != DecimalType(10,0); line 1 pos 27 + + +-- !query 248 +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as string)) FROM t +-- !query 248 schema +struct<> +-- !query 248 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: BinaryType != StringType; line 1 pos 27 + + +-- !query 249 +SELECT cast('1' as binary) in 
(cast('1' as binary), cast('1' as binary)) FROM t +-- !query 249 schema +struct<(CAST(1 AS BINARY) IN (CAST(1 AS BINARY), CAST(1 AS BINARY))):boolean> +-- !query 249 output +true + + +-- !query 250 +SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as boolean)) FROM t +-- !query 250 schema +struct<> +-- !query 250 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: BinaryType != BooleanType; line 1 pos 27 + + +-- !query 251 +SELECT cast('1' as binary) in (cast('1' as binary), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 251 schema +struct<> +-- !query 251 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: BinaryType != TimestampType; line 1 pos 27 + + +-- !query 252 +SELECT cast('1' as binary) in (cast('1' as binary), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 252 schema +struct<> +-- !query 252 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: BinaryType != DateType; line 1 pos 27 + + +-- !query 253 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as tinyint)) FROM t +-- !query 253 schema +struct<> +-- !query 253 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != ByteType; line 1 pos 28 + + +-- !query 254 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as smallint)) FROM t +-- !query 254 schema +struct<> +-- !query 254 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != ShortType; line 1 pos 28 + + +-- !query 255 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as int)) FROM t +-- !query 255 schema +struct<> +-- !query 255 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != IntegerType; line 1 pos 28 + + +-- !query 256 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as bigint)) FROM t +-- !query 256 schema +struct<> +-- !query 256 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != LongType; line 1 pos 28 + + +-- !query 257 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as float)) FROM t +-- !query 257 schema +struct<> +-- !query 257 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != FloatType; line 1 pos 28 + + +-- !query 258 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as double)) FROM t +-- !query 258 schema +struct<> +-- !query 258 output +org.apache.spark.sql.AnalysisException +cannot resolve 
'(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: BooleanType != DoubleType; line 1 pos 28 + + +-- !query 259 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as decimal(10, 0))) FROM t +-- !query 259 schema +struct<> +-- !query 259 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: BooleanType != DecimalType(10,0); line 1 pos 28 + + +-- !query 260 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as string)) FROM t +-- !query 260 schema +struct<> +-- !query 260 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: BooleanType != StringType; line 1 pos 28 + + +-- !query 261 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast('1' as binary)) FROM t +-- !query 261 schema +struct<> +-- !query 261 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: BooleanType != BinaryType; line 1 pos 28 + + +-- !query 262 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as boolean)) FROM t +-- !query 262 schema +struct<(CAST(1 AS BOOLEAN) IN (CAST(1 AS BOOLEAN), CAST(1 AS BOOLEAN))):boolean> +-- !query 262 output +true + + +-- !query 263 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 263 schema +struct<> +-- !query 263 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: BooleanType != TimestampType; line 1 pos 28 + + +-- !query 264 +SELECT cast('1' as boolean) in (cast('1' as boolean), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 264 schema +struct<> +-- !query 264 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: BooleanType != DateType; line 1 pos 28 + + +-- !query 265 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as tinyint)) FROM t +-- !query 265 schema +struct<> +-- !query 265 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != ByteType; line 1 pos 50 + + +-- !query 266 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as smallint)) FROM t +-- !query 266 schema +struct<> +-- !query 266 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != ShortType; line 1 pos 50 + + +-- !query 267 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as int)) FROM t +-- 
!query 267 schema +struct<> +-- !query 267 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != IntegerType; line 1 pos 50 + + +-- !query 268 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as bigint)) FROM t +-- !query 268 schema +struct<> +-- !query 268 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != LongType; line 1 pos 50 + + +-- !query 269 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as float)) FROM t +-- !query 269 schema +struct<> +-- !query 269 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != FloatType; line 1 pos 50 + + +-- !query 270 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as double)) FROM t +-- !query 270 schema +struct<> +-- !query 270 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: TimestampType != DoubleType; line 1 pos 50 + + +-- !query 271 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as decimal(10, 0))) FROM t +-- !query 271 schema +struct<> +-- !query 271 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: TimestampType != DecimalType(10,0); line 1 pos 50 + + +-- !query 272 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as string)) FROM t +-- !query 272 schema +struct<(CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS STRING) IN (CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 272 output +true + + +-- !query 273 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast('1' as binary)) FROM t +-- !query 273 schema +struct<> +-- !query 273 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: TimestampType != BinaryType; line 1 pos 50 + + +-- !query 274 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast(1 as boolean)) FROM t +-- !query 274 schema +struct<> +-- !query 274 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: TimestampType != 
BooleanType; line 1 pos 50 + + +-- !query 275 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 275 schema +struct<(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) IN (CAST(2017-12-12 09:30:00.0 AS TIMESTAMP), CAST(2017-12-11 09:30:00.0 AS TIMESTAMP))):boolean> +-- !query 275 output +true + + +-- !query 276 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00.0' as timestamp), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 276 schema +struct<(CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS TIMESTAMP) IN (CAST(CAST(2017-12-12 09:30:00.0 AS TIMESTAMP) AS TIMESTAMP), CAST(CAST(2017-12-11 09:30:00 AS DATE) AS TIMESTAMP))):boolean> +-- !query 276 output +true + + +-- !query 277 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as tinyint)) FROM t +-- !query 277 schema +struct<> +-- !query 277 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: DateType != ByteType; line 1 pos 43 + + +-- !query 278 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as smallint)) FROM t +-- !query 278 schema +struct<> +-- !query 278 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: DateType != ShortType; line 1 pos 43 + + +-- !query 279 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as int)) FROM t +-- !query 279 schema +struct<> +-- !query 279 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: DateType != IntegerType; line 1 pos 43 + + +-- !query 280 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as bigint)) FROM t +-- !query 280 schema +struct<> +-- !query 280 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: DateType != LongType; line 1 pos 43 + + +-- !query 281 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as float)) FROM t +-- !query 281 schema +struct<> +-- !query 281 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: DateType != FloatType; line 1 pos 43 + + +-- !query 282 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as double)) FROM t +-- !query 282 schema +struct<> +-- !query 282 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: DateType != DoubleType; line 1 pos 43 + + +-- !query 283 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as 
date), cast(1 as decimal(10, 0))) FROM t +-- !query 283 schema +struct<> +-- !query 283 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: DateType != DecimalType(10,0); line 1 pos 43 + + +-- !query 284 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as string)) FROM t +-- !query 284 schema +struct<(CAST(CAST(2017-12-12 09:30:00 AS DATE) AS STRING) IN (CAST(CAST(2017-12-12 09:30:00 AS DATE) AS STRING), CAST(CAST(1 AS STRING) AS STRING))):boolean> +-- !query 284 output +true + + +-- !query 285 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast('1' as binary)) FROM t +-- !query 285 schema +struct<> +-- !query 285 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DateType != BinaryType; line 1 pos 43 + + +-- !query 286 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast(1 as boolean)) FROM t +-- !query 286 schema +struct<> +-- !query 286 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DateType != BooleanType; line 1 pos 43 + + +-- !query 287 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 287 schema +struct<(CAST(CAST(2017-12-12 09:30:00 AS DATE) AS TIMESTAMP) IN (CAST(CAST(2017-12-12 09:30:00 AS DATE) AS TIMESTAMP), CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS TIMESTAMP))):boolean> +-- !query 287 output +true + + +-- !query 288 +SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as date), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 288 schema +struct<(CAST(2017-12-12 09:30:00 AS DATE) IN (CAST(2017-12-12 09:30:00 AS DATE), CAST(2017-12-11 09:30:00 AS DATE))):boolean> +-- !query 288 output +true diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out new file mode 100644 index 0000000000000..0beb1f6263d2c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/promoteStrings.sql.out @@ -0,0 +1,2578 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 316 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT '1' + cast(1 as tinyint) FROM t +-- !query 1 schema +struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 1 output +2.0 + + +-- !query 2 +SELECT '1' + cast(1 as smallint) FROM t +-- !query 2 schema +struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 2 output +2.0 + + +-- !query 3 +SELECT '1' + cast(1 as int) FROM t +-- !query 3 schema +struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 3 output +2.0 + + +-- !query 4 +SELECT '1' + cast(1 as bigint) FROM t +-- !query 4 schema +struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- 
!query 4 output +2.0 + + +-- !query 5 +SELECT '1' + cast(1 as float) FROM t +-- !query 5 schema +struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 5 output +2.0 + + +-- !query 6 +SELECT '1' + cast(1 as double) FROM t +-- !query 6 schema +struct<(CAST(1 AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 6 output +2.0 + + +-- !query 7 +SELECT '1' + cast(1 as decimal(10, 0)) FROM t +-- !query 7 schema +struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 7 output +2.0 + + +-- !query 8 +SELECT '1' + '1' FROM t +-- !query 8 schema +struct<(CAST(1 AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 8 output +2.0 + + +-- !query 9 +SELECT '1' + cast('1' as binary) FROM t +-- !query 9 schema +struct<> +-- !query 9 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) + CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 + + +-- !query 10 +SELECT '1' + cast(1 as boolean) FROM t +-- !query 10 schema +struct<> +-- !query 10 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) + CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 + + +-- !query 11 +SELECT '1' + cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 11 schema +struct<> +-- !query 11 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 + + +-- !query 12 +SELECT '1' + cast('2017-12-11 09:30:00' as date) FROM t +-- !query 12 schema +struct<> +-- !query 12 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) + CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) + CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 + + +-- !query 13 +SELECT '1' - cast(1 as tinyint) FROM t +-- !query 13 schema +struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 13 output +0.0 + + +-- !query 14 +SELECT '1' - cast(1 as smallint) FROM t +-- !query 14 schema +struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 14 output +0.0 + + +-- !query 15 +SELECT '1' - cast(1 as int) FROM t +-- !query 15 schema +struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 15 output +0.0 + + +-- !query 16 +SELECT '1' - cast(1 as bigint) FROM t +-- !query 16 schema +struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 16 output +0.0 + + +-- !query 17 +SELECT '1' - cast(1 as float) FROM t +-- !query 17 schema +struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 17 output +0.0 + + +-- !query 18 +SELECT '1' - cast(1 as double) FROM t +-- !query 18 schema +struct<(CAST(1 AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 18 output +0.0 + + +-- !query 19 +SELECT '1' - cast(1 as decimal(10, 0)) FROM t +-- !query 19 schema +struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 19 output +0.0 + + +-- !query 20 +SELECT '1' - '1' FROM t +-- !query 20 schema +struct<(CAST(1 AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 20 output +0.0 + + +-- !query 21 +SELECT 
'1' - cast('1' as binary) FROM t +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) - CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) - CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 + + +-- !query 22 +SELECT '1' - cast(1 as boolean) FROM t +-- !query 22 schema +struct<> +-- !query 22 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) - CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) - CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 + + +-- !query 23 +SELECT '1' - cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 23 schema +struct<> +-- !query 23 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 + + +-- !query 24 +SELECT '1' - cast('2017-12-11 09:30:00' as date) FROM t +-- !query 24 schema +struct<> +-- !query 24 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) - CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) - CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 + + +-- !query 25 +SELECT '1' * cast(1 as tinyint) FROM t +-- !query 25 schema +struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 25 output +1.0 + + +-- !query 26 +SELECT '1' * cast(1 as smallint) FROM t +-- !query 26 schema +struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 26 output +1.0 + + +-- !query 27 +SELECT '1' * cast(1 as int) FROM t +-- !query 27 schema +struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 27 output +1.0 + + +-- !query 28 +SELECT '1' * cast(1 as bigint) FROM t +-- !query 28 schema +struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 28 output +1.0 + + +-- !query 29 +SELECT '1' * cast(1 as float) FROM t +-- !query 29 schema +struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 29 output +1.0 + + +-- !query 30 +SELECT '1' * cast(1 as double) FROM t +-- !query 30 schema +struct<(CAST(1 AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 30 output +1.0 + + +-- !query 31 +SELECT '1' * cast(1 as decimal(10, 0)) FROM t +-- !query 31 schema +struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 31 output +1.0 + + +-- !query 32 +SELECT '1' * '1' FROM t +-- !query 32 schema +struct<(CAST(1 AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 32 output +1.0 + + +-- !query 33 +SELECT '1' * cast('1' as binary) FROM t +-- !query 33 schema +struct<> +-- !query 33 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) * CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) * CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 + + +-- !query 34 +SELECT '1' * cast(1 as boolean) FROM t +-- !query 34 schema +struct<> +-- !query 34 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) * CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) * CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 + + +-- !query 35 +SELECT '1' * cast('2017-12-11 
09:30:00.0' as timestamp) FROM t +-- !query 35 schema +struct<> +-- !query 35 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) * CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) * CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 + + +-- !query 36 +SELECT '1' * cast('2017-12-11 09:30:00' as date) FROM t +-- !query 36 schema +struct<> +-- !query 36 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) * CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) * CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 + + +-- !query 37 +SELECT '1' / cast(1 as tinyint) FROM t +-- !query 37 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 37 output +1.0 + + +-- !query 38 +SELECT '1' / cast(1 as smallint) FROM t +-- !query 38 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 38 output +1.0 + + +-- !query 39 +SELECT '1' / cast(1 as int) FROM t +-- !query 39 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 39 output +1.0 + + +-- !query 40 +SELECT '1' / cast(1 as bigint) FROM t +-- !query 40 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 40 output +1.0 + + +-- !query 41 +SELECT '1' / cast(1 as float) FROM t +-- !query 41 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 41 output +1.0 + + +-- !query 42 +SELECT '1' / cast(1 as double) FROM t +-- !query 42 schema +struct<(CAST(1 AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 42 output +1.0 + + +-- !query 43 +SELECT '1' / cast(1 as decimal(10, 0)) FROM t +-- !query 43 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 43 output +1.0 + + +-- !query 44 +SELECT '1' / '1' FROM t +-- !query 44 schema +struct<(CAST(1 AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 44 output +1.0 + + +-- !query 45 +SELECT '1' / cast('1' as binary) FROM t +-- !query 45 schema +struct<> +-- !query 45 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) / CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 + + +-- !query 46 +SELECT '1' / cast(1 as boolean) FROM t +-- !query 46 schema +struct<> +-- !query 46 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) / CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 + + +-- !query 47 +SELECT '1' / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 47 schema +struct<> +-- !query 47 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 + + +-- !query 48 +SELECT '1' / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 48 schema +struct<> +-- !query 48 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) / CAST('2017-12-11 
09:30:00' AS DATE))' (double and date).; line 1 pos 7 + + +-- !query 49 +SELECT '1' % cast(1 as tinyint) FROM t +-- !query 49 schema +struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 49 output +0.0 + + +-- !query 50 +SELECT '1' % cast(1 as smallint) FROM t +-- !query 50 schema +struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 50 output +0.0 + + +-- !query 51 +SELECT '1' % cast(1 as int) FROM t +-- !query 51 schema +struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 51 output +0.0 + + +-- !query 52 +SELECT '1' % cast(1 as bigint) FROM t +-- !query 52 schema +struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 52 output +0.0 + + +-- !query 53 +SELECT '1' % cast(1 as float) FROM t +-- !query 53 schema +struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 53 output +0.0 + + +-- !query 54 +SELECT '1' % cast(1 as double) FROM t +-- !query 54 schema +struct<(CAST(1 AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 54 output +0.0 + + +-- !query 55 +SELECT '1' % cast(1 as decimal(10, 0)) FROM t +-- !query 55 schema +struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 55 output +0.0 + + +-- !query 56 +SELECT '1' % '1' FROM t +-- !query 56 schema +struct<(CAST(1 AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 56 output +0.0 + + +-- !query 57 +SELECT '1' % cast('1' as binary) FROM t +-- !query 57 schema +struct<> +-- !query 57 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) % CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) % CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 + + +-- !query 58 +SELECT '1' % cast(1 as boolean) FROM t +-- !query 58 schema +struct<> +-- !query 58 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) % CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) % CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 + + +-- !query 59 +SELECT '1' % cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 59 schema +struct<> +-- !query 59 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 + + +-- !query 60 +SELECT '1' % cast('2017-12-11 09:30:00' as date) FROM t +-- !query 60 schema +struct<> +-- !query 60 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS DOUBLE) % CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS DOUBLE) % CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 + + +-- !query 61 +SELECT pmod('1', cast(1 as tinyint)) FROM t +-- !query 61 schema +struct +-- !query 61 output +0.0 + + +-- !query 62 +SELECT pmod('1', cast(1 as smallint)) FROM t +-- !query 62 schema +struct +-- !query 62 output +0.0 + + +-- !query 63 +SELECT pmod('1', cast(1 as int)) FROM t +-- !query 63 schema +struct +-- !query 63 output +0.0 + + +-- !query 64 +SELECT pmod('1', cast(1 as bigint)) FROM t +-- !query 64 schema +struct +-- !query 64 output +0.0 + + +-- !query 65 +SELECT pmod('1', cast(1 as float)) FROM t +-- !query 65 schema +struct +-- !query 65 output +0.0 + + +-- !query 66 +SELECT pmod('1', cast(1 as double)) FROM t +-- 
!query 66 schema +struct +-- !query 66 output +0.0 + + +-- !query 67 +SELECT pmod('1', cast(1 as decimal(10, 0))) FROM t +-- !query 67 schema +struct +-- !query 67 output +0.0 + + +-- !query 68 +SELECT pmod('1', '1') FROM t +-- !query 68 schema +struct +-- !query 68 output +0.0 + + +-- !query 69 +SELECT pmod('1', cast('1' as binary)) FROM t +-- !query 69 schema +struct<> +-- !query 69 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('1' AS DOUBLE), CAST('1' AS BINARY))' due to data type mismatch: differing types in 'pmod(CAST('1' AS DOUBLE), CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 + + +-- !query 70 +SELECT pmod('1', cast(1 as boolean)) FROM t +-- !query 70 schema +struct<> +-- !query 70 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('1' AS DOUBLE), CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in 'pmod(CAST('1' AS DOUBLE), CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 + + +-- !query 71 +SELECT pmod('1', cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 71 schema +struct<> +-- !query 71 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('1' AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 'pmod(CAST('1' AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 + + +-- !query 72 +SELECT pmod('1', cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 72 schema +struct<> +-- !query 72 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('1' AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 'pmod(CAST('1' AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 + + +-- !query 73 +SELECT cast(1 as tinyint) + '1' FROM t +-- !query 73 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 73 output +2.0 + + +-- !query 74 +SELECT cast(1 as smallint) + '1' FROM t +-- !query 74 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 74 output +2.0 + + +-- !query 75 +SELECT cast(1 as int) + '1' FROM t +-- !query 75 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 75 output +2.0 + + +-- !query 76 +SELECT cast(1 as bigint) + '1' FROM t +-- !query 76 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 76 output +2.0 + + +-- !query 77 +SELECT cast(1 as float) + '1' FROM t +-- !query 77 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 77 output +2.0 + + +-- !query 78 +SELECT cast(1 as double) + '1' FROM t +-- !query 78 schema +struct<(CAST(1 AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 78 output +2.0 + + +-- !query 79 +SELECT cast(1 as decimal(10, 0)) + '1' FROM t +-- !query 79 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 79 output +2.0 + + +-- !query 80 +SELECT cast('1' as binary) + '1' FROM t +-- !query 80 schema +struct<> +-- !query 80 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) + CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 + + +-- !query 81 +SELECT cast(1 as boolean) + '1' FROM t +-- !query 81 schema +struct<> +-- !query 81 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) 
+ CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) + CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 + + +-- !query 82 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) + '1' FROM t +-- !query 82 schema +struct<> +-- !query 82 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 + + +-- !query 83 +SELECT cast('2017-12-11 09:30:00' as date) + '1' FROM t +-- !query 83 schema +struct<> +-- !query 83 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) + CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) + CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 + + +-- !query 84 +SELECT cast(1 as tinyint) - '1' FROM t +-- !query 84 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 84 output +0.0 + + +-- !query 85 +SELECT cast(1 as smallint) - '1' FROM t +-- !query 85 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 85 output +0.0 + + +-- !query 86 +SELECT cast(1 as int) - '1' FROM t +-- !query 86 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 86 output +0.0 + + +-- !query 87 +SELECT cast(1 as bigint) - '1' FROM t +-- !query 87 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 87 output +0.0 + + +-- !query 88 +SELECT cast(1 as float) - '1' FROM t +-- !query 88 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 88 output +0.0 + + +-- !query 89 +SELECT cast(1 as double) - '1' FROM t +-- !query 89 schema +struct<(CAST(1 AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 89 output +0.0 + + +-- !query 90 +SELECT cast(1 as decimal(10, 0)) - '1' FROM t +-- !query 90 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 90 output +0.0 + + +-- !query 91 +SELECT cast('1' as binary) - '1' FROM t +-- !query 91 schema +struct<> +-- !query 91 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) - CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 + + +-- !query 92 +SELECT cast(1 as boolean) - '1' FROM t +-- !query 92 schema +struct<> +-- !query 92 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) - CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) - CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 + + +-- !query 93 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) - '1' FROM t +-- !query 93 schema +struct<> +-- !query 93 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 + + +-- !query 94 +SELECT cast('2017-12-11 09:30:00' as date) - '1' FROM t +-- !query 94 schema +struct<> +-- !query 94 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) - CAST('1' AS DOUBLE))' due to data type mismatch: differing 
types in '(CAST('2017-12-11 09:30:00' AS DATE) - CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 + + +-- !query 95 +SELECT cast(1 as tinyint) * '1' FROM t +-- !query 95 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 95 output +1.0 + + +-- !query 96 +SELECT cast(1 as smallint) * '1' FROM t +-- !query 96 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 96 output +1.0 + + +-- !query 97 +SELECT cast(1 as int) * '1' FROM t +-- !query 97 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 97 output +1.0 + + +-- !query 98 +SELECT cast(1 as bigint) * '1' FROM t +-- !query 98 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 98 output +1.0 + + +-- !query 99 +SELECT cast(1 as float) * '1' FROM t +-- !query 99 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 99 output +1.0 + + +-- !query 100 +SELECT cast(1 as double) * '1' FROM t +-- !query 100 schema +struct<(CAST(1 AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 100 output +1.0 + + +-- !query 101 +SELECT cast(1 as decimal(10, 0)) * '1' FROM t +-- !query 101 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 101 output +1.0 + + +-- !query 102 +SELECT cast('1' as binary) * '1' FROM t +-- !query 102 schema +struct<> +-- !query 102 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) * CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) * CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 + + +-- !query 103 +SELECT cast(1 as boolean) * '1' FROM t +-- !query 103 schema +struct<> +-- !query 103 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) * CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) * CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 + + +-- !query 104 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) * '1' FROM t +-- !query 104 schema +struct<> +-- !query 104 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) * CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) * CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 + + +-- !query 105 +SELECT cast('2017-12-11 09:30:00' as date) * '1' FROM t +-- !query 105 schema +struct<> +-- !query 105 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) * CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) * CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 + + +-- !query 106 +SELECT cast(1 as tinyint) / '1' FROM t +-- !query 106 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> +-- !query 106 output +1.0 + + +-- !query 107 +SELECT cast(1 as smallint) / '1' FROM t +-- !query 107 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> +-- !query 107 output +1.0 + + +-- !query 108 +SELECT cast(1 as int) / '1' FROM t +-- !query 108 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> +-- !query 108 output +1.0 + + +-- !query 109 +SELECT cast(1 as bigint) / '1' FROM t +-- !query 109 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS 
DOUBLE) AS DOUBLE)):double> +-- !query 109 output +1.0 + + +-- !query 110 +SELECT cast(1 as float) / '1' FROM t +-- !query 110 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> +-- !query 110 output +1.0 + + +-- !query 111 +SELECT cast(1 as double) / '1' FROM t +-- !query 111 schema +struct<(CAST(1 AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 111 output +1.0 + + +-- !query 112 +SELECT cast(1 as decimal(10, 0)) / '1' FROM t +-- !query 112 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 112 output +1.0 + + +-- !query 113 +SELECT cast('1' as binary) / '1' FROM t +-- !query 113 schema +struct<> +-- !query 113 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 + + +-- !query 114 +SELECT cast(1 as boolean) / '1' FROM t +-- !query 114 schema +struct<> +-- !query 114 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 + + +-- !query 115 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / '1' FROM t +-- !query 115 schema +struct<> +-- !query 115 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 + + +-- !query 116 +SELECT cast('2017-12-11 09:30:00' as date) / '1' FROM t +-- !query 116 schema +struct<> +-- !query 116 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 + + +-- !query 117 +SELECT cast(1 as tinyint) % '1' FROM t +-- !query 117 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 117 output +0.0 + + +-- !query 118 +SELECT cast(1 as smallint) % '1' FROM t +-- !query 118 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 118 output +0.0 + + +-- !query 119 +SELECT cast(1 as int) % '1' FROM t +-- !query 119 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 119 output +0.0 + + +-- !query 120 +SELECT cast(1 as bigint) % '1' FROM t +-- !query 120 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 120 output +0.0 + + +-- !query 121 +SELECT cast(1 as float) % '1' FROM t +-- !query 121 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 121 output +0.0 + + +-- !query 122 +SELECT cast(1 as double) % '1' FROM t +-- !query 122 schema +struct<(CAST(1 AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 122 output +0.0 + + +-- !query 123 +SELECT cast(1 as decimal(10, 0)) % '1' FROM t +-- !query 123 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 123 output +0.0 + + +-- !query 124 +SELECT cast('1' as binary) % '1' FROM t +-- !query 124 schema +struct<> +-- !query 124 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) % CAST('1' AS 
DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) % CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 + + +-- !query 125 +SELECT cast(1 as boolean) % '1' FROM t +-- !query 125 schema +struct<> +-- !query 125 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) % CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) % CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 + + +-- !query 126 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) % '1' FROM t +-- !query 126 schema +struct<> +-- !query 126 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 + + +-- !query 127 +SELECT cast('2017-12-11 09:30:00' as date) % '1' FROM t +-- !query 127 schema +struct<> +-- !query 127 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) % CAST('1' AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) % CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 + + +-- !query 128 +SELECT pmod(cast(1 as tinyint), '1') FROM t +-- !query 128 schema +struct +-- !query 128 output +0.0 + + +-- !query 129 +SELECT pmod(cast(1 as smallint), '1') FROM t +-- !query 129 schema +struct +-- !query 129 output +0.0 + + +-- !query 130 +SELECT pmod(cast(1 as int), '1') FROM t +-- !query 130 schema +struct +-- !query 130 output +0.0 + + +-- !query 131 +SELECT pmod(cast(1 as bigint), '1') FROM t +-- !query 131 schema +struct +-- !query 131 output +0.0 + + +-- !query 132 +SELECT pmod(cast(1 as float), '1') FROM t +-- !query 132 schema +struct +-- !query 132 output +0.0 + + +-- !query 133 +SELECT pmod(cast(1 as double), '1') FROM t +-- !query 133 schema +struct +-- !query 133 output +0.0 + + +-- !query 134 +SELECT pmod(cast(1 as decimal(10, 0)), '1') FROM t +-- !query 134 schema +struct +-- !query 134 output +0.0 + + +-- !query 135 +SELECT pmod(cast('1' as binary), '1') FROM t +-- !query 135 schema +struct<> +-- !query 135 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('1' AS BINARY), CAST('1' AS DOUBLE))' due to data type mismatch: differing types in 'pmod(CAST('1' AS BINARY), CAST('1' AS DOUBLE))' (binary and double).; line 1 pos 7 + + +-- !query 136 +SELECT pmod(cast(1 as boolean), '1') FROM t +-- !query 136 schema +struct<> +-- !query 136 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS BOOLEAN), CAST('1' AS DOUBLE))' due to data type mismatch: differing types in 'pmod(CAST(1 AS BOOLEAN), CAST('1' AS DOUBLE))' (boolean and double).; line 1 pos 7 + + +-- !query 137 +SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), '1') FROM t +-- !query 137 schema +struct<> +-- !query 137 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST('1' AS DOUBLE))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST('1' AS DOUBLE))' (timestamp and double).; line 1 pos 7 + + +-- !query 138 +SELECT pmod(cast('2017-12-11 09:30:00' as date), '1') FROM t +-- !query 138 schema +struct<> +-- !query 138 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST('1' AS DOUBLE))' due to data type mismatch: differing 
types in 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST('1' AS DOUBLE))' (date and double).; line 1 pos 7 + + +-- !query 139 +SELECT '1' = cast(1 as tinyint) FROM t +-- !query 139 schema +struct<(CAST(1 AS TINYINT) = CAST(1 AS TINYINT)):boolean> +-- !query 139 output +true + + +-- !query 140 +SELECT '1' = cast(1 as smallint) FROM t +-- !query 140 schema +struct<(CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> +-- !query 140 output +true + + +-- !query 141 +SELECT '1' = cast(1 as int) FROM t +-- !query 141 schema +struct<(CAST(1 AS INT) = CAST(1 AS INT)):boolean> +-- !query 141 output +true + + +-- !query 142 +SELECT '1' = cast(1 as bigint) FROM t +-- !query 142 schema +struct<(CAST(1 AS BIGINT) = CAST(1 AS BIGINT)):boolean> +-- !query 142 output +true + + +-- !query 143 +SELECT '1' = cast(1 as float) FROM t +-- !query 143 schema +struct<(CAST(1 AS FLOAT) = CAST(1 AS FLOAT)):boolean> +-- !query 143 output +true + + +-- !query 144 +SELECT '1' = cast(1 as double) FROM t +-- !query 144 schema +struct<(CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 144 output +true + + +-- !query 145 +SELECT '1' = cast(1 as decimal(10, 0)) FROM t +-- !query 145 schema +struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 145 output +true + + +-- !query 146 +SELECT '1' = '1' FROM t +-- !query 146 schema +struct<(1 = 1):boolean> +-- !query 146 output +true + + +-- !query 147 +SELECT '1' = cast('1' as binary) FROM t +-- !query 147 schema +struct<(CAST(1 AS BINARY) = CAST(1 AS BINARY)):boolean> +-- !query 147 output +true + + +-- !query 148 +SELECT '1' = cast(1 as boolean) FROM t +-- !query 148 schema +struct<(CAST(1 AS BOOLEAN) = CAST(1 AS BOOLEAN)):boolean> +-- !query 148 output +true + + +-- !query 149 +SELECT '1' = cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 149 schema +struct<(CAST(1 AS TIMESTAMP) = CAST(2017-12-11 09:30:00.0 AS TIMESTAMP)):boolean> +-- !query 149 output +NULL + + +-- !query 150 +SELECT '1' = cast('2017-12-11 09:30:00' as date) FROM t +-- !query 150 schema +struct<(1 = CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING)):boolean> +-- !query 150 output +false + + +-- !query 151 +SELECT cast(1 as tinyint) = '1' FROM t +-- !query 151 schema +struct<(CAST(1 AS TINYINT) = CAST(1 AS TINYINT)):boolean> +-- !query 151 output +true + + +-- !query 152 +SELECT cast(1 as smallint) = '1' FROM t +-- !query 152 schema +struct<(CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> +-- !query 152 output +true + + +-- !query 153 +SELECT cast(1 as int) = '1' FROM t +-- !query 153 schema +struct<(CAST(1 AS INT) = CAST(1 AS INT)):boolean> +-- !query 153 output +true + + +-- !query 154 +SELECT cast(1 as bigint) = '1' FROM t +-- !query 154 schema +struct<(CAST(1 AS BIGINT) = CAST(1 AS BIGINT)):boolean> +-- !query 154 output +true + + +-- !query 155 +SELECT cast(1 as float) = '1' FROM t +-- !query 155 schema +struct<(CAST(1 AS FLOAT) = CAST(1 AS FLOAT)):boolean> +-- !query 155 output +true + + +-- !query 156 +SELECT cast(1 as double) = '1' FROM t +-- !query 156 schema +struct<(CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 156 output +true + + +-- !query 157 +SELECT cast(1 as decimal(10, 0)) = '1' FROM t +-- !query 157 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 157 output +true + + +-- !query 158 +SELECT cast('1' as binary) = '1' FROM t +-- !query 158 schema +struct<(CAST(1 AS BINARY) = CAST(1 AS BINARY)):boolean> +-- !query 158 output +true + + +-- !query 159 +SELECT cast(1 as 
boolean) = '1' FROM t +-- !query 159 schema +struct<(CAST(1 AS BOOLEAN) = CAST(1 AS BOOLEAN)):boolean> +-- !query 159 output +true + + +-- !query 160 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = '1' FROM t +-- !query 160 schema +struct<(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) = CAST(1 AS TIMESTAMP)):boolean> +-- !query 160 output +NULL + + +-- !query 161 +SELECT cast('2017-12-11 09:30:00' as date) = '1' FROM t +-- !query 161 schema +struct<(CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING) = 1):boolean> +-- !query 161 output +false + + +-- !query 162 +SELECT '1' <=> cast(1 as tinyint) FROM t +-- !query 162 schema +struct<(CAST(1 AS TINYINT) <=> CAST(1 AS TINYINT)):boolean> +-- !query 162 output +true + + +-- !query 163 +SELECT '1' <=> cast(1 as smallint) FROM t +-- !query 163 schema +struct<(CAST(1 AS SMALLINT) <=> CAST(1 AS SMALLINT)):boolean> +-- !query 163 output +true + + +-- !query 164 +SELECT '1' <=> cast(1 as int) FROM t +-- !query 164 schema +struct<(CAST(1 AS INT) <=> CAST(1 AS INT)):boolean> +-- !query 164 output +true + + +-- !query 165 +SELECT '1' <=> cast(1 as bigint) FROM t +-- !query 165 schema +struct<(CAST(1 AS BIGINT) <=> CAST(1 AS BIGINT)):boolean> +-- !query 165 output +true + + +-- !query 166 +SELECT '1' <=> cast(1 as float) FROM t +-- !query 166 schema +struct<(CAST(1 AS FLOAT) <=> CAST(1 AS FLOAT)):boolean> +-- !query 166 output +true + + +-- !query 167 +SELECT '1' <=> cast(1 as double) FROM t +-- !query 167 schema +struct<(CAST(1 AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> +-- !query 167 output +true + + +-- !query 168 +SELECT '1' <=> cast(1 as decimal(10, 0)) FROM t +-- !query 168 schema +struct<(CAST(1 AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 168 output +true + + +-- !query 169 +SELECT '1' <=> '1' FROM t +-- !query 169 schema +struct<(1 <=> 1):boolean> +-- !query 169 output +true + + +-- !query 170 +SELECT '1' <=> cast('1' as binary) FROM t +-- !query 170 schema +struct<(CAST(1 AS BINARY) <=> CAST(1 AS BINARY)):boolean> +-- !query 170 output +true + + +-- !query 171 +SELECT '1' <=> cast(1 as boolean) FROM t +-- !query 171 schema +struct<(CAST(1 AS BOOLEAN) <=> CAST(1 AS BOOLEAN)):boolean> +-- !query 171 output +true + + +-- !query 172 +SELECT '1' <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 172 schema +struct<(CAST(1 AS TIMESTAMP) <=> CAST(2017-12-11 09:30:00.0 AS TIMESTAMP)):boolean> +-- !query 172 output +false + + +-- !query 173 +SELECT '1' <=> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 173 schema +struct<(1 <=> CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING)):boolean> +-- !query 173 output +false + + +-- !query 174 +SELECT cast(1 as tinyint) <=> '1' FROM t +-- !query 174 schema +struct<(CAST(1 AS TINYINT) <=> CAST(1 AS TINYINT)):boolean> +-- !query 174 output +true + + +-- !query 175 +SELECT cast(1 as smallint) <=> '1' FROM t +-- !query 175 schema +struct<(CAST(1 AS SMALLINT) <=> CAST(1 AS SMALLINT)):boolean> +-- !query 175 output +true + + +-- !query 176 +SELECT cast(1 as int) <=> '1' FROM t +-- !query 176 schema +struct<(CAST(1 AS INT) <=> CAST(1 AS INT)):boolean> +-- !query 176 output +true + + +-- !query 177 +SELECT cast(1 as bigint) <=> '1' FROM t +-- !query 177 schema +struct<(CAST(1 AS BIGINT) <=> CAST(1 AS BIGINT)):boolean> +-- !query 177 output +true + + +-- !query 178 +SELECT cast(1 as float) <=> '1' FROM t +-- !query 178 schema +struct<(CAST(1 AS FLOAT) <=> CAST(1 AS FLOAT)):boolean> +-- !query 178 output +true + + +-- !query 179 +SELECT cast(1 as double) <=> '1' FROM t +-- 
!query 179 schema +struct<(CAST(1 AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> +-- !query 179 output +true + + +-- !query 180 +SELECT cast(1 as decimal(10, 0)) <=> '1' FROM t +-- !query 180 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> +-- !query 180 output +true + + +-- !query 181 +SELECT cast('1' as binary) <=> '1' FROM t +-- !query 181 schema +struct<(CAST(1 AS BINARY) <=> CAST(1 AS BINARY)):boolean> +-- !query 181 output +true + + +-- !query 182 +SELECT cast(1 as boolean) <=> '1' FROM t +-- !query 182 schema +struct<(CAST(1 AS BOOLEAN) <=> CAST(1 AS BOOLEAN)):boolean> +-- !query 182 output +true + + +-- !query 183 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> '1' FROM t +-- !query 183 schema +struct<(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) <=> CAST(1 AS TIMESTAMP)):boolean> +-- !query 183 output +false + + +-- !query 184 +SELECT cast('2017-12-11 09:30:00' as date) <=> '1' FROM t +-- !query 184 schema +struct<(CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING) <=> 1):boolean> +-- !query 184 output +false + + +-- !query 185 +SELECT '1' < cast(1 as tinyint) FROM t +-- !query 185 schema +struct<(CAST(1 AS TINYINT) < CAST(1 AS TINYINT)):boolean> +-- !query 185 output +false + + +-- !query 186 +SELECT '1' < cast(1 as smallint) FROM t +-- !query 186 schema +struct<(CAST(1 AS SMALLINT) < CAST(1 AS SMALLINT)):boolean> +-- !query 186 output +false + + +-- !query 187 +SELECT '1' < cast(1 as int) FROM t +-- !query 187 schema +struct<(CAST(1 AS INT) < CAST(1 AS INT)):boolean> +-- !query 187 output +false + + +-- !query 188 +SELECT '1' < cast(1 as bigint) FROM t +-- !query 188 schema +struct<(CAST(1 AS BIGINT) < CAST(1 AS BIGINT)):boolean> +-- !query 188 output +false + + +-- !query 189 +SELECT '1' < cast(1 as float) FROM t +-- !query 189 schema +struct<(CAST(1 AS FLOAT) < CAST(1 AS FLOAT)):boolean> +-- !query 189 output +false + + +-- !query 190 +SELECT '1' < cast(1 as double) FROM t +-- !query 190 schema +struct<(CAST(1 AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> +-- !query 190 output +false + + +-- !query 191 +SELECT '1' < cast(1 as decimal(10, 0)) FROM t +-- !query 191 schema +struct<(CAST(1 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 191 output +false + + +-- !query 192 +SELECT '1' < '1' FROM t +-- !query 192 schema +struct<(1 < 1):boolean> +-- !query 192 output +false + + +-- !query 193 +SELECT '1' < cast('1' as binary) FROM t +-- !query 193 schema +struct<(CAST(1 AS BINARY) < CAST(1 AS BINARY)):boolean> +-- !query 193 output +false + + +-- !query 194 +SELECT '1' < cast(1 as boolean) FROM t +-- !query 194 schema +struct<(CAST(1 AS BOOLEAN) < CAST(1 AS BOOLEAN)):boolean> +-- !query 194 output +false + + +-- !query 195 +SELECT '1' < cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 195 schema +struct<(1 < CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING)):boolean> +-- !query 195 output +true + + +-- !query 196 +SELECT '1' < cast('2017-12-11 09:30:00' as date) FROM t +-- !query 196 schema +struct<(1 < CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING)):boolean> +-- !query 196 output +true + + +-- !query 197 +SELECT '1' <= cast(1 as tinyint) FROM t +-- !query 197 schema +struct<(CAST(1 AS TINYINT) <= CAST(1 AS TINYINT)):boolean> +-- !query 197 output +true + + +-- !query 198 +SELECT '1' <= cast(1 as smallint) FROM t +-- !query 198 schema +struct<(CAST(1 AS SMALLINT) <= CAST(1 AS SMALLINT)):boolean> +-- !query 198 output +true + + +-- !query 199 +SELECT '1' <= cast(1 as int) FROM t +-- !query 199 schema 
+struct<(CAST(1 AS INT) <= CAST(1 AS INT)):boolean> +-- !query 199 output +true + + +-- !query 200 +SELECT '1' <= cast(1 as bigint) FROM t +-- !query 200 schema +struct<(CAST(1 AS BIGINT) <= CAST(1 AS BIGINT)):boolean> +-- !query 200 output +true + + +-- !query 201 +SELECT '1' <= cast(1 as float) FROM t +-- !query 201 schema +struct<(CAST(1 AS FLOAT) <= CAST(1 AS FLOAT)):boolean> +-- !query 201 output +true + + +-- !query 202 +SELECT '1' <= cast(1 as double) FROM t +-- !query 202 schema +struct<(CAST(1 AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> +-- !query 202 output +true + + +-- !query 203 +SELECT '1' <= cast(1 as decimal(10, 0)) FROM t +-- !query 203 schema +struct<(CAST(1 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 203 output +true + + +-- !query 204 +SELECT '1' <= '1' FROM t +-- !query 204 schema +struct<(1 <= 1):boolean> +-- !query 204 output +true + + +-- !query 205 +SELECT '1' <= cast('1' as binary) FROM t +-- !query 205 schema +struct<(CAST(1 AS BINARY) <= CAST(1 AS BINARY)):boolean> +-- !query 205 output +true + + +-- !query 206 +SELECT '1' <= cast(1 as boolean) FROM t +-- !query 206 schema +struct<(CAST(1 AS BOOLEAN) <= CAST(1 AS BOOLEAN)):boolean> +-- !query 206 output +true + + +-- !query 207 +SELECT '1' <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 207 schema +struct<(1 <= CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING)):boolean> +-- !query 207 output +true + + +-- !query 208 +SELECT '1' <= cast('2017-12-11 09:30:00' as date) FROM t +-- !query 208 schema +struct<(1 <= CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING)):boolean> +-- !query 208 output +true + + +-- !query 209 +SELECT '1' > cast(1 as tinyint) FROM t +-- !query 209 schema +struct<(CAST(1 AS TINYINT) > CAST(1 AS TINYINT)):boolean> +-- !query 209 output +false + + +-- !query 210 +SELECT '1' > cast(1 as smallint) FROM t +-- !query 210 schema +struct<(CAST(1 AS SMALLINT) > CAST(1 AS SMALLINT)):boolean> +-- !query 210 output +false + + +-- !query 211 +SELECT '1' > cast(1 as int) FROM t +-- !query 211 schema +struct<(CAST(1 AS INT) > CAST(1 AS INT)):boolean> +-- !query 211 output +false + + +-- !query 212 +SELECT '1' > cast(1 as bigint) FROM t +-- !query 212 schema +struct<(CAST(1 AS BIGINT) > CAST(1 AS BIGINT)):boolean> +-- !query 212 output +false + + +-- !query 213 +SELECT '1' > cast(1 as float) FROM t +-- !query 213 schema +struct<(CAST(1 AS FLOAT) > CAST(1 AS FLOAT)):boolean> +-- !query 213 output +false + + +-- !query 214 +SELECT '1' > cast(1 as double) FROM t +-- !query 214 schema +struct<(CAST(1 AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> +-- !query 214 output +false + + +-- !query 215 +SELECT '1' > cast(1 as decimal(10, 0)) FROM t +-- !query 215 schema +struct<(CAST(1 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 215 output +false + + +-- !query 216 +SELECT '1' > '1' FROM t +-- !query 216 schema +struct<(1 > 1):boolean> +-- !query 216 output +false + + +-- !query 217 +SELECT '1' > cast('1' as binary) FROM t +-- !query 217 schema +struct<(CAST(1 AS BINARY) > CAST(1 AS BINARY)):boolean> +-- !query 217 output +false + + +-- !query 218 +SELECT '1' > cast(1 as boolean) FROM t +-- !query 218 schema +struct<(CAST(1 AS BOOLEAN) > CAST(1 AS BOOLEAN)):boolean> +-- !query 218 output +false + + +-- !query 219 +SELECT '1' > cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 219 schema +struct<(1 > CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING)):boolean> +-- !query 219 output +false + + +-- !query 220 +SELECT '1' > 
cast('2017-12-11 09:30:00' as date) FROM t +-- !query 220 schema +struct<(1 > CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING)):boolean> +-- !query 220 output +false + + +-- !query 221 +SELECT '1' >= cast(1 as tinyint) FROM t +-- !query 221 schema +struct<(CAST(1 AS TINYINT) >= CAST(1 AS TINYINT)):boolean> +-- !query 221 output +true + + +-- !query 222 +SELECT '1' >= cast(1 as smallint) FROM t +-- !query 222 schema +struct<(CAST(1 AS SMALLINT) >= CAST(1 AS SMALLINT)):boolean> +-- !query 222 output +true + + +-- !query 223 +SELECT '1' >= cast(1 as int) FROM t +-- !query 223 schema +struct<(CAST(1 AS INT) >= CAST(1 AS INT)):boolean> +-- !query 223 output +true + + +-- !query 224 +SELECT '1' >= cast(1 as bigint) FROM t +-- !query 224 schema +struct<(CAST(1 AS BIGINT) >= CAST(1 AS BIGINT)):boolean> +-- !query 224 output +true + + +-- !query 225 +SELECT '1' >= cast(1 as float) FROM t +-- !query 225 schema +struct<(CAST(1 AS FLOAT) >= CAST(1 AS FLOAT)):boolean> +-- !query 225 output +true + + +-- !query 226 +SELECT '1' >= cast(1 as double) FROM t +-- !query 226 schema +struct<(CAST(1 AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> +-- !query 226 output +true + + +-- !query 227 +SELECT '1' >= cast(1 as decimal(10, 0)) FROM t +-- !query 227 schema +struct<(CAST(1 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 227 output +true + + +-- !query 228 +SELECT '1' >= '1' FROM t +-- !query 228 schema +struct<(1 >= 1):boolean> +-- !query 228 output +true + + +-- !query 229 +SELECT '1' >= cast('1' as binary) FROM t +-- !query 229 schema +struct<(CAST(1 AS BINARY) >= CAST(1 AS BINARY)):boolean> +-- !query 229 output +true + + +-- !query 230 +SELECT '1' >= cast(1 as boolean) FROM t +-- !query 230 schema +struct<(CAST(1 AS BOOLEAN) >= CAST(1 AS BOOLEAN)):boolean> +-- !query 230 output +true + + +-- !query 231 +SELECT '1' >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 231 schema +struct<(1 >= CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING)):boolean> +-- !query 231 output +false + + +-- !query 232 +SELECT '1' >= cast('2017-12-11 09:30:00' as date) FROM t +-- !query 232 schema +struct<(1 >= CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING)):boolean> +-- !query 232 output +false + + +-- !query 233 +SELECT '1' <> cast(1 as tinyint) FROM t +-- !query 233 schema +struct<(NOT (CAST(1 AS TINYINT) = CAST(1 AS TINYINT))):boolean> +-- !query 233 output +false + + +-- !query 234 +SELECT '1' <> cast(1 as smallint) FROM t +-- !query 234 schema +struct<(NOT (CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT))):boolean> +-- !query 234 output +false + + +-- !query 235 +SELECT '1' <> cast(1 as int) FROM t +-- !query 235 schema +struct<(NOT (CAST(1 AS INT) = CAST(1 AS INT))):boolean> +-- !query 235 output +false + + +-- !query 236 +SELECT '1' <> cast(1 as bigint) FROM t +-- !query 236 schema +struct<(NOT (CAST(1 AS BIGINT) = CAST(1 AS BIGINT))):boolean> +-- !query 236 output +false + + +-- !query 237 +SELECT '1' <> cast(1 as float) FROM t +-- !query 237 schema +struct<(NOT (CAST(1 AS FLOAT) = CAST(1 AS FLOAT))):boolean> +-- !query 237 output +false + + +-- !query 238 +SELECT '1' <> cast(1 as double) FROM t +-- !query 238 schema +struct<(NOT (CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> +-- !query 238 output +false + + +-- !query 239 +SELECT '1' <> cast(1 as decimal(10, 0)) FROM t +-- !query 239 schema +struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> +-- !query 239 output +false + + +-- !query 240 +SELECT '1' <> '1' FROM t +-- !query 240 schema 
+struct<(NOT (1 = 1)):boolean> +-- !query 240 output +false + + +-- !query 241 +SELECT '1' <> cast('1' as binary) FROM t +-- !query 241 schema +struct<(NOT (CAST(1 AS BINARY) = CAST(1 AS BINARY))):boolean> +-- !query 241 output +false + + +-- !query 242 +SELECT '1' <> cast(1 as boolean) FROM t +-- !query 242 schema +struct<(NOT (CAST(1 AS BOOLEAN) = CAST(1 AS BOOLEAN))):boolean> +-- !query 242 output +false + + +-- !query 243 +SELECT '1' <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 243 schema +struct<(NOT (CAST(1 AS TIMESTAMP) = CAST(2017-12-11 09:30:00.0 AS TIMESTAMP))):boolean> +-- !query 243 output +NULL + + +-- !query 244 +SELECT '1' <> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 244 schema +struct<(NOT (1 = CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING))):boolean> +-- !query 244 output +true + + +-- !query 245 +SELECT cast(1 as tinyint) < '1' FROM t +-- !query 245 schema +struct<(CAST(1 AS TINYINT) < CAST(1 AS TINYINT)):boolean> +-- !query 245 output +false + + +-- !query 246 +SELECT cast(1 as smallint) < '1' FROM t +-- !query 246 schema +struct<(CAST(1 AS SMALLINT) < CAST(1 AS SMALLINT)):boolean> +-- !query 246 output +false + + +-- !query 247 +SELECT cast(1 as int) < '1' FROM t +-- !query 247 schema +struct<(CAST(1 AS INT) < CAST(1 AS INT)):boolean> +-- !query 247 output +false + + +-- !query 248 +SELECT cast(1 as bigint) < '1' FROM t +-- !query 248 schema +struct<(CAST(1 AS BIGINT) < CAST(1 AS BIGINT)):boolean> +-- !query 248 output +false + + +-- !query 249 +SELECT cast(1 as float) < '1' FROM t +-- !query 249 schema +struct<(CAST(1 AS FLOAT) < CAST(1 AS FLOAT)):boolean> +-- !query 249 output +false + + +-- !query 250 +SELECT cast(1 as double) < '1' FROM t +-- !query 250 schema +struct<(CAST(1 AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> +-- !query 250 output +false + + +-- !query 251 +SELECT cast(1 as decimal(10, 0)) < '1' FROM t +-- !query 251 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> +-- !query 251 output +false + + +-- !query 252 +SELECT '1' < '1' FROM t +-- !query 252 schema +struct<(1 < 1):boolean> +-- !query 252 output +false + + +-- !query 253 +SELECT cast('1' as binary) < '1' FROM t +-- !query 253 schema +struct<(CAST(1 AS BINARY) < CAST(1 AS BINARY)):boolean> +-- !query 253 output +false + + +-- !query 254 +SELECT cast(1 as boolean) < '1' FROM t +-- !query 254 schema +struct<(CAST(1 AS BOOLEAN) < CAST(1 AS BOOLEAN)):boolean> +-- !query 254 output +false + + +-- !query 255 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) < '1' FROM t +-- !query 255 schema +struct<(CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING) < 1):boolean> +-- !query 255 output +false + + +-- !query 256 +SELECT cast('2017-12-11 09:30:00' as date) < '1' FROM t +-- !query 256 schema +struct<(CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING) < 1):boolean> +-- !query 256 output +false + + +-- !query 257 +SELECT cast(1 as tinyint) <= '1' FROM t +-- !query 257 schema +struct<(CAST(1 AS TINYINT) <= CAST(1 AS TINYINT)):boolean> +-- !query 257 output +true + + +-- !query 258 +SELECT cast(1 as smallint) <= '1' FROM t +-- !query 258 schema +struct<(CAST(1 AS SMALLINT) <= CAST(1 AS SMALLINT)):boolean> +-- !query 258 output +true + + +-- !query 259 +SELECT cast(1 as int) <= '1' FROM t +-- !query 259 schema +struct<(CAST(1 AS INT) <= CAST(1 AS INT)):boolean> +-- !query 259 output +true + + +-- !query 260 +SELECT cast(1 as bigint) <= '1' FROM t +-- !query 260 schema +struct<(CAST(1 AS BIGINT) <= CAST(1 AS BIGINT)):boolean> +-- !query 
260 output +true + + +-- !query 261 +SELECT cast(1 as float) <= '1' FROM t +-- !query 261 schema +struct<(CAST(1 AS FLOAT) <= CAST(1 AS FLOAT)):boolean> +-- !query 261 output +true + + +-- !query 262 +SELECT cast(1 as double) <= '1' FROM t +-- !query 262 schema +struct<(CAST(1 AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> +-- !query 262 output +true + + +-- !query 263 +SELECT cast(1 as decimal(10, 0)) <= '1' FROM t +-- !query 263 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> +-- !query 263 output +true + + +-- !query 264 +SELECT '1' <= '1' FROM t +-- !query 264 schema +struct<(1 <= 1):boolean> +-- !query 264 output +true + + +-- !query 265 +SELECT cast('1' as binary) <= '1' FROM t +-- !query 265 schema +struct<(CAST(1 AS BINARY) <= CAST(1 AS BINARY)):boolean> +-- !query 265 output +true + + +-- !query 266 +SELECT cast(1 as boolean) <= '1' FROM t +-- !query 266 schema +struct<(CAST(1 AS BOOLEAN) <= CAST(1 AS BOOLEAN)):boolean> +-- !query 266 output +true + + +-- !query 267 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= '1' FROM t +-- !query 267 schema +struct<(CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING) <= 1):boolean> +-- !query 267 output +false + + +-- !query 268 +SELECT cast('2017-12-11 09:30:00' as date) <= '1' FROM t +-- !query 268 schema +struct<(CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING) <= 1):boolean> +-- !query 268 output +false + + +-- !query 269 +SELECT cast(1 as tinyint) > '1' FROM t +-- !query 269 schema +struct<(CAST(1 AS TINYINT) > CAST(1 AS TINYINT)):boolean> +-- !query 269 output +false + + +-- !query 270 +SELECT cast(1 as smallint) > '1' FROM t +-- !query 270 schema +struct<(CAST(1 AS SMALLINT) > CAST(1 AS SMALLINT)):boolean> +-- !query 270 output +false + + +-- !query 271 +SELECT cast(1 as int) > '1' FROM t +-- !query 271 schema +struct<(CAST(1 AS INT) > CAST(1 AS INT)):boolean> +-- !query 271 output +false + + +-- !query 272 +SELECT cast(1 as bigint) > '1' FROM t +-- !query 272 schema +struct<(CAST(1 AS BIGINT) > CAST(1 AS BIGINT)):boolean> +-- !query 272 output +false + + +-- !query 273 +SELECT cast(1 as float) > '1' FROM t +-- !query 273 schema +struct<(CAST(1 AS FLOAT) > CAST(1 AS FLOAT)):boolean> +-- !query 273 output +false + + +-- !query 274 +SELECT cast(1 as double) > '1' FROM t +-- !query 274 schema +struct<(CAST(1 AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> +-- !query 274 output +false + + +-- !query 275 +SELECT cast(1 as decimal(10, 0)) > '1' FROM t +-- !query 275 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> +-- !query 275 output +false + + +-- !query 276 +SELECT '1' > '1' FROM t +-- !query 276 schema +struct<(1 > 1):boolean> +-- !query 276 output +false + + +-- !query 277 +SELECT cast('1' as binary) > '1' FROM t +-- !query 277 schema +struct<(CAST(1 AS BINARY) > CAST(1 AS BINARY)):boolean> +-- !query 277 output +false + + +-- !query 278 +SELECT cast(1 as boolean) > '1' FROM t +-- !query 278 schema +struct<(CAST(1 AS BOOLEAN) > CAST(1 AS BOOLEAN)):boolean> +-- !query 278 output +false + + +-- !query 279 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) > '1' FROM t +-- !query 279 schema +struct<(CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING) > 1):boolean> +-- !query 279 output +true + + +-- !query 280 +SELECT cast('2017-12-11 09:30:00' as date) > '1' FROM t +-- !query 280 schema +struct<(CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING) > 1):boolean> +-- !query 280 output +true + + +-- !query 281 +SELECT cast(1 as tinyint) >= '1' FROM t +-- !query 
281 schema +struct<(CAST(1 AS TINYINT) >= CAST(1 AS TINYINT)):boolean> +-- !query 281 output +true + + +-- !query 282 +SELECT cast(1 as smallint) >= '1' FROM t +-- !query 282 schema +struct<(CAST(1 AS SMALLINT) >= CAST(1 AS SMALLINT)):boolean> +-- !query 282 output +true + + +-- !query 283 +SELECT cast(1 as int) >= '1' FROM t +-- !query 283 schema +struct<(CAST(1 AS INT) >= CAST(1 AS INT)):boolean> +-- !query 283 output +true + + +-- !query 284 +SELECT cast(1 as bigint) >= '1' FROM t +-- !query 284 schema +struct<(CAST(1 AS BIGINT) >= CAST(1 AS BIGINT)):boolean> +-- !query 284 output +true + + +-- !query 285 +SELECT cast(1 as float) >= '1' FROM t +-- !query 285 schema +struct<(CAST(1 AS FLOAT) >= CAST(1 AS FLOAT)):boolean> +-- !query 285 output +true + + +-- !query 286 +SELECT cast(1 as double) >= '1' FROM t +-- !query 286 schema +struct<(CAST(1 AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> +-- !query 286 output +true + + +-- !query 287 +SELECT cast(1 as decimal(10, 0)) >= '1' FROM t +-- !query 287 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> +-- !query 287 output +true + + +-- !query 288 +SELECT '1' >= '1' FROM t +-- !query 288 schema +struct<(1 >= 1):boolean> +-- !query 288 output +true + + +-- !query 289 +SELECT cast('1' as binary) >= '1' FROM t +-- !query 289 schema +struct<(CAST(1 AS BINARY) >= CAST(1 AS BINARY)):boolean> +-- !query 289 output +true + + +-- !query 290 +SELECT cast(1 as boolean) >= '1' FROM t +-- !query 290 schema +struct<(CAST(1 AS BOOLEAN) >= CAST(1 AS BOOLEAN)):boolean> +-- !query 290 output +true + + +-- !query 291 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= '1' FROM t +-- !query 291 schema +struct<(CAST(CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) AS STRING) >= 1):boolean> +-- !query 291 output +true + + +-- !query 292 +SELECT cast('2017-12-11 09:30:00' as date) >= '1' FROM t +-- !query 292 schema +struct<(CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING) >= 1):boolean> +-- !query 292 output +true + + +-- !query 293 +SELECT cast(1 as tinyint) <> '1' FROM t +-- !query 293 schema +struct<(NOT (CAST(1 AS TINYINT) = CAST(1 AS TINYINT))):boolean> +-- !query 293 output +false + + +-- !query 294 +SELECT cast(1 as smallint) <> '1' FROM t +-- !query 294 schema +struct<(NOT (CAST(1 AS SMALLINT) = CAST(1 AS SMALLINT))):boolean> +-- !query 294 output +false + + +-- !query 295 +SELECT cast(1 as int) <> '1' FROM t +-- !query 295 schema +struct<(NOT (CAST(1 AS INT) = CAST(1 AS INT))):boolean> +-- !query 295 output +false + + +-- !query 296 +SELECT cast(1 as bigint) <> '1' FROM t +-- !query 296 schema +struct<(NOT (CAST(1 AS BIGINT) = CAST(1 AS BIGINT))):boolean> +-- !query 296 output +false + + +-- !query 297 +SELECT cast(1 as float) <> '1' FROM t +-- !query 297 schema +struct<(NOT (CAST(1 AS FLOAT) = CAST(1 AS FLOAT))):boolean> +-- !query 297 output +false + + +-- !query 298 +SELECT cast(1 as double) <> '1' FROM t +-- !query 298 schema +struct<(NOT (CAST(1 AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> +-- !query 298 output +false + + +-- !query 299 +SELECT cast(1 as decimal(10, 0)) <> '1' FROM t +-- !query 299 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> +-- !query 299 output +false + + +-- !query 300 +SELECT '1' <> '1' FROM t +-- !query 300 schema +struct<(NOT (1 = 1)):boolean> +-- !query 300 output +false + + +-- !query 301 +SELECT cast('1' as binary) <> '1' FROM t +-- !query 301 schema +struct<(NOT (CAST(1 AS BINARY) = CAST(1 AS BINARY))):boolean> +-- !query 301 output +false + + +-- 
!query 302 +SELECT cast(1 as boolean) <> '1' FROM t +-- !query 302 schema +struct<(NOT (CAST(1 AS BOOLEAN) = CAST(1 AS BOOLEAN))):boolean> +-- !query 302 output +false + + +-- !query 303 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> '1' FROM t +-- !query 303 schema +struct<(NOT (CAST(2017-12-11 09:30:00.0 AS TIMESTAMP) = CAST(1 AS TIMESTAMP))):boolean> +-- !query 303 output +NULL + + +-- !query 304 +SELECT cast('2017-12-11 09:30:00' as date) <> '1' FROM t +-- !query 304 schema +struct<(NOT (CAST(CAST(2017-12-11 09:30:00 AS DATE) AS STRING) = 1)):boolean> +-- !query 304 output +true + + +-- !query 305 +SELECT abs('1') FROM t +-- !query 305 schema +struct +-- !query 305 output +1.0 + + +-- !query 306 +SELECT sum('1') FROM t +-- !query 306 schema +struct +-- !query 306 output +1.0 + + +-- !query 307 +SELECT avg('1') FROM t +-- !query 307 schema +struct +-- !query 307 output +1.0 + + +-- !query 308 +SELECT stddev_pop('1') FROM t +-- !query 308 schema +struct +-- !query 308 output +0.0 + + +-- !query 309 +SELECT stddev_samp('1') FROM t +-- !query 309 schema +struct +-- !query 309 output +NaN + + +-- !query 310 +SELECT - '1' FROM t +-- !query 310 schema +struct<(- CAST(1 AS DOUBLE)):double> +-- !query 310 output +-1.0 + + +-- !query 311 +SELECT + '1' FROM t +-- !query 311 schema +struct<1:string> +-- !query 311 output +1 + + +-- !query 312 +SELECT var_pop('1') FROM t +-- !query 312 schema +struct +-- !query 312 output +0.0 + + +-- !query 313 +SELECT var_samp('1') FROM t +-- !query 313 schema +struct +-- !query 313 output +NaN + + +-- !query 314 +SELECT skewness('1') FROM t +-- !query 314 schema +struct +-- !query 314 output +NaN + + +-- !query 315 +SELECT kurtosis('1') FROM t +-- !query 315 schema +struct +-- !query 315 output +NaN From fb3636b482be3d0940345b1528c1d5090bbc25e6 Mon Sep 17 00:00:00 2001 From: foxish Date: Mon, 18 Dec 2017 11:29:32 -0800 Subject: [PATCH 136/356] [SPARK-22807][SCHEDULER] Remove config that says docker and replace with container ## What changes were proposed in this pull request? Changes discussed in https://github.com/apache/spark/pull/19946#discussion_r157063535 docker -> container, since with CRI, we are not limited to running only docker images. ## How was this patch tested? Manual testing Author: foxish Closes #19995 from foxish/make-docker-container. 
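
For reference, this is roughly how the renamed keys look from the submitting side — a minimal sketch, not part of the patch itself. Only the renamed `spark.kubernetes.driver.container.image`, `spark.kubernetes.executor.container.image`, and `spark.kubernetes.container.image.pullPolicy` keys come from this change (they replace the old `*.docker.image` entries in the `Config.scala` hunk below); the master URL, namespace, registry, and image tags are placeholders.

```
import org.apache.spark.SparkConf

// Hypothetical submission config; only the renamed keys are taken from this patch,
// everything else (master URL, namespace, image names) is a placeholder.
val conf = new SparkConf(false)
  .setMaster("k8s://https://example-apiserver:6443")
  .set("spark.kubernetes.namespace", "spark")
  .set("spark.kubernetes.driver.container.image", "registry.example.com/spark-driver:latest")
  .set("spark.kubernetes.executor.container.image", "registry.example.com/spark-executor:latest")
  .set("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")  // Always | Never | IfNotPresent
```

The pull-policy values (`Always`, `Never`, `IfNotPresent`) match the `checkValues` set kept in the `Config.scala` hunk below.
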
--- .../apache/spark/deploy/SparkSubmitSuite.scala | 4 ++-- .../org/apache/spark/deploy/k8s/Config.scala | 17 ++++++++--------- .../DriverConfigurationStepsOrchestrator.scala | 4 ++-- .../steps/BaseDriverConfigurationStep.scala | 12 ++++++------ .../cluster/k8s/ExecutorPodFactory.scala | 12 ++++++------ ...verConfigurationStepsOrchestratorSuite.scala | 6 +++--- .../BaseDriverConfigurationStepSuite.scala | 8 ++++---- .../cluster/k8s/ExecutorPodFactorySuite.scala | 2 +- 8 files changed, 32 insertions(+), 33 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index 2eb8a1fee104c..27dd435332348 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -396,7 +396,7 @@ class SparkSubmitSuite "--class", "org.SomeClass", "--driver-memory", "4g", "--conf", "spark.kubernetes.namespace=spark", - "--conf", "spark.kubernetes.driver.docker.image=bar", + "--conf", "spark.kubernetes.driver.container.image=bar", "/home/thejar.jar", "arg1") val appArgs = new SparkSubmitArguments(clArgs) @@ -412,7 +412,7 @@ class SparkSubmitSuite conf.get("spark.executor.memory") should be ("5g") conf.get("spark.driver.memory") should be ("4g") conf.get("spark.kubernetes.namespace") should be ("spark") - conf.get("spark.kubernetes.driver.docker.image") should be ("bar") + conf.get("spark.kubernetes.driver.container.image") should be ("bar") } test("handles confs with flag equivalents") { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index f35fb38798218..04aadb4b06af4 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -30,21 +30,20 @@ private[spark] object Config extends Logging { .stringConf .createWithDefault("default") - val DRIVER_DOCKER_IMAGE = - ConfigBuilder("spark.kubernetes.driver.docker.image") - .doc("Docker image to use for the driver. Specify this using the standard Docker tag format.") + val DRIVER_CONTAINER_IMAGE = + ConfigBuilder("spark.kubernetes.driver.container.image") + .doc("Container image to use for the driver.") .stringConf .createOptional - val EXECUTOR_DOCKER_IMAGE = - ConfigBuilder("spark.kubernetes.executor.docker.image") - .doc("Docker image to use for the executors. Specify this using the standard Docker tag " + - "format.") + val EXECUTOR_CONTAINER_IMAGE = + ConfigBuilder("spark.kubernetes.executor.container.image") + .doc("Container image to use for the executors.") .stringConf .createOptional - val DOCKER_IMAGE_PULL_POLICY = - ConfigBuilder("spark.kubernetes.docker.image.pullPolicy") + val CONTAINER_IMAGE_PULL_POLICY = + ConfigBuilder("spark.kubernetes.container.image.pullPolicy") .doc("Kubernetes image pull policy. 
Valid values are Always, Never, and IfNotPresent.") .stringConf .checkValues(Set("Always", "Never", "IfNotPresent")) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala index c563fc5bfbadf..1411e6f40b468 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala @@ -49,7 +49,7 @@ private[spark] class DriverConfigurationStepsOrchestrator( s"$appName-$uuid".toLowerCase.replaceAll("\\.", "-") } - private val dockerImagePullPolicy = submissionSparkConf.get(DOCKER_IMAGE_PULL_POLICY) + private val imagePullPolicy = submissionSparkConf.get(CONTAINER_IMAGE_PULL_POLICY) private val jarsDownloadPath = submissionSparkConf.get(JARS_DOWNLOAD_LOCATION) private val filesDownloadPath = submissionSparkConf.get(FILES_DOWNLOAD_LOCATION) @@ -72,7 +72,7 @@ private[spark] class DriverConfigurationStepsOrchestrator( kubernetesAppId, kubernetesResourceNamePrefix, allDriverLabels, - dockerImagePullPolicy, + imagePullPolicy, appName, mainClass, appArgs, diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala index ba2a11b9e6689..c335fcce4036e 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala @@ -34,7 +34,7 @@ private[spark] class BaseDriverConfigurationStep( kubernetesAppId: String, kubernetesResourceNamePrefix: String, driverLabels: Map[String, String], - dockerImagePullPolicy: String, + imagePullPolicy: String, appName: String, mainClass: String, appArgs: Array[String], @@ -46,9 +46,9 @@ private[spark] class BaseDriverConfigurationStep( private val driverExtraClasspath = submissionSparkConf.get( DRIVER_CLASS_PATH) - private val driverDockerImage = submissionSparkConf - .get(DRIVER_DOCKER_IMAGE) - .getOrElse(throw new SparkException("Must specify the driver Docker image")) + private val driverContainerImage = submissionSparkConf + .get(DRIVER_CONTAINER_IMAGE) + .getOrElse(throw new SparkException("Must specify the driver container image")) // CPU settings private val driverCpuCores = submissionSparkConf.getOption("spark.driver.cores").getOrElse("1") @@ -110,8 +110,8 @@ private[spark] class BaseDriverConfigurationStep( val driverContainer = new ContainerBuilder(driverSpec.driverContainer) .withName(DRIVER_CONTAINER_NAME) - .withImage(driverDockerImage) - .withImagePullPolicy(dockerImagePullPolicy) + .withImage(driverContainerImage) + .withImagePullPolicy(imagePullPolicy) .addAllToEnv(driverCustomEnvs.asJava) .addToEnv(driverExtraClasspathEnv.toSeq: _*) .addNewEnv() diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index 9d8f3b912c33d..70226157dd68b 100644 --- 
a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -72,10 +72,10 @@ private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) sparkConf, KUBERNETES_NODE_SELECTOR_PREFIX) - private val executorDockerImage = sparkConf - .get(EXECUTOR_DOCKER_IMAGE) - .getOrElse(throw new SparkException("Must specify the executor Docker image")) - private val dockerImagePullPolicy = sparkConf.get(DOCKER_IMAGE_PULL_POLICY) + private val executorContainerImage = sparkConf + .get(EXECUTOR_CONTAINER_IMAGE) + .getOrElse(throw new SparkException("Must specify the executor container image")) + private val imagePullPolicy = sparkConf.get(CONTAINER_IMAGE_PULL_POLICY) private val blockManagerPort = sparkConf .getInt("spark.blockmanager.port", DEFAULT_BLOCKMANAGER_PORT) @@ -166,8 +166,8 @@ private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) val executorContainer = new ContainerBuilder() .withName("executor") - .withImage(executorDockerImage) - .withImagePullPolicy(dockerImagePullPolicy) + .withImage(executorContainerImage) + .withImagePullPolicy(imagePullPolicy) .withNewResources() .addToRequests("memory", executorMemoryQuantity) .addToLimits("memory", executorMemoryLimitQuantity) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala index c7291d49b465e..98f9f27da5cde 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.deploy.k8s.submit import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Config.DRIVER_DOCKER_IMAGE +import org.apache.spark.deploy.k8s.Config.DRIVER_CONTAINER_IMAGE import org.apache.spark.deploy.k8s.submit.steps._ class DriverConfigurationStepsOrchestratorSuite extends SparkFunSuite { @@ -32,7 +32,7 @@ class DriverConfigurationStepsOrchestratorSuite extends SparkFunSuite { test("Base submission steps with a main app resource.") { val sparkConf = new SparkConf(false) - .set(DRIVER_DOCKER_IMAGE, DRIVER_IMAGE) + .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") val orchestrator = new DriverConfigurationStepsOrchestrator( NAMESPACE, @@ -54,7 +54,7 @@ class DriverConfigurationStepsOrchestratorSuite extends SparkFunSuite { test("Base submission steps without a main app resource.") { val sparkConf = new SparkConf(false) - .set(DRIVER_DOCKER_IMAGE, DRIVER_IMAGE) + .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) val orchestrator = new DriverConfigurationStepsOrchestrator( NAMESPACE, APP_ID, diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala index 83c5f98254829..f7c1b3142cf71 100644 --- 
a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala @@ -30,7 +30,7 @@ class BaseDriverConfigurationStepSuite extends SparkFunSuite { private val APP_ID = "spark-app-id" private val RESOURCE_NAME_PREFIX = "spark" private val DRIVER_LABELS = Map("labelkey" -> "labelvalue") - private val DOCKER_IMAGE_PULL_POLICY = "IfNotPresent" + private val CONTAINER_IMAGE_PULL_POLICY = "IfNotPresent" private val APP_NAME = "spark-test" private val MAIN_CLASS = "org.apache.spark.examples.SparkPi" private val APP_ARGS = Array("arg1", "arg2", "arg 3") @@ -47,7 +47,7 @@ class BaseDriverConfigurationStepSuite extends SparkFunSuite { .set(KUBERNETES_DRIVER_LIMIT_CORES, "4") .set(org.apache.spark.internal.config.DRIVER_MEMORY.key, "256M") .set(org.apache.spark.internal.config.DRIVER_MEMORY_OVERHEAD, 200L) - .set(DRIVER_DOCKER_IMAGE, "spark-driver:latest") + .set(DRIVER_CONTAINER_IMAGE, "spark-driver:latest") .set(s"$KUBERNETES_DRIVER_ANNOTATION_PREFIX$CUSTOM_ANNOTATION_KEY", CUSTOM_ANNOTATION_VALUE) .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY1", "customDriverEnv1") .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY2", "customDriverEnv2") @@ -56,7 +56,7 @@ class BaseDriverConfigurationStepSuite extends SparkFunSuite { APP_ID, RESOURCE_NAME_PREFIX, DRIVER_LABELS, - DOCKER_IMAGE_PULL_POLICY, + CONTAINER_IMAGE_PULL_POLICY, APP_NAME, MAIN_CLASS, APP_ARGS, @@ -71,7 +71,7 @@ class BaseDriverConfigurationStepSuite extends SparkFunSuite { assert(preparedDriverSpec.driverContainer.getName === DRIVER_CONTAINER_NAME) assert(preparedDriverSpec.driverContainer.getImage === "spark-driver:latest") - assert(preparedDriverSpec.driverContainer.getImagePullPolicy === DOCKER_IMAGE_PULL_POLICY) + assert(preparedDriverSpec.driverContainer.getImagePullPolicy === CONTAINER_IMAGE_PULL_POLICY) assert(preparedDriverSpec.driverContainer.getEnv.size === 7) val envs = preparedDriverSpec.driverContainer diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala index 1c7717c238096..3a55d7cb37b1f 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -50,7 +50,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef baseConf = new SparkConf() .set(KUBERNETES_DRIVER_POD_NAME, driverPodName) .set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, executorPrefix) - .set(EXECUTOR_DOCKER_IMAGE, executorImage) + .set(EXECUTOR_CONTAINER_IMAGE, executorImage) } test("basic executor pod has reasonable defaults") { From 772e4648d95bda3353723337723543c741ea8476 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 18 Dec 2017 14:08:48 -0600 Subject: [PATCH 137/356] [SPARK-20653][CORE] Add cleaning of old elements from the status store. This change restores the functionality that keeps a limited number of different types (jobs, stages, etc) depending on configuration, to avoid the store growing indefinitely over time. 
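
The trigger-based design described in the next paragraph boils down to the small registration pattern sketched below (the real registrations are in the `AppStatusListener` hunks further down). Everything here other than `ElementTrackingStore`, `addTrigger`, `onFlush`, and `close` is an illustrative stand-in — the element type, the threshold of 100, and the wrapper object are invented for the sketch — and since the store is Spark-internal, the sketch is written as if it lived in the `org.apache.spark.status` package.

```
package org.apache.spark.status

import org.apache.spark.SparkConf
import org.apache.spark.util.kvstore.InMemoryStore

// Hypothetical element type; the real tracked types are wrappers such as JobDataWrapper.
private case class ExampleElement(id: String)

object ElementTrackingSketch {
  def main(args: Array[String]): Unit = {
    // Wrap any KVStore; an in-memory store keeps the sketch self-contained.
    val store = new ElementTrackingStore(new InMemoryStore(), new SparkConf(false))

    // Fire once more than 100 ExampleElement rows have been written; the callback gets
    // the current count and is expected to delete the oldest entries down to the limit.
    store.addTrigger(classOf[ExampleElement], 100L) { count =>
      // delete roughly (count - 100) of the oldest ExampleElement entries here
      ()
    }

    // Runs when the store is flushed on close, e.g. to write summary or partial state.
    store.onFlush {
      // persist aggregates before the underlying KVStore shuts down
    }

    store.close()
  }
}
```

In the listener hunks below, triggers are checked only on selected writes (`entity.write(..., checkTriggers = last)`) and heavier cleanups are scheduled asynchronously (`kvstore.doAsync`), so cleanup piggybacks on normal write traffic rather than a periodic background scan.
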
The feature is implemented by creating a new type (ElementTrackingStore) that wraps a KVStore and allows triggers to be set up for when elements of a certain type meet a certain threshold. Triggers don't need to necessarily only delete elements, but the current API is set up in a way that makes that use case easier. The new store also has a trigger for the "close" call, which makes it easier for listeners to register code for cleaning things up and flushing partial state to the store. The old configurations for cleaning up the stored elements from the core and SQL UIs are now active again, and the old unit tests are re-enabled. Author: Marcelo Vanzin Closes #19751 from vanzin/SPARK-20653. --- .../deploy/history/FsHistoryProvider.scala | 17 +- .../spark/internal/config/package.scala | 5 - .../spark/status/AppStatusListener.scala | 188 ++++++++++++++++-- .../apache/spark/status/AppStatusPlugin.scala | 2 +- .../apache/spark/status/AppStatusStore.scala | 6 +- .../spark/status/ElementTrackingStore.scala | 160 +++++++++++++++ .../org/apache/spark/status/KVUtils.scala | 14 ++ .../org/apache/spark/status/LiveEntity.scala | 13 +- .../spark/status/api/v1/StagesResource.scala | 10 +- .../org/apache/spark/status/config.scala | 20 ++ .../org/apache/spark/status/storeTypes.scala | 16 ++ .../scala/org/apache/spark/ui/SparkUI.scala | 2 - .../apache/spark/ui/jobs/AllJobsPage.scala | 8 +- .../apache/spark/ui/jobs/AllStagesPage.scala | 8 +- .../deploy/history/HistoryServerSuite.scala | 2 +- .../spark/status/AppStatusListenerSuite.scala | 148 +++++++++++--- .../status/ElementTrackingStoreSuite.scala | 91 +++++++++ .../org/apache/spark/ui/UISeleniumSuite.scala | 8 +- .../spark/sql/internal/StaticSQLConf.scala | 7 + .../execution/ui/SQLAppStatusListener.scala | 33 ++- .../sql/execution/ui/SQLAppStatusStore.scala | 7 +- .../sql/execution/ui/SQLListenerSuite.scala | 29 +-- 22 files changed, 713 insertions(+), 81 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/status/ElementTrackingStore.scala create mode 100644 core/src/test/scala/org/apache/spark/status/ElementTrackingStoreSuite.scala diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index a8e1becc56ab7..fa2c5194aa41b 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -44,6 +44,7 @@ import org.apache.spark.scheduler.ReplayListenerBus._ import org.apache.spark.status._ import org.apache.spark.status.KVUtils._ import org.apache.spark.status.api.v1.{ApplicationAttemptInfo, ApplicationInfo} +import org.apache.spark.status.config._ import org.apache.spark.ui.SparkUI import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} import org.apache.spark.util.kvstore._ @@ -304,6 +305,9 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) val (kvstore, needReplay) = uiStorePath match { case Some(path) => try { + // The store path is not guaranteed to exist - maybe it hasn't been created, or was + // invalidated because changes to the event log were detected. Need to replay in that + // case. 
val _replay = !path.isDirectory() (createDiskStore(path, conf), _replay) } catch { @@ -318,24 +322,23 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) (new InMemoryStore(), true) } + val trackingStore = new ElementTrackingStore(kvstore, conf) if (needReplay) { val replayBus = new ReplayListenerBus() - val listener = new AppStatusListener(kvstore, conf, false, + val listener = new AppStatusListener(trackingStore, conf, false, lastUpdateTime = Some(attempt.info.lastUpdated.getTime())) replayBus.addListener(listener) AppStatusPlugin.loadPlugins().foreach { plugin => - plugin.setupListeners(conf, kvstore, l => replayBus.addListener(l), false) + plugin.setupListeners(conf, trackingStore, l => replayBus.addListener(l), false) } try { val fileStatus = fs.getFileStatus(new Path(logDir, attempt.logPath)) replay(fileStatus, isApplicationCompleted(fileStatus), replayBus) - listener.flush() + trackingStore.close(false) } catch { case e: Exception => - try { - kvstore.close() - } catch { - case _e: Exception => logInfo("Error closing store.", _e) + Utils.tryLogNonFatalError { + trackingStore.close() } uiStorePath.foreach(Utils.deleteRecursively) if (e.isInstanceOf[FileNotFoundException]) { diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 172ba85359da7..eb12ddf961314 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -240,11 +240,6 @@ package object config { .stringConf .createOptional - // To limit memory usage, we only track information for a fixed number of tasks - private[spark] val UI_RETAINED_TASKS = ConfigBuilder("spark.ui.retainedTasks") - .intConf - .createWithDefault(100000) - // To limit how many applications are shown in the History Server summary ui private[spark] val HISTORY_UI_MAX_APPS = ConfigBuilder("spark.history.ui.maxApplications").intConf.createWithDefault(Integer.MAX_VALUE) diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 6da44cbc44c4d..1fb7b76d43d04 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -32,7 +32,6 @@ import org.apache.spark.status.api.v1 import org.apache.spark.storage._ import org.apache.spark.ui.SparkUI import org.apache.spark.ui.scope._ -import org.apache.spark.util.kvstore.KVStore /** * A Spark listener that writes application information to a data store. The types written to the @@ -42,7 +41,7 @@ import org.apache.spark.util.kvstore.KVStore * unfinished tasks can be more accurately calculated (see SPARK-21922). */ private[spark] class AppStatusListener( - kvstore: KVStore, + kvstore: ElementTrackingStore, conf: SparkConf, live: Boolean, lastUpdateTime: Option[Long] = None) extends SparkListener with Logging { @@ -51,6 +50,7 @@ private[spark] class AppStatusListener( private var sparkVersion = SPARK_VERSION private var appInfo: v1.ApplicationInfo = null + private var appSummary = new AppSummary(0, 0) private var coresPerTask: Int = 1 // How often to update live entities. -1 means "never update" when replaying applications, @@ -58,6 +58,7 @@ private[spark] class AppStatusListener( // operations that we can live without when rapidly processing incoming task events. 
private val liveUpdatePeriodNs = if (live) conf.get(LIVE_ENTITY_UPDATE_PERIOD) else -1L + private val maxTasksPerStage = conf.get(MAX_RETAINED_TASKS_PER_STAGE) private val maxGraphRootNodes = conf.get(MAX_RETAINED_ROOT_NODES) // Keep track of live entities, so that task metrics can be efficiently updated (without @@ -68,10 +69,25 @@ private[spark] class AppStatusListener( private val liveTasks = new HashMap[Long, LiveTask]() private val liveRDDs = new HashMap[Int, LiveRDD]() private val pools = new HashMap[String, SchedulerPool]() + // Keep the active executor count as a separate variable to avoid having to do synchronization + // around liveExecutors. + @volatile private var activeExecutorCount = 0 - override def onOtherEvent(event: SparkListenerEvent): Unit = event match { - case SparkListenerLogStart(version) => sparkVersion = version - case _ => + kvstore.addTrigger(classOf[ExecutorSummaryWrapper], conf.get(MAX_RETAINED_DEAD_EXECUTORS)) + { count => cleanupExecutors(count) } + + kvstore.addTrigger(classOf[JobDataWrapper], conf.get(MAX_RETAINED_JOBS)) { count => + cleanupJobs(count) + } + + kvstore.addTrigger(classOf[StageDataWrapper], conf.get(MAX_RETAINED_STAGES)) { count => + cleanupStages(count) + } + + kvstore.onFlush { + if (!live) { + flush() + } } override def onApplicationStart(event: SparkListenerApplicationStart): Unit = { @@ -97,6 +113,7 @@ private[spark] class AppStatusListener( Seq(attempt)) kvstore.write(new ApplicationInfoWrapper(appInfo)) + kvstore.write(appSummary) } override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = { @@ -158,10 +175,11 @@ private[spark] class AppStatusListener( override def onExecutorRemoved(event: SparkListenerExecutorRemoved): Unit = { liveExecutors.remove(event.executorId).foreach { exec => val now = System.nanoTime() + activeExecutorCount = math.max(0, activeExecutorCount - 1) exec.isActive = false exec.removeTime = new Date(event.time) exec.removeReason = event.reason - update(exec, now) + update(exec, now, last = true) // Remove all RDD distributions that reference the removed executor, in case there wasn't // a corresponding event. 
@@ -290,8 +308,11 @@ private[spark] class AppStatusListener( } job.completionTime = if (event.time > 0) Some(new Date(event.time)) else None - update(job, now) + update(job, now, last = true) } + + appSummary = new AppSummary(appSummary.numCompletedJobs + 1, appSummary.numCompletedStages) + kvstore.write(appSummary) } override def onStageSubmitted(event: SparkListenerStageSubmitted): Unit = { @@ -350,6 +371,13 @@ private[spark] class AppStatusListener( job.activeTasks += 1 maybeUpdate(job, now) } + + if (stage.savedTasks.incrementAndGet() > maxTasksPerStage && !stage.cleaning) { + stage.cleaning = true + kvstore.doAsync { + cleanupTasks(stage) + } + } } liveExecutors.get(event.taskInfo.executorId).foreach { exec => @@ -449,6 +477,13 @@ private[spark] class AppStatusListener( esummary.metrics.update(metricsDelta) } maybeUpdate(esummary, now) + + if (!stage.cleaning && stage.savedTasks.get() > maxTasksPerStage) { + stage.cleaning = true + kvstore.doAsync { + cleanupTasks(stage) + } + } } liveExecutors.get(event.taskInfo.executorId).foreach { exec => @@ -516,8 +551,11 @@ private[spark] class AppStatusListener( } stage.executorSummaries.values.foreach(update(_, now)) - update(stage, now) + update(stage, now, last = true) } + + appSummary = new AppSummary(appSummary.numCompletedJobs, appSummary.numCompletedStages + 1) + kvstore.write(appSummary) } override def onBlockManagerAdded(event: SparkListenerBlockManagerAdded): Unit = { @@ -573,7 +611,7 @@ private[spark] class AppStatusListener( } /** Flush all live entities' data to the underlying store. */ - def flush(): Unit = { + private def flush(): Unit = { val now = System.nanoTime() liveStages.values.asScala.foreach { stage => update(stage, now) @@ -708,7 +746,10 @@ private[spark] class AppStatusListener( } private def getOrCreateExecutor(executorId: String, addTime: Long): LiveExecutor = { - liveExecutors.getOrElseUpdate(executorId, new LiveExecutor(executorId, addTime)) + liveExecutors.getOrElseUpdate(executorId, { + activeExecutorCount += 1 + new LiveExecutor(executorId, addTime) + }) } private def updateStreamBlock(event: SparkListenerBlockUpdated, stream: StreamBlockId): Unit = { @@ -754,8 +795,8 @@ private[spark] class AppStatusListener( } } - private def update(entity: LiveEntity, now: Long): Unit = { - entity.write(kvstore, now) + private def update(entity: LiveEntity, now: Long, last: Boolean = false): Unit = { + entity.write(kvstore, now, checkTriggers = last) } /** Update a live entity only if it hasn't been updated in the last configured period. */ @@ -772,4 +813,127 @@ private[spark] class AppStatusListener( } } + private def cleanupExecutors(count: Long): Unit = { + // Because the limit is on the number of *dead* executors, we need to calculate whether + // there are actually enough dead executors to be deleted. 
+ val threshold = conf.get(MAX_RETAINED_DEAD_EXECUTORS) + val dead = count - activeExecutorCount + + if (dead > threshold) { + val countToDelete = calculateNumberToRemove(dead, threshold) + val toDelete = kvstore.view(classOf[ExecutorSummaryWrapper]).index("active") + .max(countToDelete).first(false).last(false).asScala.toSeq + toDelete.foreach { e => kvstore.delete(e.getClass(), e.info.id) } + } + } + + private def cleanupJobs(count: Long): Unit = { + val countToDelete = calculateNumberToRemove(count, conf.get(MAX_RETAINED_JOBS)) + if (countToDelete <= 0L) { + return + } + + val toDelete = KVUtils.viewToSeq(kvstore.view(classOf[JobDataWrapper]), + countToDelete.toInt) { j => + j.info.status != JobExecutionStatus.RUNNING && j.info.status != JobExecutionStatus.UNKNOWN + } + toDelete.foreach { j => kvstore.delete(j.getClass(), j.info.jobId) } + } + + private def cleanupStages(count: Long): Unit = { + val countToDelete = calculateNumberToRemove(count, conf.get(MAX_RETAINED_STAGES)) + if (countToDelete <= 0L) { + return + } + + val stages = KVUtils.viewToSeq(kvstore.view(classOf[StageDataWrapper]), + countToDelete.toInt) { s => + s.info.status != v1.StageStatus.ACTIVE && s.info.status != v1.StageStatus.PENDING + } + + stages.foreach { s => + val key = s.id + kvstore.delete(s.getClass(), key) + + val execSummaries = kvstore.view(classOf[ExecutorStageSummaryWrapper]) + .index("stage") + .first(key) + .last(key) + .asScala + .toSeq + execSummaries.foreach { e => + kvstore.delete(e.getClass(), e.id) + } + + val tasks = kvstore.view(classOf[TaskDataWrapper]) + .index("stage") + .first(key) + .last(key) + .asScala + + tasks.foreach { t => + kvstore.delete(t.getClass(), t.info.taskId) + } + + // Check whether there are remaining attempts for the same stage. If there aren't, then + // also delete the RDD graph data. + val remainingAttempts = kvstore.view(classOf[StageDataWrapper]) + .index("stageId") + .first(s.stageId) + .last(s.stageId) + .closeableIterator() + + val hasMoreAttempts = try { + remainingAttempts.asScala.exists { other => + other.info.attemptId != s.info.attemptId + } + } finally { + remainingAttempts.close() + } + + if (!hasMoreAttempts) { + kvstore.delete(classOf[RDDOperationGraphWrapper], s.stageId) + } + } + } + + private def cleanupTasks(stage: LiveStage): Unit = { + val countToDelete = calculateNumberToRemove(stage.savedTasks.get(), maxTasksPerStage).toInt + if (countToDelete > 0) { + val stageKey = Array(stage.info.stageId, stage.info.attemptId) + val view = kvstore.view(classOf[TaskDataWrapper]).index("stage").first(stageKey) + .last(stageKey) + + // Try to delete finished tasks only. + val toDelete = KVUtils.viewToSeq(view, countToDelete) { t => + !live || t.info.status != TaskState.RUNNING.toString() + } + toDelete.foreach { t => kvstore.delete(t.getClass(), t.info.taskId) } + stage.savedTasks.addAndGet(-toDelete.size) + + // If there are more running tasks than the configured limit, delete running tasks. This + // should be extremely rare since the limit should generally far exceed the number of tasks + // that can run in parallel. + val remaining = countToDelete - toDelete.size + if (remaining > 0) { + val runningTasksToDelete = view.max(remaining).iterator().asScala.toList + runningTasksToDelete.foreach { t => kvstore.delete(t.getClass(), t.info.taskId) } + stage.savedTasks.addAndGet(-remaining) + } + } + stage.cleaning = false + } + + /** + * Remove at least (retainedSize / 10) items to reduce friction. 
Because tracking may be done + * asynchronously, this method may return 0 in case enough items have been deleted already. + */ + private def calculateNumberToRemove(dataSize: Long, retainedSize: Long): Long = { + if (dataSize > retainedSize) { + math.max(retainedSize / 10L, dataSize - retainedSize) + } else { + 0L + } + } + } diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusPlugin.scala b/core/src/main/scala/org/apache/spark/status/AppStatusPlugin.scala index 69ca02ec76293..4cada5c7b0de4 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusPlugin.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusPlugin.scala @@ -48,7 +48,7 @@ private[spark] trait AppStatusPlugin { */ def setupListeners( conf: SparkConf, - store: KVStore, + store: ElementTrackingStore, addListenerFn: SparkListener => Unit, live: Boolean): Unit diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 22d768b3cb990..9987419b170f6 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -330,6 +330,10 @@ private[spark] class AppStatusStore( store.read(classOf[PoolData], name) } + def appSummary(): AppSummary = { + store.read(classOf[AppSummary], classOf[AppSummary].getName()) + } + def close(): Unit = { store.close() } @@ -347,7 +351,7 @@ private[spark] object AppStatusStore { * @param addListenerFn Function to register a listener with a bus. */ def createLiveStore(conf: SparkConf, addListenerFn: SparkListener => Unit): AppStatusStore = { - val store = new InMemoryStore() + val store = new ElementTrackingStore(new InMemoryStore(), conf) val listener = new AppStatusListener(store, conf, true) addListenerFn(listener) AppStatusPlugin.loadPlugins().foreach { p => diff --git a/core/src/main/scala/org/apache/spark/status/ElementTrackingStore.scala b/core/src/main/scala/org/apache/spark/status/ElementTrackingStore.scala new file mode 100644 index 0000000000000..863b0967f765e --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/ElementTrackingStore.scala @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import java.util.concurrent.TimeUnit + +import scala.collection.mutable.{HashMap, ListBuffer} + +import com.google.common.util.concurrent.MoreExecutors + +import org.apache.spark.SparkConf +import org.apache.spark.util.{ThreadUtils, Utils} +import org.apache.spark.util.kvstore._ + +/** + * A KVStore wrapper that allows tracking the number of elements of specific types, and triggering + * actions once they reach a threshold. 
This allows writers, for example, to control how much data + * is stored by potentially deleting old data as new data is added. + * + * This store is used when populating data either from a live UI or an event log. On top of firing + * triggers when elements reach a certain threshold, it provides two extra bits of functionality: + * + * - a generic worker thread that can be used to run expensive tasks asynchronously; the tasks can + * be configured to run on the calling thread when more determinism is desired (e.g. unit tests). + * - a generic flush mechanism so that listeners can be notified about when they should flush + * internal state to the store (e.g. after the SHS finishes parsing an event log). + * + * The configured triggers are run on a separate thread by default; they can be forced to run on + * the calling thread by setting the `ASYNC_TRACKING_ENABLED` configuration to `false`. + */ +private[spark] class ElementTrackingStore(store: KVStore, conf: SparkConf) extends KVStore { + + import config._ + + private val triggers = new HashMap[Class[_], Seq[Trigger[_]]]() + private val flushTriggers = new ListBuffer[() => Unit]() + private val executor = if (conf.get(ASYNC_TRACKING_ENABLED)) { + ThreadUtils.newDaemonSingleThreadExecutor("element-tracking-store-worker") + } else { + MoreExecutors.sameThreadExecutor() + } + + @volatile private var stopped = false + + /** + * Register a trigger that will be fired once the number of elements of a given type reaches + * the given threshold. + * + * @param klass The type to monitor. + * @param threshold The number of elements that should trigger the action. + * @param action Action to run when the threshold is reached; takes as a parameter the number + * of elements of the registered type currently known to be in the store. + */ + def addTrigger(klass: Class[_], threshold: Long)(action: Long => Unit): Unit = { + val existing = triggers.getOrElse(klass, Seq()) + triggers(klass) = existing :+ Trigger(threshold, action) + } + + /** + * Adds a trigger to be executed before the store is flushed. This normally happens before + * closing, and is useful for flushing intermediate state to the store, e.g. when replaying + * in-progress applications through the SHS. + * + * Flush triggers are called synchronously in the same thread that is closing the store. + */ + def onFlush(action: => Unit): Unit = { + flushTriggers += { () => action } + } + + /** + * Enqueues an action to be executed asynchronously. The task will run on the calling thread if + * `ASYNC_TRACKING_ENABLED` is `false`. + */ + def doAsync(fn: => Unit): Unit = { + executor.submit(new Runnable() { + override def run(): Unit = Utils.tryLog { fn } + }) + } + + override def read[T](klass: Class[T], naturalKey: Any): T = store.read(klass, naturalKey) + + override def write(value: Any): Unit = store.write(value) + + /** Write an element to the store, optionally checking for whether to fire triggers. 
*/ + def write(value: Any, checkTriggers: Boolean): Unit = { + write(value) + + if (checkTriggers && !stopped) { + triggers.get(value.getClass()).foreach { list => + doAsync { + val count = store.count(value.getClass()) + list.foreach { t => + if (count > t.threshold) { + t.action(count) + } + } + } + } + } + } + + override def delete(klass: Class[_], naturalKey: Any): Unit = store.delete(klass, naturalKey) + + override def getMetadata[T](klass: Class[T]): T = store.getMetadata(klass) + + override def setMetadata(value: Any): Unit = store.setMetadata(value) + + override def view[T](klass: Class[T]): KVStoreView[T] = store.view(klass) + + override def count(klass: Class[_]): Long = store.count(klass) + + override def count(klass: Class[_], index: String, indexedValue: Any): Long = { + store.count(klass, index, indexedValue) + } + + override def close(): Unit = { + close(true) + } + + /** A close() method that optionally leaves the parent store open. */ + def close(closeParent: Boolean): Unit = synchronized { + if (stopped) { + return + } + + stopped = true + executor.shutdown() + if (!executor.awaitTermination(5, TimeUnit.SECONDS)) { + executor.shutdownNow() + } + + flushTriggers.foreach { trigger => + Utils.tryLog(trigger()) + } + + if (closeParent) { + store.close() + } + } + + private case class Trigger[T]( + threshold: Long, + action: Long => Unit) + +} diff --git a/core/src/main/scala/org/apache/spark/status/KVUtils.scala b/core/src/main/scala/org/apache/spark/status/KVUtils.scala index 4638511944c61..99b1843d8e1c0 100644 --- a/core/src/main/scala/org/apache/spark/status/KVUtils.scala +++ b/core/src/main/scala/org/apache/spark/status/KVUtils.scala @@ -20,6 +20,7 @@ package org.apache.spark.status import java.io.File import scala.annotation.meta.getter +import scala.collection.JavaConverters._ import scala.language.implicitConversions import scala.reflect.{classTag, ClassTag} @@ -68,6 +69,19 @@ private[spark] object KVUtils extends Logging { db } + /** Turns a KVStoreView into a Scala sequence, applying a filter. */ + def viewToSeq[T]( + view: KVStoreView[T], + max: Int) + (filter: T => Boolean): Seq[T] = { + val iter = view.closeableIterator() + try { + iter.asScala.filter(filter).take(max).toList + } finally { + iter.close() + } + } + private[spark] class MetadataMismatchException extends Exception } diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala index 983c58a607aa8..52e83f250d34e 100644 --- a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -18,6 +18,7 @@ package org.apache.spark.status import java.util.Date +import java.util.concurrent.atomic.AtomicInteger import scala.collection.mutable.HashMap @@ -38,10 +39,12 @@ import org.apache.spark.util.kvstore.KVStore */ private[spark] abstract class LiveEntity { - var lastWriteTime = 0L + var lastWriteTime = -1L - def write(store: KVStore, now: Long): Unit = { - store.write(doUpdate()) + def write(store: ElementTrackingStore, now: Long, checkTriggers: Boolean = false): Unit = { + // Always check triggers on the first write, since adding an element to the store may + // cause the maximum count for the element type to be exceeded. 
+ store.write(doUpdate(), checkTriggers || lastWriteTime == -1L) lastWriteTime = now } @@ -403,6 +406,10 @@ private class LiveStage extends LiveEntity { val executorSummaries = new HashMap[String, LiveExecutorStageSummary]() + // Used for cleanup of tasks after they reach the configured limit. Not written to the store. + @volatile var cleaning = false + var savedTasks = new AtomicInteger(0) + def executorSummary(executorId: String): LiveExecutorStageSummary = { executorSummaries.getOrElseUpdate(executorId, new LiveExecutorStageSummary(info.stageId, info.attemptId, executorId)) diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala index b3561109bc636..3b879545b3d2e 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala @@ -59,7 +59,15 @@ private[v1] class StagesResource extends BaseAppResource { ui.store.stageAttempt(stageId, stageAttemptId, details = details) } catch { case _: NoSuchElementException => - throw new NotFoundException(s"unknown attempt $stageAttemptId for stage $stageId.") + // Change the message depending on whether there are any attempts for the requested stage. + val all = ui.store.stageData(stageId) + val msg = if (all.nonEmpty) { + val ids = all.map(_.attemptId) + s"unknown attempt for stage $stageId. Found attempts: [${ids.mkString(",")}]" + } else { + s"unknown stage: $stageId" + } + throw new NotFoundException(msg) } } diff --git a/core/src/main/scala/org/apache/spark/status/config.scala b/core/src/main/scala/org/apache/spark/status/config.scala index 7af9dff977a86..67801b8f046f4 100644 --- a/core/src/main/scala/org/apache/spark/status/config.scala +++ b/core/src/main/scala/org/apache/spark/status/config.scala @@ -23,10 +23,30 @@ import org.apache.spark.internal.config._ private[spark] object config { + val ASYNC_TRACKING_ENABLED = ConfigBuilder("spark.appStateStore.asyncTracking.enable") + .booleanConf + .createWithDefault(true) + val LIVE_ENTITY_UPDATE_PERIOD = ConfigBuilder("spark.ui.liveUpdate.period") .timeConf(TimeUnit.NANOSECONDS) .createWithDefaultString("100ms") + val MAX_RETAINED_JOBS = ConfigBuilder("spark.ui.retainedJobs") + .intConf + .createWithDefault(1000) + + val MAX_RETAINED_STAGES = ConfigBuilder("spark.ui.retainedStages") + .intConf + .createWithDefault(1000) + + val MAX_RETAINED_TASKS_PER_STAGE = ConfigBuilder("spark.ui.retainedTasks") + .intConf + .createWithDefault(100000) + + val MAX_RETAINED_DEAD_EXECUTORS = ConfigBuilder("spark.ui.retainedDeadExecutors") + .intConf + .createWithDefault(100) + val MAX_RETAINED_ROOT_NODES = ConfigBuilder("spark.ui.dagGraph.retainedRootRDDs") .intConf .createWithDefault(Int.MaxValue) diff --git a/core/src/main/scala/org/apache/spark/status/storeTypes.scala b/core/src/main/scala/org/apache/spark/status/storeTypes.scala index c1ea87542d6cc..d9ead0071d3bf 100644 --- a/core/src/main/scala/org/apache/spark/status/storeTypes.scala +++ b/core/src/main/scala/org/apache/spark/status/storeTypes.scala @@ -112,6 +112,9 @@ private[spark] class TaskDataWrapper( Array(stageId: JInteger, stageAttemptId: JInteger, info.launchTime.getTime(): JLong) } + @JsonIgnore @KVIndex("active") + def active: Boolean = info.duration.isEmpty + } private[spark] class RDDStorageInfoWrapper(val info: RDDStorageInfo) { @@ -187,3 +190,16 @@ private[spark] class RDDOperationGraphWrapper( private[spark] class PoolData( @KVIndexParam val 
name: String, val stageIds: Set[Int]) + +/** + * A class with information about an app, to be used by the UI. There's only one instance of + * this summary per application, so its ID in the store is the class name. + */ +private[spark] class AppSummary( + val numCompletedJobs: Int, + val numCompletedStages: Int) { + + @KVIndex + def id: String = classOf[AppSummary].getName() + +} diff --git a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala index 35da3c3bfd1a2..b44ac0ea1febc 100644 --- a/core/src/main/scala/org/apache/spark/ui/SparkUI.scala +++ b/core/src/main/scala/org/apache/spark/ui/SparkUI.scala @@ -154,8 +154,6 @@ private[spark] object SparkUI { val DEFAULT_PORT = 4040 val STATIC_RESOURCE_DIR = "org/apache/spark/ui/static" val DEFAULT_POOL_NAME = "default" - val DEFAULT_RETAINED_STAGES = 1000 - val DEFAULT_RETAINED_JOBS = 1000 def getUIPort(conf: SparkConf): Int = { conf.getInt("spark.ui.port", SparkUI.DEFAULT_PORT) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index b60d39b21b4bf..37e3b3b304a63 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -300,7 +300,13 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We val shouldShowCompletedJobs = completedJobs.nonEmpty val shouldShowFailedJobs = failedJobs.nonEmpty - val completedJobNumStr = s"${completedJobs.size}" + val appSummary = store.appSummary() + val completedJobNumStr = if (completedJobs.size == appSummary.numCompletedJobs) { + s"${completedJobs.size}" + } else { + s"${appSummary.numCompletedJobs}, only showing ${completedJobs.size}" + } + val schedulingMode = store.environmentInfo().sparkProperties.toMap .get("spark.scheduler.mode") .map { mode => SchedulingMode.withName(mode).toString } diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala index e4cf99e7b9e04..b1e343451e28e 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala @@ -39,7 +39,6 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") { val completedStages = allStages.filter(_.status == StageStatus.COMPLETE) val failedStages = allStages.filter(_.status == StageStatus.FAILED).reverse - val numCompletedStages = completedStages.size val numFailedStages = failedStages.size val subPath = "stages" @@ -69,10 +68,11 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") { val shouldShowCompletedStages = completedStages.nonEmpty val shouldShowFailedStages = failedStages.nonEmpty - val completedStageNumStr = if (numCompletedStages == completedStages.size) { - s"$numCompletedStages" + val appSummary = parent.store.appSummary() + val completedStageNumStr = if (appSummary.numCompletedStages == completedStages.size) { + s"${appSummary.numCompletedStages}" } else { - s"$numCompletedStages, only showing ${completedStages.size}" + s"${appSummary.numCompletedStages}, only showing ${completedStages.size}" } val summary: NodeSeq = diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 3a9790cd57270..3738f85da5831 100644 --- 
a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -264,7 +264,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers val badStageAttemptId = getContentAndCode("applications/local-1422981780767/stages/1/1") badStageAttemptId._1 should be (HttpServletResponse.SC_NOT_FOUND) - badStageAttemptId._3 should be (Some("unknown attempt 1 for stage 1.")) + badStageAttemptId._3 should be (Some("unknown attempt for stage 1. Found attempts: [0]")) val badStageId2 = getContentAndCode("applications/local-1422981780767/stages/flimflam") badStageId2._1 should be (HttpServletResponse.SC_NOT_FOUND) diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index 88fe6bd70a14e..9cf4f7efb24a8 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -39,16 +39,20 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { import config._ - private val conf = new SparkConf().set(LIVE_ENTITY_UPDATE_PERIOD, 0L) + private val conf = new SparkConf() + .set(LIVE_ENTITY_UPDATE_PERIOD, 0L) + .set(ASYNC_TRACKING_ENABLED, false) private var time: Long = _ private var testDir: File = _ - private var store: KVStore = _ + private var store: ElementTrackingStore = _ + private var taskIdTracker = -1L before { time = 0L testDir = Utils.createTempDir() - store = KVUtils.open(testDir, getClass().getName()) + store = new ElementTrackingStore(KVUtils.open(testDir, getClass().getName()), conf) + taskIdTracker = -1L } after { @@ -185,22 +189,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Start tasks from stage 1 time += 1 - var _taskIdTracker = -1L - def nextTaskId(): Long = { - _taskIdTracker += 1 - _taskIdTracker - } - - def createTasks(count: Int, time: Long): Seq[TaskInfo] = { - (1 to count).map { id => - val exec = execIds(id.toInt % execIds.length) - val taskId = nextTaskId() - new TaskInfo(taskId, taskId.toInt, 1, time, exec, s"$exec.example.com", - TaskLocality.PROCESS_LOCAL, id % 2 == 0) - } - } - val s1Tasks = createTasks(4, time) + val s1Tasks = createTasks(4, execIds) s1Tasks.foreach { task => listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, task)) } @@ -419,7 +409,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Start and fail all tasks of stage 2. 
time += 1 - val s2Tasks = createTasks(4, time) + val s2Tasks = createTasks(4, execIds) s2Tasks.foreach { task => listener.onTaskStart(SparkListenerTaskStart(stages.last.stageId, stages.last.attemptId, task)) } @@ -470,7 +460,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { listener.onStageSubmitted(SparkListenerStageSubmitted(newS2, jobProps)) assert(store.count(classOf[StageDataWrapper]) === 3) - val newS2Tasks = createTasks(4, time) + val newS2Tasks = createTasks(4, execIds) newS2Tasks.foreach { task => listener.onTaskStart(SparkListenerTaskStart(newS2.stageId, newS2.attemptId, task)) @@ -526,7 +516,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { assert(store.count(classOf[StageDataWrapper]) === 5) time += 1 - val j2s2Tasks = createTasks(4, time) + val j2s2Tasks = createTasks(4, execIds) j2s2Tasks.foreach { task => listener.onTaskStart(SparkListenerTaskStart(j2Stages.last.stageId, j2Stages.last.attemptId, @@ -587,8 +577,9 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } // Stop executors. - listener.onExecutorRemoved(SparkListenerExecutorRemoved(41L, "1", "Test")) - listener.onExecutorRemoved(SparkListenerExecutorRemoved(41L, "2", "Test")) + time += 1 + listener.onExecutorRemoved(SparkListenerExecutorRemoved(time, "1", "Test")) + listener.onExecutorRemoved(SparkListenerExecutorRemoved(time, "2", "Test")) Seq("1", "2").foreach { id => check[ExecutorSummaryWrapper](id) { exec => @@ -851,6 +842,103 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } + test("eviction of old data") { + val testConf = conf.clone() + .set(MAX_RETAINED_JOBS, 2) + .set(MAX_RETAINED_STAGES, 2) + .set(MAX_RETAINED_TASKS_PER_STAGE, 2) + .set(MAX_RETAINED_DEAD_EXECUTORS, 1) + val listener = new AppStatusListener(store, testConf, true) + + // Start 3 jobs, all should be kept. Stop one, it should be evicted. + time += 1 + listener.onJobStart(SparkListenerJobStart(1, time, Nil, null)) + listener.onJobStart(SparkListenerJobStart(2, time, Nil, null)) + listener.onJobStart(SparkListenerJobStart(3, time, Nil, null)) + assert(store.count(classOf[JobDataWrapper]) === 3) + + time += 1 + listener.onJobEnd(SparkListenerJobEnd(2, time, JobSucceeded)) + assert(store.count(classOf[JobDataWrapper]) === 2) + intercept[NoSuchElementException] { + store.read(classOf[JobDataWrapper], 2) + } + + // Start 3 stages, all should be kept. Stop 2 of them, the stopped one with the lowest id should + // be deleted. Start a new attempt of the second stopped one, and verify that the stage graph + // data is not deleted. + time += 1 + val stages = Seq( + new StageInfo(1, 0, "stage1", 4, Nil, Nil, "details1"), + new StageInfo(2, 0, "stage2", 4, Nil, Nil, "details2"), + new StageInfo(3, 0, "stage3", 4, Nil, Nil, "details3")) + + // Graph data is generated by the job start event, so fire it. 
+ listener.onJobStart(SparkListenerJobStart(4, time, stages, null)) + + stages.foreach { s => + time += 1 + s.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(s, new Properties())) + } + + assert(store.count(classOf[StageDataWrapper]) === 3) + assert(store.count(classOf[RDDOperationGraphWrapper]) === 3) + + stages.drop(1).foreach { s => + time += 1 + s.completionTime = Some(time) + listener.onStageCompleted(SparkListenerStageCompleted(s)) + } + + assert(store.count(classOf[StageDataWrapper]) === 2) + assert(store.count(classOf[RDDOperationGraphWrapper]) === 2) + intercept[NoSuchElementException] { + store.read(classOf[StageDataWrapper], Array(2, 0)) + } + + val attempt2 = new StageInfo(3, 1, "stage3", 4, Nil, Nil, "details3") + time += 1 + attempt2.submissionTime = Some(time) + listener.onStageSubmitted(SparkListenerStageSubmitted(attempt2, new Properties())) + + assert(store.count(classOf[StageDataWrapper]) === 2) + assert(store.count(classOf[RDDOperationGraphWrapper]) === 2) + intercept[NoSuchElementException] { + store.read(classOf[StageDataWrapper], Array(2, 0)) + } + intercept[NoSuchElementException] { + store.read(classOf[StageDataWrapper], Array(3, 0)) + } + store.read(classOf[StageDataWrapper], Array(3, 1)) + + // Start 2 tasks. Finish the second one. + time += 1 + val tasks = createTasks(2, Array("1")) + tasks.foreach { task => + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + } + assert(store.count(classOf[TaskDataWrapper]) === 2) + + // Start a 3rd task. The finished tasks should be deleted. + createTasks(1, Array("1")).foreach { task => + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + } + assert(store.count(classOf[TaskDataWrapper]) === 2) + intercept[NoSuchElementException] { + store.read(classOf[TaskDataWrapper], tasks.last.id) + } + + // Start a 4th task. The first task should be deleted, even if it's still running. + createTasks(1, Array("1")).foreach { task => + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + } + assert(store.count(classOf[TaskDataWrapper]) === 2) + intercept[NoSuchElementException] { + store.read(classOf[TaskDataWrapper], tasks.head.id) + } + } + private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptId) private def check[T: ClassTag](key: Any)(fn: T => Unit): Unit = { @@ -864,6 +952,20 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { s"${orig.executorId}.example.com", TaskLocality.PROCESS_LOCAL, orig.speculative) } + private def createTasks(count: Int, execs: Array[String]): Seq[TaskInfo] = { + (1 to count).map { id => + val exec = execs(id.toInt % execs.length) + val taskId = nextTaskId() + new TaskInfo(taskId, taskId.toInt, 1, time, exec, s"$exec.example.com", + TaskLocality.PROCESS_LOCAL, id % 2 == 0) + } + } + + private def nextTaskId(): Long = { + taskIdTracker += 1 + taskIdTracker + } + private case class RddBlock( rddId: Int, partId: Int, diff --git a/core/src/test/scala/org/apache/spark/status/ElementTrackingStoreSuite.scala b/core/src/test/scala/org/apache/spark/status/ElementTrackingStoreSuite.scala new file mode 100644 index 0000000000000..07a7b58404c29 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/status/ElementTrackingStoreSuite.scala @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import org.mockito.Mockito._ + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.util.kvstore._ + +class ElementTrackingStoreSuite extends SparkFunSuite { + + import config._ + + test("tracking for multiple types") { + val store = mock(classOf[KVStore]) + val tracking = new ElementTrackingStore(store, new SparkConf() + .set(ASYNC_TRACKING_ENABLED, false)) + + var type1 = 0L + var type2 = 0L + var flushed = false + + tracking.addTrigger(classOf[Type1], 100) { count => + type1 = count + } + tracking.addTrigger(classOf[Type2], 1000) { count => + type2 = count + } + tracking.onFlush { + flushed = true + } + + when(store.count(classOf[Type1])).thenReturn(1L) + tracking.write(new Type1, true) + assert(type1 === 0L) + assert(type2 === 0L) + + when(store.count(classOf[Type1])).thenReturn(100L) + tracking.write(new Type1, true) + assert(type1 === 0L) + assert(type2 === 0L) + + when(store.count(classOf[Type1])).thenReturn(101L) + tracking.write(new Type1, true) + assert(type1 === 101L) + assert(type2 === 0L) + + when(store.count(classOf[Type1])).thenReturn(200L) + tracking.write(new Type1, true) + assert(type1 === 200L) + assert(type2 === 0L) + + when(store.count(classOf[Type2])).thenReturn(500L) + tracking.write(new Type2, true) + assert(type1 === 200L) + assert(type2 === 0L) + + when(store.count(classOf[Type2])).thenReturn(1000L) + tracking.write(new Type2, true) + assert(type1 === 200L) + assert(type2 === 0L) + + when(store.count(classOf[Type2])).thenReturn(2000L) + tracking.write(new Type2, true) + assert(type1 === 200L) + assert(type2 === 2000L) + + tracking.close(false) + assert(flushed) + verify(store, never()).close() + } + + private class Type1 + private class Type2 + +} diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index df5f0b5335e82..326546787ab6c 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -42,6 +42,7 @@ import org.apache.spark.deploy.history.HistoryServerSuite import org.apache.spark.internal.config.MEMORY_OFFHEAP_SIZE import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.status.api.v1.{JacksonMessageWriter, RDDDataDistribution, StageStatus} +import org.apache.spark.status.config._ private[spark] class SparkUICssErrorHandler extends DefaultCssErrorHandler { @@ -525,14 +526,15 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B } } - ignore("stage & job retention") { + test("stage & job retention") { val conf = new SparkConf() .setMaster("local") .setAppName("test") .set("spark.ui.enabled", "true") .set("spark.ui.port", "0") - .set("spark.ui.retainedStages", "3") - 
.set("spark.ui.retainedJobs", "2") + .set(MAX_RETAINED_STAGES, 3) + .set(MAX_RETAINED_JOBS, 2) + .set(ASYNC_TRACKING_ENABLED, false) val sc = new SparkContext(conf) assert(sc.ui.isDefined) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala index c018fc8a332fa..fe0ad39c29025 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/StaticSQLConf.scala @@ -95,4 +95,11 @@ object StaticSQLConf { .stringConf .toSequence .createOptional + + val UI_RETAINED_EXECUTIONS = + buildStaticConf("spark.sql.ui.retainedExecutions") + .doc("Number of executions to retain in the Spark UI.") + .intConf + .createWithDefault(1000) + } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index 43cec4807ae4d..cf0000c6393a3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -27,14 +27,15 @@ import org.apache.spark.internal.Logging import org.apache.spark.scheduler._ import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric._ -import org.apache.spark.status.LiveEntity +import org.apache.spark.sql.internal.StaticSQLConf._ +import org.apache.spark.status.{ElementTrackingStore, KVUtils, LiveEntity} import org.apache.spark.status.config._ import org.apache.spark.ui.SparkUI import org.apache.spark.util.kvstore.KVStore private[sql] class SQLAppStatusListener( conf: SparkConf, - kvstore: KVStore, + kvstore: ElementTrackingStore, live: Boolean, ui: Option[SparkUI] = None) extends SparkListener with Logging { @@ -51,6 +52,23 @@ private[sql] class SQLAppStatusListener( private var uiInitialized = false + kvstore.addTrigger(classOf[SQLExecutionUIData], conf.get(UI_RETAINED_EXECUTIONS)) { count => + cleanupExecutions(count) + } + + kvstore.onFlush { + if (!live) { + val now = System.nanoTime() + liveExecutions.values.asScala.foreach { exec => + // This saves the partial aggregated metrics to the store; this works currently because + // when the SHS sees an updated event log, all old data for the application is thrown + // away. 
+ exec.metricsValues = aggregateMetrics(exec) + exec.write(kvstore, now) + } + } + } + override def onJobStart(event: SparkListenerJobStart): Unit = { val executionIdString = event.properties.getProperty(SQLExecution.EXECUTION_ID_KEY) if (executionIdString == null) { @@ -317,6 +335,17 @@ private[sql] class SQLAppStatusListener( } } + private def cleanupExecutions(count: Long): Unit = { + val countToDelete = count - conf.get(UI_RETAINED_EXECUTIONS) + if (countToDelete <= 0) { + return + } + + val toDelete = KVUtils.viewToSeq(kvstore.view(classOf[SQLExecutionUIData]), + countToDelete.toInt) { e => e.completionTime.isDefined } + toDelete.foreach { e => kvstore.delete(e.getClass(), e.executionId) } + } + } private class LiveExecutionData(val executionId: Long) extends LiveEntity { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala index 586d3ae411c74..7fd5f7395cdf3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala @@ -27,7 +27,7 @@ import com.fasterxml.jackson.databind.annotation.JsonDeserialize import org.apache.spark.{JobExecutionStatus, SparkConf} import org.apache.spark.scheduler.SparkListener -import org.apache.spark.status.AppStatusPlugin +import org.apache.spark.status.{AppStatusPlugin, ElementTrackingStore} import org.apache.spark.status.KVUtils.KVIndexParam import org.apache.spark.ui.SparkUI import org.apache.spark.util.Utils @@ -84,7 +84,7 @@ private[sql] class SQLAppStatusPlugin extends AppStatusPlugin { override def setupListeners( conf: SparkConf, - store: KVStore, + store: ElementTrackingStore, addListenerFn: SparkListener => Unit, live: Boolean): Unit = { // For live applications, the listener is installed in [[setupUI]]. This also avoids adding @@ -100,7 +100,8 @@ private[sql] class SQLAppStatusPlugin extends AppStatusPlugin { case Some(sc) => // If this is a live application, then install a listener that will enable the SQL // tab as soon as there's a SQL event posted to the bus. 
- val listener = new SQLAppStatusListener(sc.conf, ui.store.store, true, Some(ui)) + val listener = new SQLAppStatusListener(sc.conf, + ui.store.store.asInstanceOf[ElementTrackingStore], true, Some(ui)) sc.listenerBus.addToStatusQueue(listener) case _ => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala index eba8d55daad58..932950687942c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala @@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.sql.execution.{LeafExecNode, QueryExecution, SparkPlanInfo, SQLExecution} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.status.ElementTrackingStore import org.apache.spark.status.config._ import org.apache.spark.util.{AccumulatorMetadata, JsonProtocol, LongAccumulator} import org.apache.spark.util.kvstore.InMemoryStore @@ -43,7 +44,9 @@ import org.apache.spark.util.kvstore.InMemoryStore class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTestUtils { import testImplicits._ - override protected def sparkConf = super.sparkConf.set(LIVE_ENTITY_UPDATE_PERIOD, 0L) + override protected def sparkConf = { + super.sparkConf.set(LIVE_ENTITY_UPDATE_PERIOD, 0L).set(ASYNC_TRACKING_ENABLED, false) + } private def createTestDataFrame: DataFrame = { Seq( @@ -107,10 +110,12 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest private def sqlStoreTest(name: String) (fn: (SQLAppStatusStore, SparkListenerBus) => Unit): Unit = { test(name) { - val store = new InMemoryStore() + val conf = sparkConf + val store = new ElementTrackingStore(new InMemoryStore(), conf) val bus = new ReplayListenerBus() - val listener = new SQLAppStatusListener(sparkConf, store, true) + val listener = new SQLAppStatusListener(conf, store, true) bus.addListener(listener) + store.close(false) val sqlStore = new SQLAppStatusStore(store, Some(listener)) fn(sqlStore, bus) } @@ -491,15 +496,15 @@ private case class MyPlan(sc: SparkContext, expectedValue: Long) extends LeafExe class SQLListenerMemoryLeakSuite extends SparkFunSuite { - // TODO: this feature is not yet available in SQLAppStatusStore. - ignore("no memory leak") { - quietly { - val conf = new SparkConf() - .setMaster("local") - .setAppName("test") - .set(config.MAX_TASK_FAILURES, 1) // Don't retry the tasks to run this test quickly - .set("spark.sql.ui.retainedExecutions", "50") // Set it to 50 to run this test quickly - withSpark(new SparkContext(conf)) { sc => + test("no memory leak") { + val conf = new SparkConf() + .setMaster("local") + .setAppName("test") + .set(config.MAX_TASK_FAILURES, 1) // Don't retry the tasks to run this test quickly + .set("spark.sql.ui.retainedExecutions", "50") // Set it to 50 to run this test quickly + .set(ASYNC_TRACKING_ENABLED, false) + withSpark(new SparkContext(conf)) { sc => + quietly { val spark = new SparkSession(sc) import spark.implicits._ // Run 100 successful executions and 100 failed executions. 
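
Editorial aside on the ElementTrackingStore API introduced by the patch above: the snippet below is a minimal, hypothetical Scala sketch of how a writer is expected to drive the new store, i.e. register a count-based trigger, register a flush hook, and opt into trigger checking on write. `SessionData` and `ElementTrackingExample` are invented for illustration only; the calls used (`addTrigger`, `onFlush`, `write(value, checkTriggers)`, `close(closeParent)`) are the ones added in this patch, and async tracking is disabled so the trigger runs on the calling thread, mirroring `ElementTrackingStoreSuite`.

```
package org.apache.spark.status

import org.apache.spark.SparkConf
import org.apache.spark.util.kvstore.{InMemoryStore, KVIndex}

// Hypothetical element type used only for this sketch; real callers track
// types such as JobDataWrapper or TaskDataWrapper.
class SessionData(val id: Long) {
  @KVIndex def key: Long = id
}

object ElementTrackingExample {
  def main(args: Array[String]): Unit = {
    // Run triggers on the calling thread so the effect is deterministic.
    val conf = new SparkConf().set("spark.appStateStore.asyncTracking.enable", "false")
    val store = new ElementTrackingStore(new InMemoryStore(), conf)

    // Fire once more than two SessionData elements are known to the store; the
    // action receives the current count and would normally delete old entries.
    store.addTrigger(classOf[SessionData], 2) { count =>
      println(s"would clean up: store now holds $count SessionData elements")
    }

    // Flush hooks run synchronously when the store is closed.
    store.onFlush {
      println("flushing partial state before close")
    }

    // Writers opt into trigger checking explicitly; the third write crosses the
    // threshold and fires the trigger registered above.
    (1L to 3L).foreach { i => store.write(new SessionData(i), checkTriggers = true) }

    // Run the flush hooks but leave the wrapped InMemoryStore open.
    store.close(false)
  }
}
```

Keeping the store generic this way is what lets the core UI listener and the SQL listener share one eviction mechanism, as the AppStatusListener and SQLAppStatusListener changes in this patch show.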
From fbfa9be7e0df0b4489571422c45d0d64d05d3050 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 19 Dec 2017 07:30:29 +0900 Subject: [PATCH 138/356] Revert "Revert "[SPARK-22496][SQL] thrift server adds operation logs"" This reverts commit e58f275678fb4f904124a4a2a1762f04c835eb0e. --- .../cli/operation/ExecuteStatementOperation.java | 13 +++++++++++++ .../hive/service/cli/operation/SQLOperation.java | 12 ------------ .../SparkExecuteStatementOperation.scala | 1 + 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java index 3f2de108f069a..6740d3bb59dc3 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/ExecuteStatementOperation.java @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.ql.processors.CommandProcessor; import org.apache.hadoop.hive.ql.processors.CommandProcessorFactory; +import org.apache.hadoop.hive.ql.session.OperationLog; import org.apache.hive.service.cli.HiveSQLException; import org.apache.hive.service.cli.OperationType; import org.apache.hive.service.cli.session.HiveSession; @@ -67,4 +68,16 @@ protected void setConfOverlay(Map confOverlay) { this.confOverlay = confOverlay; } } + + protected void registerCurrentOperationLog() { + if (isOperationLogEnabled) { + if (operationLog == null) { + LOG.warn("Failed to get current OperationLog object of Operation: " + + getHandle().getHandleIdentifier()); + isOperationLogEnabled = false; + return; + } + OperationLog.setCurrentOperationLog(operationLog); + } + } } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java index 5014cedd870b6..fd9108eb53ca9 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java @@ -274,18 +274,6 @@ private Hive getSessionHive() throws HiveSQLException { } } - private void registerCurrentOperationLog() { - if (isOperationLogEnabled) { - if (operationLog == null) { - LOG.warn("Failed to get current OperationLog object of Operation: " + - getHandle().getHandleIdentifier()); - isOperationLogEnabled = false; - return; - } - OperationLog.setCurrentOperationLog(operationLog); - } - } - private void cleanup(OperationState state) throws HiveSQLException { setState(state); if (shouldRunAsync()) { diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index f5191fa9132bd..664bc20601eaa 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -170,6 +170,7 @@ private[hive] class SparkExecuteStatementOperation( override def run(): Unit = { val doAsAction = new PrivilegedExceptionAction[Unit]() { override def run(): Unit = { + registerCurrentOperationLog() try { execute() } catch { From 
3a07eff5af601511e97a05e6fea0e3d48f74c4f0 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Tue, 19 Dec 2017 07:35:03 +0900 Subject: [PATCH 139/356] [SPARK-22813][BUILD] Use lsof or /usr/sbin/lsof in run-tests.py ## What changes were proposed in this pull request? In [the environment where `/usr/sbin/lsof` does not exist](https://github.com/apache/spark/pull/19695#issuecomment-342865001), `./dev/run-tests.py` for `maven` causes the following error. This is because the current `./dev/run-tests.py` checks existence of only `/usr/sbin/lsof` and aborts immediately if it does not exist. This PR changes to check whether `lsof` or `/usr/sbin/lsof` exists. ``` /bin/sh: 1: /usr/sbin/lsof: not found Usage: kill [options] [...] Options: [...] send signal to every listed -, -s, --signal specify the to be sent -l, --list=[] list all signal names, or convert one to a name -L, --table list all signal names in a nice table -h, --help display this help and exit -V, --version output version information and exit For more details see kill(1). Traceback (most recent call last): File "./dev/run-tests.py", line 626, in main() File "./dev/run-tests.py", line 597, in main build_apache_spark(build_tool, hadoop_version) File "./dev/run-tests.py", line 389, in build_apache_spark build_spark_maven(hadoop_version) File "./dev/run-tests.py", line 329, in build_spark_maven exec_maven(profiles_and_goals) File "./dev/run-tests.py", line 270, in exec_maven kill_zinc_on_port(zinc_port) File "./dev/run-tests.py", line 258, in kill_zinc_on_port subprocess.check_call(cmd, shell=True) File "/usr/lib/python2.7/subprocess.py", line 541, in check_call raise CalledProcessError(retcode, cmd) subprocess.CalledProcessError: Command '/usr/sbin/lsof -P |grep 3156 | grep LISTEN | awk '{ print $2; }' | xargs kill' returned non-zero exit status 123 ``` ## How was this patch tested? manually tested Author: Kazuaki Ishizaki Closes #19998 from kiszk/SPARK-22813. --- dev/run-tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index ef0e788a91606..7e6f7ff060351 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -253,9 +253,9 @@ def kill_zinc_on_port(zinc_port): """ Kill the Zinc process running on the given port, if one exists. """ - cmd = ("/usr/sbin/lsof -P |grep %s | grep LISTEN " - "| awk '{ print $2; }' | xargs kill") % zinc_port - subprocess.check_call(cmd, shell=True) + cmd = "%s -P |grep %s | grep LISTEN | awk '{ print $2; }' | xargs kill" + lsof_exe = which("lsof") + subprocess.check_call(cmd % (lsof_exe if lsof_exe else "/usr/sbin/lsof", zinc_port), shell=True) def exec_maven(mvn_args=()): From 0609dcc0382227f65e8a3340e33db438f7f1e6e7 Mon Sep 17 00:00:00 2001 From: foxish Date: Mon, 18 Dec 2017 15:31:47 -0800 Subject: [PATCH 140/356] [SPARK-22777][SCHEDULER] Kubernetes mode dockerfile permission and distribution # What changes were proposed in this pull request? 1. entrypoint.sh for Kubernetes spark-base image is marked as executable (644 -> 755) 2. make-distribution script will now create kubernetes/dockerfiles directory when Kubernetes support is compiled. ## How was this patch tested? Manual testing cc/ ueshin jiangxb1987 mridulm vanzin rxin liyinan926 Author: foxish Closes #20007 from foxish/fix-dockerfiles. 
--- dev/make-distribution.sh | 8 +++++++- .../docker/src/main/dockerfiles/spark-base/entrypoint.sh | 0 2 files changed, 7 insertions(+), 1 deletion(-) mode change 100644 => 100755 resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 48a824499acb9..7245163ea2a51 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -168,12 +168,18 @@ echo "Build flags: $@" >> "$DISTDIR/RELEASE" # Copy jars cp "$SPARK_HOME"/assembly/target/scala*/jars/* "$DISTDIR/jars/" -# Only create the yarn directory if the yarn artifacts were build. +# Only create the yarn directory if the yarn artifacts were built. if [ -f "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar ]; then mkdir "$DISTDIR/yarn" cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/yarn" fi +# Only create and copy the dockerfiles directory if the kubernetes artifacts were built. +if [ -d "$SPARK_HOME"/resource-managers/kubernetes/core/target/ ]; then + mkdir -p "$DISTDIR/kubernetes/" + cp -a "$SPARK_HOME"/resource-managers/kubernetes/docker/src/main/dockerfiles "$DISTDIR/kubernetes/" +fi + # Copy examples and dependencies mkdir -p "$DISTDIR/examples/jars" cp "$SPARK_HOME"/examples/target/scala*/jars/* "$DISTDIR/examples/jars" diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh old mode 100644 new mode 100755 From d4e69595dd17e4c31757410f91274122497dddbd Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 19 Dec 2017 09:48:31 +0800 Subject: [PATCH 141/356] [MINOR][SQL] Remove Useless zipWithIndex from ResolveAliases ## What changes were proposed in this pull request? Remove useless `zipWithIndex` from `ResolveAliases `. ## How was this patch tested? The existing tests Author: gatorsmile Closes #20009 from gatorsmile/try22. 
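
As a small standalone illustration (plain Scala, not the Analyzer code itself) of why dropping `zipWithIndex` is behavior-preserving here: when the index produced by `zipWithIndex` is never read inside the mapping function, pairing every element with its position is pure overhead, and a plain `map` yields the same result.

```
val exprs = Seq("col1", "col2", "col3")

// The index `i` is bound but never used, which is exactly the pattern removed below.
val viaZipWithIndex = exprs.zipWithIndex.map { case (e, i) => e.toUpperCase }
val viaPlainMap = exprs.map(_.toUpperCase)

assert(viaZipWithIndex == viaPlainMap)
```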
--- .../sql/catalyst/analysis/Analyzer.scala | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 0d5e866c0683e..10b237fb22b96 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -222,22 +222,20 @@ class Analyzer( */ object ResolveAliases extends Rule[LogicalPlan] { private def assignAliases(exprs: Seq[NamedExpression]) = { - exprs.zipWithIndex.map { - case (expr, i) => - expr.transformUp { case u @ UnresolvedAlias(child, optGenAliasFunc) => - child match { - case ne: NamedExpression => ne - case go @ GeneratorOuter(g: Generator) if g.resolved => MultiAlias(go, Nil) - case e if !e.resolved => u - case g: Generator => MultiAlias(g, Nil) - case c @ Cast(ne: NamedExpression, _, _) => Alias(c, ne.name)() - case e: ExtractValue => Alias(e, toPrettySQL(e))() - case e if optGenAliasFunc.isDefined => - Alias(child, optGenAliasFunc.get.apply(e))() - case e => Alias(e, toPrettySQL(e))() - } + exprs.map(_.transformUp { case u @ UnresolvedAlias(child, optGenAliasFunc) => + child match { + case ne: NamedExpression => ne + case go @ GeneratorOuter(g: Generator) if g.resolved => MultiAlias(go, Nil) + case e if !e.resolved => u + case g: Generator => MultiAlias(g, Nil) + case c @ Cast(ne: NamedExpression, _, _) => Alias(c, ne.name)() + case e: ExtractValue => Alias(e, toPrettySQL(e))() + case e if optGenAliasFunc.isDefined => + Alias(child, optGenAliasFunc.get.apply(e))() + case e => Alias(e, toPrettySQL(e))() } - }.asInstanceOf[Seq[NamedExpression]] + } + ).asInstanceOf[Seq[NamedExpression]] } private def hasUnresolvedAlias(exprs: Seq[NamedExpression]) = From ab7346f20c515c5af63b9400a9fe6e3a72138ef3 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 19 Dec 2017 21:51:56 +0800 Subject: [PATCH 142/356] [SPARK-22673][SQL] InMemoryRelation should utilize existing stats whenever possible ## What changes were proposed in this pull request? The current implementation of InMemoryRelation always uses the most expensive execution plan when writing cache With CBO enabled, we can actually have a more exact estimation of the underlying table size... ## How was this patch tested? existing test Author: CodingCat Author: Nan Zhu Author: Nan Zhu Closes #19864 from CodingCat/SPARK-22673. 
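
Before the diff, a hedged sketch of how the change surfaces to users. The table name `t` is made up, `spark` is assumed to be an active SparkSession (e.g. in spark-shell), and the snippet relies on the 2.3-era `LogicalPlan.stats` method with no arguments, as this patch itself does; the point is only that once statistics exist for the plan being cached, the cached relation can report them before the columnar buffers are materialized instead of falling back to `spark.sql.defaultSizeInBytes`.

```
// Assumes an existing table `t`; names are illustrative only.
spark.sql("ANALYZE TABLE t COMPUTE STATISTICS")

// cache() registers an InMemoryRelation; the columnar data is built lazily.
val df = spark.table("t").cache()

// Before any action runs, the cached relation now reports the stats of the
// plan being cached rather than the blunt defaultSizeInBytes fallback.
val sizeBeforeMaterialization = df.queryExecution.optimizedPlan.stats.sizeInBytes
println(s"estimated size before materialization: $sizeBeforeMaterialization")

// Materializes the cache; from here on the accumulated batch size (batchStats)
// drives the estimate instead.
df.count()
```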
--- .../spark/sql/execution/CacheManager.scala | 20 +++--- .../execution/columnar/InMemoryRelation.scala | 21 ++++--- .../columnar/InMemoryColumnarQuerySuite.scala | 61 ++++++++++++++++--- 3 files changed, 75 insertions(+), 27 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index 5a1d680c99f66..b05fe49a6ac3b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -26,7 +26,7 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.expressions.SubqueryExpression -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.execution.columnar.InMemoryRelation import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.storage.StorageLevel @@ -94,14 +94,13 @@ class CacheManager extends Logging { logWarning("Asked to cache already cached data.") } else { val sparkSession = query.sparkSession - cachedData.add(CachedData( - planToCache, - InMemoryRelation( - sparkSession.sessionState.conf.useCompression, - sparkSession.sessionState.conf.columnBatchSize, - storageLevel, - sparkSession.sessionState.executePlan(planToCache).executedPlan, - tableName))) + val inMemoryRelation = InMemoryRelation( + sparkSession.sessionState.conf.useCompression, + sparkSession.sessionState.conf.columnBatchSize, storageLevel, + sparkSession.sessionState.executePlan(planToCache).executedPlan, + tableName, + planToCache.stats) + cachedData.add(CachedData(planToCache, inMemoryRelation)) } } @@ -148,7 +147,8 @@ class CacheManager extends Logging { batchSize = cd.cachedRepresentation.batchSize, storageLevel = cd.cachedRepresentation.storageLevel, child = spark.sessionState.executePlan(cd.plan).executedPlan, - tableName = cd.cachedRepresentation.tableName) + tableName = cd.cachedRepresentation.tableName, + statsOfPlanToCache = cd.plan.stats) needToRecache += cd.copy(cachedRepresentation = newCache) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala index a1c62a729900e..51928d914841e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryRelation.scala @@ -37,8 +37,10 @@ object InMemoryRelation { batchSize: Int, storageLevel: StorageLevel, child: SparkPlan, - tableName: Option[String]): InMemoryRelation = - new InMemoryRelation(child.output, useCompression, batchSize, storageLevel, child, tableName)() + tableName: Option[String], + statsOfPlanToCache: Statistics): InMemoryRelation = + new InMemoryRelation(child.output, useCompression, batchSize, storageLevel, child, tableName)( + statsOfPlanToCache = statsOfPlanToCache) } @@ -60,7 +62,8 @@ case class InMemoryRelation( @transient child: SparkPlan, tableName: Option[String])( @transient var _cachedColumnBuffers: RDD[CachedBatch] = null, - val batchStats: LongAccumulator = child.sqlContext.sparkContext.longAccumulator) + val batchStats: LongAccumulator = child.sqlContext.sparkContext.longAccumulator, + 
statsOfPlanToCache: Statistics = null) extends logical.LeafNode with MultiInstanceRelation { override protected def innerChildren: Seq[SparkPlan] = Seq(child) @@ -71,9 +74,8 @@ case class InMemoryRelation( override def computeStats(): Statistics = { if (batchStats.value == 0L) { - // Underlying columnar RDD hasn't been materialized, no useful statistics information - // available, return the default statistics. - Statistics(sizeInBytes = child.sqlContext.conf.defaultSizeInBytes) + // Underlying columnar RDD hasn't been materialized, use the stats from the plan to cache + statsOfPlanToCache } else { Statistics(sizeInBytes = batchStats.value.longValue) } @@ -142,7 +144,7 @@ case class InMemoryRelation( def withOutput(newOutput: Seq[Attribute]): InMemoryRelation = { InMemoryRelation( newOutput, useCompression, batchSize, storageLevel, child, tableName)( - _cachedColumnBuffers, batchStats) + _cachedColumnBuffers, batchStats, statsOfPlanToCache) } override def newInstance(): this.type = { @@ -154,11 +156,12 @@ case class InMemoryRelation( child, tableName)( _cachedColumnBuffers, - batchStats).asInstanceOf[this.type] + batchStats, + statsOfPlanToCache).asInstanceOf[this.type] } def cachedColumnBuffers: RDD[CachedBatch] = _cachedColumnBuffers override protected def otherCopyArgs: Seq[AnyRef] = - Seq(_cachedColumnBuffers, batchStats) + Seq(_cachedColumnBuffers, batchStats, statsOfPlanToCache) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index e662e294228db..ff7c5e58e9863 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData._ import org.apache.spark.sql.types._ import org.apache.spark.storage.StorageLevel._ +import org.apache.spark.util.Utils class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { import testImplicits._ @@ -40,7 +41,8 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { data.createOrReplaceTempView(s"testData$dataType") val storageLevel = MEMORY_ONLY val plan = spark.sessionState.executePlan(data.logicalPlan).sparkPlan - val inMemoryRelation = InMemoryRelation(useCompression = true, 5, storageLevel, plan, None) + val inMemoryRelation = InMemoryRelation(useCompression = true, 5, storageLevel, plan, None, + data.logicalPlan.stats) assert(inMemoryRelation.cachedColumnBuffers.getStorageLevel == storageLevel) inMemoryRelation.cachedColumnBuffers.collect().head match { @@ -116,7 +118,8 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("simple columnar query") { val plan = spark.sessionState.executePlan(testData.logicalPlan).sparkPlan - val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None) + val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None, + testData.logicalPlan.stats) checkAnswer(scan, testData.collect().toSeq) } @@ -132,8 +135,10 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { } test("projection") { - val plan = spark.sessionState.executePlan(testData.select('value, 'key).logicalPlan).sparkPlan - val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None) + val logicalPlan = 
testData.select('value, 'key).logicalPlan + val plan = spark.sessionState.executePlan(logicalPlan).sparkPlan + val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None, + logicalPlan.stats) checkAnswer(scan, testData.collect().map { case Row(key: Int, value: String) => value -> key @@ -149,7 +154,8 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-1436 regression: in-memory columns must be able to be accessed multiple times") { val plan = spark.sessionState.executePlan(testData.logicalPlan).sparkPlan - val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None) + val scan = InMemoryRelation(useCompression = true, 5, MEMORY_ONLY, plan, None, + testData.logicalPlan.stats) checkAnswer(scan, testData.collect().toSeq) checkAnswer(scan, testData.collect().toSeq) @@ -323,7 +329,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-17549: cached table size should be correctly calculated") { val data = spark.sparkContext.parallelize(1 to 10, 5).toDF() val plan = spark.sessionState.executePlan(data.logicalPlan).sparkPlan - val cached = InMemoryRelation(true, 5, MEMORY_ONLY, plan, None) + val cached = InMemoryRelation(true, 5, MEMORY_ONLY, plan, None, data.logicalPlan.stats) // Materialize the data. val expectedAnswer = data.collect() @@ -448,8 +454,8 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { test("SPARK-22249: buildFilter should not throw exception when In contains an empty list") { val attribute = AttributeReference("a", IntegerType)() - val testRelation = InMemoryRelation(false, 1, MEMORY_ONLY, - LocalTableScanExec(Seq(attribute), Nil), None) + val localTableScanExec = LocalTableScanExec(Seq(attribute), Nil) + val testRelation = InMemoryRelation(false, 1, MEMORY_ONLY, localTableScanExec, None, null) val tableScanExec = InMemoryTableScanExec(Seq(attribute), Seq(In(attribute, Nil)), testRelation) assert(tableScanExec.partitionFilters.isEmpty) @@ -479,4 +485,43 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { } } } + + test("SPARK-22673: InMemoryRelation should utilize existing stats of the plan to be cached") { + withSQLConf("spark.sql.cbo.enabled" -> "true") { + withTempPath { workDir => + withTable("table1") { + val workDirPath = workDir.getAbsolutePath + val data = Seq(100, 200, 300, 400).toDF("count") + data.write.parquet(workDirPath) + val dfFromFile = spark.read.parquet(workDirPath).cache() + val inMemoryRelation = dfFromFile.queryExecution.optimizedPlan.collect { + case plan: InMemoryRelation => plan + }.head + // InMemoryRelation's stats is file size before the underlying RDD is materialized + assert(inMemoryRelation.computeStats().sizeInBytes === 740) + + // InMemoryRelation's stats is updated after materializing RDD + dfFromFile.collect() + assert(inMemoryRelation.computeStats().sizeInBytes === 16) + + // test of catalog table + val dfFromTable = spark.catalog.createTable("table1", workDirPath).cache() + val inMemoryRelation2 = dfFromTable.queryExecution.optimizedPlan. 
+ collect { case plan: InMemoryRelation => plan }.head + + // Even CBO enabled, InMemoryRelation's stats keeps as the file size before table's stats + // is calculated + assert(inMemoryRelation2.computeStats().sizeInBytes === 740) + + // InMemoryRelation's stats should be updated after calculating stats of the table + // clear cache to simulate a fresh environment + dfFromTable.unpersist(blocking = true) + spark.sql("ANALYZE TABLE table1 COMPUTE STATISTICS") + val inMemoryRelation3 = spark.read.table("table1").cache().queryExecution.optimizedPlan. + collect { case plan: InMemoryRelation => plan }.head + assert(inMemoryRelation3.computeStats().sizeInBytes === 48) + } + } + } + } } From 571aa275541d71dbef8f0c86eab4ef04d56e4394 Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Tue, 19 Dec 2017 21:55:21 +0800 Subject: [PATCH 143/356] [SPARK-21984][SQL] Join estimation based on equi-height histogram ## What changes were proposed in this pull request? Equi-height histogram is one of the state-of-the-art statistics for cardinality estimation, which can provide better estimation accuracy, and good at cases with skew data. This PR is to improve join estimation based on equi-height histogram. The difference from basic estimation (based on ndv) is the logic for computing join cardinality and the new ndv after join. The main idea is as follows: 1. find overlapped ranges between two histograms from two join keys; 2. apply the formula `T(A IJ B) = T(A) * T(B) / max(V(A.k1), V(B.k1))` in each overlapped range. ## How was this patch tested? Added new test cases. Author: Zhenhua Wang Closes #19594 from wzhfy/join_estimation_histogram. --- .../statsEstimation/EstimationUtils.scala | 169 ++++++++++++++ .../statsEstimation/JoinEstimation.scala | 54 ++++- .../statsEstimation/JoinEstimationSuite.scala | 209 +++++++++++++++++- 3 files changed, 428 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala index 6f868cbd072c8..71e852afe0659 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/EstimationUtils.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.plans.logical.statsEstimation +import scala.collection.mutable.ArrayBuffer import scala.math.BigDecimal.RoundingMode import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap} @@ -212,4 +213,172 @@ object EstimationUtils { } } + /** + * Returns overlapped ranges between two histograms, in the given value range + * [lowerBound, upperBound]. + */ + def getOverlappedRanges( + leftHistogram: Histogram, + rightHistogram: Histogram, + lowerBound: Double, + upperBound: Double): Seq[OverlappedRange] = { + val overlappedRanges = new ArrayBuffer[OverlappedRange]() + // Only bins whose range intersect [lowerBound, upperBound] have join possibility. 
+ val leftBins = leftHistogram.bins + .filter(b => b.lo <= upperBound && b.hi >= lowerBound) + val rightBins = rightHistogram.bins + .filter(b => b.lo <= upperBound && b.hi >= lowerBound) + + leftBins.foreach { lb => + rightBins.foreach { rb => + val (left, leftHeight) = trimBin(lb, leftHistogram.height, lowerBound, upperBound) + val (right, rightHeight) = trimBin(rb, rightHistogram.height, lowerBound, upperBound) + // Only collect overlapped ranges. + if (left.lo <= right.hi && left.hi >= right.lo) { + // Collect overlapped ranges. + val range = if (right.lo >= left.lo && right.hi >= left.hi) { + // Case1: the left bin is "smaller" than the right bin + // left.lo right.lo left.hi right.hi + // --------+------------------+------------+----------------+-------> + if (left.hi == right.lo) { + // The overlapped range has only one value. + OverlappedRange( + lo = right.lo, + hi = right.lo, + leftNdv = 1, + rightNdv = 1, + leftNumRows = leftHeight / left.ndv, + rightNumRows = rightHeight / right.ndv + ) + } else { + val leftRatio = (left.hi - right.lo) / (left.hi - left.lo) + val rightRatio = (left.hi - right.lo) / (right.hi - right.lo) + OverlappedRange( + lo = right.lo, + hi = left.hi, + leftNdv = left.ndv * leftRatio, + rightNdv = right.ndv * rightRatio, + leftNumRows = leftHeight * leftRatio, + rightNumRows = rightHeight * rightRatio + ) + } + } else if (right.lo <= left.lo && right.hi <= left.hi) { + // Case2: the left bin is "larger" than the right bin + // right.lo left.lo right.hi left.hi + // --------+------------------+------------+----------------+-------> + if (right.hi == left.lo) { + // The overlapped range has only one value. + OverlappedRange( + lo = right.hi, + hi = right.hi, + leftNdv = 1, + rightNdv = 1, + leftNumRows = leftHeight / left.ndv, + rightNumRows = rightHeight / right.ndv + ) + } else { + val leftRatio = (right.hi - left.lo) / (left.hi - left.lo) + val rightRatio = (right.hi - left.lo) / (right.hi - right.lo) + OverlappedRange( + lo = left.lo, + hi = right.hi, + leftNdv = left.ndv * leftRatio, + rightNdv = right.ndv * rightRatio, + leftNumRows = leftHeight * leftRatio, + rightNumRows = rightHeight * rightRatio + ) + } + } else if (right.lo >= left.lo && right.hi <= left.hi) { + // Case3: the left bin contains the right bin + // left.lo right.lo right.hi left.hi + // --------+------------------+------------+----------------+-------> + val leftRatio = (right.hi - right.lo) / (left.hi - left.lo) + OverlappedRange( + lo = right.lo, + hi = right.hi, + leftNdv = left.ndv * leftRatio, + rightNdv = right.ndv, + leftNumRows = leftHeight * leftRatio, + rightNumRows = rightHeight + ) + } else { + assert(right.lo <= left.lo && right.hi >= left.hi) + // Case4: the right bin contains the left bin + // right.lo left.lo left.hi right.hi + // --------+------------------+------------+----------------+-------> + val rightRatio = (left.hi - left.lo) / (right.hi - right.lo) + OverlappedRange( + lo = left.lo, + hi = left.hi, + leftNdv = left.ndv, + rightNdv = right.ndv * rightRatio, + leftNumRows = leftHeight, + rightNumRows = rightHeight * rightRatio + ) + } + overlappedRanges += range + } + } + } + overlappedRanges + } + + /** + * Given an original bin and a value range [lowerBound, upperBound], returns the trimmed part + * of the bin in that range and its number of rows. + * @param bin the input histogram bin. + * @param height the number of rows of the given histogram bin inside an equi-height histogram. + * @param lowerBound lower bound of the given range. 
+ * @param upperBound upper bound of the given range. + * @return trimmed part of the given bin and its number of rows. + */ + def trimBin(bin: HistogramBin, height: Double, lowerBound: Double, upperBound: Double) + : (HistogramBin, Double) = { + val (lo, hi) = if (bin.lo <= lowerBound && bin.hi >= upperBound) { + // bin.lo lowerBound upperBound bin.hi + // --------+------------------+------------+-------------+-------> + (lowerBound, upperBound) + } else if (bin.lo <= lowerBound && bin.hi >= lowerBound) { + // bin.lo lowerBound bin.hi upperBound + // --------+------------------+------------+-------------+-------> + (lowerBound, bin.hi) + } else if (bin.lo <= upperBound && bin.hi >= upperBound) { + // lowerBound bin.lo upperBound bin.hi + // --------+------------------+------------+-------------+-------> + (bin.lo, upperBound) + } else { + // lowerBound bin.lo bin.hi upperBound + // --------+------------------+------------+-------------+-------> + assert(bin.lo >= lowerBound && bin.hi <= upperBound) + (bin.lo, bin.hi) + } + + if (hi == lo) { + // Note that bin.hi == bin.lo also falls into this branch. + (HistogramBin(lo, hi, 1), height / bin.ndv) + } else { + assert(bin.hi != bin.lo) + val ratio = (hi - lo) / (bin.hi - bin.lo) + (HistogramBin(lo, hi, math.ceil(bin.ndv * ratio).toLong), height * ratio) + } + } + + /** + * A join between two equi-height histograms may produce multiple overlapped ranges. + * Each overlapped range is produced by a part of one bin in the left histogram and a part of + * one bin in the right histogram. + * @param lo lower bound of this overlapped range. + * @param hi higher bound of this overlapped range. + * @param leftNdv ndv in the left part. + * @param rightNdv ndv in the right part. + * @param leftNumRows number of rows in the left part. + * @param rightNumRows number of rows in the right part. 
+ */ + case class OverlappedRange( + lo: Double, + hi: Double, + leftNdv: Double, + rightNdv: Double, + leftNumRows: Double, + rightNumRows: Double) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala index b073108c26ee5..f0294a4246703 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala @@ -24,7 +24,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference, Expression} import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Join, Statistics} +import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Histogram, Join, Statistics} import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ @@ -191,8 +191,19 @@ case class JoinEstimation(join: Join) extends Logging { val rInterval = ValueInterval(rightKeyStat.min, rightKeyStat.max, rightKey.dataType) if (ValueInterval.isIntersected(lInterval, rInterval)) { val (newMin, newMax) = ValueInterval.intersect(lInterval, rInterval, leftKey.dataType) - val (card, joinStat) = computeByNdv(leftKey, rightKey, newMin, newMax) - keyStatsAfterJoin += (leftKey -> joinStat, rightKey -> joinStat) + val (card, joinStat) = (leftKeyStat.histogram, rightKeyStat.histogram) match { + case (Some(l: Histogram), Some(r: Histogram)) => + computeByHistogram(leftKey, rightKey, l, r, newMin, newMax) + case _ => + computeByNdv(leftKey, rightKey, newMin, newMax) + } + keyStatsAfterJoin += ( + // Histograms are propagated as unchanged. During future estimation, they should be + // truncated by the updated max/min. In this way, only pointers of the histograms are + // propagated and thus reduce memory consumption. + leftKey -> joinStat.copy(histogram = leftKeyStat.histogram), + rightKey -> joinStat.copy(histogram = rightKeyStat.histogram) + ) // Return cardinality estimated from the most selective join keys. if (card < joinCard) joinCard = card } else { @@ -225,6 +236,43 @@ case class JoinEstimation(join: Join) extends Logging { (ceil(card), newStats) } + /** Compute join cardinality using equi-height histograms. */ + private def computeByHistogram( + leftKey: AttributeReference, + rightKey: AttributeReference, + leftHistogram: Histogram, + rightHistogram: Histogram, + newMin: Option[Any], + newMax: Option[Any]): (BigInt, ColumnStat) = { + val overlappedRanges = getOverlappedRanges( + leftHistogram = leftHistogram, + rightHistogram = rightHistogram, + // Only numeric values have equi-height histograms. + lowerBound = newMin.get.toString.toDouble, + upperBound = newMax.get.toString.toDouble) + + var card: BigDecimal = 0 + var totalNdv: Double = 0 + for (i <- overlappedRanges.indices) { + val range = overlappedRanges(i) + if (i == 0 || range.hi != overlappedRanges(i - 1).hi) { + // If range.hi == overlappedRanges(i - 1).hi, that means the current range has only one + // value, and this value is already counted in the previous range. So there is no need to + // count it in this range. + totalNdv += math.min(range.leftNdv, range.rightNdv) + } + // Apply the formula in this overlapped range. 
+ card += range.leftNumRows * range.rightNumRows / math.max(range.leftNdv, range.rightNdv) + } + + val leftKeyStat = leftStats.attributeStats(leftKey) + val rightKeyStat = rightStats.attributeStats(rightKey) + val newMaxLen = math.min(leftKeyStat.maxLen, rightKeyStat.maxLen) + val newAvgLen = (leftKeyStat.avgLen + rightKeyStat.avgLen) / 2 + val newStats = ColumnStat(ceil(totalNdv), newMin, newMax, 0, newAvgLen, newMaxLen) + (ceil(card), newStats) + } + /** * Propagate or update column stats for output attributes. */ diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala index 097c78eb27fca..26139d85d25fb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/JoinEstimationSuite.scala @@ -23,7 +23,7 @@ import scala.collection.mutable import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeMap, AttributeReference, EqualTo} import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Join, Project, Statistics} +import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types.{DateType, TimestampType, _} @@ -67,6 +67,213 @@ class JoinEstimationSuite extends StatsEstimationTestBase { rowCount = 2, attributeStats = AttributeMap(Seq("key-1-2", "key-2-3").map(nameToColInfo))) + private def estimateByHistogram( + leftHistogram: Histogram, + rightHistogram: Histogram, + expectedMin: Double, + expectedMax: Double, + expectedNdv: Long, + expectedRows: Long): Unit = { + val col1 = attr("key1") + val col2 = attr("key2") + val c1 = generateJoinChild(col1, leftHistogram, expectedMin, expectedMax) + val c2 = generateJoinChild(col2, rightHistogram, expectedMin, expectedMax) + + val c1JoinC2 = Join(c1, c2, Inner, Some(EqualTo(col1, col2))) + val c2JoinC1 = Join(c2, c1, Inner, Some(EqualTo(col2, col1))) + val expectedStatsAfterJoin = Statistics( + sizeInBytes = expectedRows * (8 + 2 * 4), + rowCount = Some(expectedRows), + attributeStats = AttributeMap(Seq( + col1 -> c1.stats.attributeStats(col1).copy( + distinctCount = expectedNdv, min = Some(expectedMin), max = Some(expectedMax)), + col2 -> c2.stats.attributeStats(col2).copy( + distinctCount = expectedNdv, min = Some(expectedMin), max = Some(expectedMax)))) + ) + + // Join order should not affect estimation result. + Seq(c1JoinC2, c2JoinC1).foreach { join => + assert(join.stats == expectedStatsAfterJoin) + } + } + + private def generateJoinChild( + col: Attribute, + histogram: Histogram, + expectedMin: Double, + expectedMax: Double): LogicalPlan = { + val colStat = inferColumnStat(histogram) + StatsTestPlan( + outputList = Seq(col), + rowCount = (histogram.height * histogram.bins.length).toLong, + attributeStats = AttributeMap(Seq(col -> colStat))) + } + + /** Column statistics should be consistent with histograms in tests. 
*/ + private def inferColumnStat(histogram: Histogram): ColumnStat = { + var ndv = 0L + for (i <- histogram.bins.indices) { + val bin = histogram.bins(i) + if (i == 0 || bin.hi != histogram.bins(i - 1).hi) { + ndv += bin.ndv + } + } + ColumnStat(distinctCount = ndv, min = Some(histogram.bins.head.lo), + max = Some(histogram.bins.last.hi), nullCount = 0, avgLen = 4, maxLen = 4, + histogram = Some(histogram)) + } + + test("equi-height histograms: a bin is contained by another one") { + val histogram1 = Histogram(height = 300, Array( + HistogramBin(lo = 10, hi = 30, ndv = 10), HistogramBin(lo = 30, hi = 60, ndv = 30))) + val histogram2 = Histogram(height = 100, Array( + HistogramBin(lo = 0, hi = 50, ndv = 50), HistogramBin(lo = 50, hi = 100, ndv = 40))) + // test bin trimming + val (t0, h0) = trimBin(histogram2.bins(0), height = 100, lowerBound = 10, upperBound = 60) + assert(t0 == HistogramBin(lo = 10, hi = 50, ndv = 40) && h0 == 80) + val (t1, h1) = trimBin(histogram2.bins(1), height = 100, lowerBound = 10, upperBound = 60) + assert(t1 == HistogramBin(lo = 50, hi = 60, ndv = 8) && h1 == 20) + + val expectedRanges = Seq( + // histogram1.bins(0) overlaps t0 + OverlappedRange(10, 30, 10, 40 * 1 / 2, 300, 80 * 1 / 2), + // histogram1.bins(1) overlaps t0 + OverlappedRange(30, 50, 30 * 2 / 3, 40 * 1 / 2, 300 * 2 / 3, 80 * 1 / 2), + // histogram1.bins(1) overlaps t1 + OverlappedRange(50, 60, 30 * 1 / 3, 8, 300 * 1 / 3, 20) + ) + assert(expectedRanges.equals( + getOverlappedRanges(histogram1, histogram2, lowerBound = 10, upperBound = 60))) + + estimateByHistogram( + leftHistogram = histogram1, + rightHistogram = histogram2, + expectedMin = 10, + expectedMax = 60, + expectedNdv = 10 + 20 + 8, + expectedRows = 300 * 40 / 20 + 200 * 40 / 20 + 100 * 20 / 10) + } + + test("equi-height histograms: a bin has only one value after trimming") { + val histogram1 = Histogram(height = 300, Array( + HistogramBin(lo = 50, hi = 60, ndv = 10), HistogramBin(lo = 60, hi = 75, ndv = 3))) + val histogram2 = Histogram(height = 100, Array( + HistogramBin(lo = 0, hi = 50, ndv = 50), HistogramBin(lo = 50, hi = 100, ndv = 40))) + // test bin trimming + val (t0, h0) = trimBin(histogram2.bins(0), height = 100, lowerBound = 50, upperBound = 75) + assert(t0 == HistogramBin(lo = 50, hi = 50, ndv = 1) && h0 == 2) + val (t1, h1) = trimBin(histogram2.bins(1), height = 100, lowerBound = 50, upperBound = 75) + assert(t1 == HistogramBin(lo = 50, hi = 75, ndv = 20) && h1 == 50) + + val expectedRanges = Seq( + // histogram1.bins(0) overlaps t0 + OverlappedRange(50, 50, 1, 1, 300 / 10, 2), + // histogram1.bins(0) overlaps t1 + OverlappedRange(50, 60, 10, 20 * 10 / 25, 300, 50 * 10 / 25), + // histogram1.bins(1) overlaps t1 + OverlappedRange(60, 75, 3, 20 * 15 / 25, 300, 50 * 15 / 25) + ) + assert(expectedRanges.equals( + getOverlappedRanges(histogram1, histogram2, lowerBound = 50, upperBound = 75))) + + estimateByHistogram( + leftHistogram = histogram1, + rightHistogram = histogram2, + expectedMin = 50, + expectedMax = 75, + expectedNdv = 1 + 8 + 3, + expectedRows = 30 * 2 / 1 + 300 * 20 / 10 + 300 * 30 / 12) + } + + test("equi-height histograms: skew distribution (some bins have only one value)") { + val histogram1 = Histogram(height = 300, Array( + HistogramBin(lo = 30, hi = 30, ndv = 1), + HistogramBin(lo = 30, hi = 30, ndv = 1), + HistogramBin(lo = 30, hi = 60, ndv = 30))) + val histogram2 = Histogram(height = 100, Array( + HistogramBin(lo = 0, hi = 50, ndv = 50), HistogramBin(lo = 50, hi = 100, ndv = 40))) + // test bin trimming + 
val (t0, h0) = trimBin(histogram2.bins(0), height = 100, lowerBound = 30, upperBound = 60) + assert(t0 == HistogramBin(lo = 30, hi = 50, ndv = 20) && h0 == 40) + val (t1, h1) = trimBin(histogram2.bins(1), height = 100, lowerBound = 30, upperBound = 60) + assert(t1 ==HistogramBin(lo = 50, hi = 60, ndv = 8) && h1 == 20) + + val expectedRanges = Seq( + OverlappedRange(30, 30, 1, 1, 300, 40 / 20), + OverlappedRange(30, 30, 1, 1, 300, 40 / 20), + OverlappedRange(30, 50, 30 * 2 / 3, 20, 300 * 2 / 3, 40), + OverlappedRange(50, 60, 30 * 1 / 3, 8, 300 * 1 / 3, 20) + ) + assert(expectedRanges.equals( + getOverlappedRanges(histogram1, histogram2, lowerBound = 30, upperBound = 60))) + + estimateByHistogram( + leftHistogram = histogram1, + rightHistogram = histogram2, + expectedMin = 30, + expectedMax = 60, + expectedNdv = 1 + 20 + 8, + expectedRows = 300 * 2 / 1 + 300 * 2 / 1 + 200 * 40 / 20 + 100 * 20 / 10) + } + + test("equi-height histograms: skew distribution (histograms have different skewed values") { + val histogram1 = Histogram(height = 300, Array( + HistogramBin(lo = 30, hi = 30, ndv = 1), HistogramBin(lo = 30, hi = 60, ndv = 30))) + val histogram2 = Histogram(height = 100, Array( + HistogramBin(lo = 0, hi = 50, ndv = 50), HistogramBin(lo = 50, hi = 50, ndv = 1))) + // test bin trimming + val (t0, h0) = trimBin(histogram1.bins(1), height = 300, lowerBound = 30, upperBound = 50) + assert(t0 == HistogramBin(lo = 30, hi = 50, ndv = 20) && h0 == 200) + val (t1, h1) = trimBin(histogram2.bins(0), height = 100, lowerBound = 30, upperBound = 50) + assert(t1 == HistogramBin(lo = 30, hi = 50, ndv = 20) && h1 == 40) + + val expectedRanges = Seq( + OverlappedRange(30, 30, 1, 1, 300, 40 / 20), + OverlappedRange(30, 50, 20, 20, 200, 40), + OverlappedRange(50, 50, 1, 1, 200 / 20, 100) + ) + assert(expectedRanges.equals( + getOverlappedRanges(histogram1, histogram2, lowerBound = 30, upperBound = 50))) + + estimateByHistogram( + leftHistogram = histogram1, + rightHistogram = histogram2, + expectedMin = 30, + expectedMax = 50, + expectedNdv = 1 + 20, + expectedRows = 300 * 2 / 1 + 200 * 40 / 20 + 10 * 100 / 1) + } + + test("equi-height histograms: skew distribution (both histograms have the same skewed value") { + val histogram1 = Histogram(height = 300, Array( + HistogramBin(lo = 30, hi = 30, ndv = 1), HistogramBin(lo = 30, hi = 60, ndv = 30))) + val histogram2 = Histogram(height = 150, Array( + HistogramBin(lo = 0, hi = 30, ndv = 30), HistogramBin(lo = 30, hi = 30, ndv = 1))) + // test bin trimming + val (t0, h0) = trimBin(histogram1.bins(1), height = 300, lowerBound = 30, upperBound = 30) + assert(t0 == HistogramBin(lo = 30, hi = 30, ndv = 1) && h0 == 10) + val (t1, h1) = trimBin(histogram2.bins(0), height = 150, lowerBound = 30, upperBound = 30) + assert(t1 == HistogramBin(lo = 30, hi = 30, ndv = 1) && h1 == 5) + + val expectedRanges = Seq( + OverlappedRange(30, 30, 1, 1, 300, 5), + OverlappedRange(30, 30, 1, 1, 300, 150), + OverlappedRange(30, 30, 1, 1, 10, 5), + OverlappedRange(30, 30, 1, 1, 10, 150) + ) + assert(expectedRanges.equals( + getOverlappedRanges(histogram1, histogram2, lowerBound = 30, upperBound = 30))) + + estimateByHistogram( + leftHistogram = histogram1, + rightHistogram = histogram2, + expectedMin = 30, + expectedMax = 30, + // only one value: 30 + expectedNdv = 1, + expectedRows = 300 * 5 / 1 + 300 * 150 / 1 + 10 * 5 / 1 + 10 * 150 / 1) + } + test("cross join") { // table1 (key-1-5 int, key-5-9 int): (1, 9), (2, 8), (3, 7), (4, 6), (5, 5) // table2 (key-1-2 int, key-2-4 int): (1, 2), 
(2, 3), (2, 4) From 28315714ddef3ddcc192375e98dd5207cf4ecc98 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 19 Dec 2017 22:12:23 +0800 Subject: [PATCH 144/356] [SPARK-22791][SQL][SS] Redact Output of Explain ## What changes were proposed in this pull request? When calling explain on a query, the output can contain sensitive information. We should provide an admin/user to redact such information. Before this PR, the plan of SS is like this ``` == Physical Plan == *HashAggregate(keys=[value#6], functions=[count(1)], output=[value#6, count(1)#12L]) +- StateStoreSave [value#6], state info [ checkpoint = file:/private/var/folders/vx/j0ydl5rn0gd9mgrh1pljnw900000gn/T/temporary-91c6fac0-609f-4bc8-ad57-52c189f06797/state, runId = 05a4b3af-f02c-40f8-9ff9-a3e18bae496f, opId = 0, ver = 0, numPartitions = 5], Complete, 0 +- *HashAggregate(keys=[value#6], functions=[merge_count(1)], output=[value#6, count#18L]) +- StateStoreRestore [value#6], state info [ checkpoint = file:/private/var/folders/vx/j0ydl5rn0gd9mgrh1pljnw900000gn/T/temporary-91c6fac0-609f-4bc8-ad57-52c189f06797/state, runId = 05a4b3af-f02c-40f8-9ff9-a3e18bae496f, opId = 0, ver = 0, numPartitions = 5] +- *HashAggregate(keys=[value#6], functions=[merge_count(1)], output=[value#6, count#18L]) +- Exchange hashpartitioning(value#6, 5) +- *HashAggregate(keys=[value#6], functions=[partial_count(1)], output=[value#6, count#18L]) +- *SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#6] +- *MapElements , obj#5: java.lang.String +- *DeserializeToObject value#30.toString, obj#4: java.lang.String +- LocalTableScan [value#30] ``` After this PR, we can get the following output if users set `spark.redaction.string.regex` to `file:/[\\w_]+` ``` == Physical Plan == *HashAggregate(keys=[value#6], functions=[count(1)], output=[value#6, count(1)#12L]) +- StateStoreSave [value#6], state info [ checkpoint = *********(redacted)/var/folders/vx/j0ydl5rn0gd9mgrh1pljnw900000gn/T/temporary-e7da9b7d-3ec0-474d-8b8c-927f7d12ed72/state, runId = 8a9c3761-93d5-4896-ab82-14c06240dcea, opId = 0, ver = 0, numPartitions = 5], Complete, 0 +- *HashAggregate(keys=[value#6], functions=[merge_count(1)], output=[value#6, count#32L]) +- StateStoreRestore [value#6], state info [ checkpoint = *********(redacted)/var/folders/vx/j0ydl5rn0gd9mgrh1pljnw900000gn/T/temporary-e7da9b7d-3ec0-474d-8b8c-927f7d12ed72/state, runId = 8a9c3761-93d5-4896-ab82-14c06240dcea, opId = 0, ver = 0, numPartitions = 5] +- *HashAggregate(keys=[value#6], functions=[merge_count(1)], output=[value#6, count#32L]) +- Exchange hashpartitioning(value#6, 5) +- *HashAggregate(keys=[value#6], functions=[partial_count(1)], output=[value#6, count#32L]) +- *SerializeFromObject [staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, input[0, java.lang.String, true], true, false) AS value#6] +- *MapElements , obj#5: java.lang.String +- *DeserializeToObject value#27.toString, obj#4: java.lang.String +- LocalTableScan [value#27] ``` ## How was this patch tested? Added a test case Author: gatorsmile Closes #19985 from gatorsmile/redactPlan. 
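A minimal sketch of how a user might exercise the new behaviour end to end (the scratch path and session setup below are assumptions for illustration, not taken from this patch):

```
// Sketch only: writes a small parquet file and prints a redacted plan.
import org.apache.spark.sql.SparkSession

object RedactedExplainSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("SPARK-22791-sketch")
      // Same regex as in the description above; redacts file:/ path prefixes.
      .config("spark.sql.redaction.string.regex", "file:/[\\w_]+")
      .getOrCreate()

    // Hypothetical scratch location, used only so the plan contains a file path.
    val path = "file:///tmp/spark22791_redaction_sketch"
    spark.range(0, 10).write.mode("overwrite").parquet(path)

    // Any fragment of the printed plan matching the regex is replaced by the
    // redaction placeholder before the plan string is emitted.
    spark.read.parquet(path).explain(true)

    spark.stop()
  }
}
```

With the regex above, the `file:/...` prefix in the printed FileScan location is replaced by `*********(redacted)`, matching the example output in the description.
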
--- .../scala/org/apache/spark/util/Utils.scala | 26 +++++++++++---- .../apache/spark/sql/internal/SQLConf.scala | 11 +++++++ .../sql/execution/DataSourceScanExec.scala | 2 +- .../spark/sql/execution/QueryExecution.scala | 13 ++++++-- .../DataSourceScanExecRedactionSuite.scala | 31 +++++++++++++++++ .../spark/sql/streaming/StreamSuite.scala | 33 ++++++++++++++++++- 6 files changed, 105 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 8871870eb8681..5853302973140 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2650,15 +2650,29 @@ private[spark] object Utils extends Logging { redact(redactionPattern, kvs) } + /** + * Redact the sensitive values in the given map. If a map key matches the redaction pattern then + * its value is replaced with a dummy text. + */ + def redact(regex: Option[Regex], kvs: Seq[(String, String)]): Seq[(String, String)] = { + regex match { + case None => kvs + case Some(r) => redact(r, kvs) + } + } + /** * Redact the sensitive information in the given string. */ - def redact(conf: SparkConf, text: String): String = { - if (text == null || text.isEmpty || conf == null || !conf.contains(STRING_REDACTION_PATTERN)) { - text - } else { - val regex = conf.get(STRING_REDACTION_PATTERN).get - regex.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT) + def redact(regex: Option[Regex], text: String): String = { + regex match { + case None => text + case Some(r) => + if (text == null || text.isEmpty) { + text + } else { + r.replaceAllIn(text, REDACTION_REPLACEMENT_TEXT) + } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index cf7e3ebce7411..bdc8d92e84079 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -23,6 +23,7 @@ import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ import scala.collection.immutable +import scala.util.matching.Regex import org.apache.hadoop.fs.Path @@ -1035,6 +1036,14 @@ object SQLConf { .booleanConf .createWithDefault(true) + val SQL_STRING_REDACTION_PATTERN = + ConfigBuilder("spark.sql.redaction.string.regex") + .doc("Regex to decide which parts of strings produced by Spark contain sensitive " + + "information. When this regex matches a string part, that string part is replaced by a " + + "dummy value. This is currently used to redact the output of SQL explain commands. " + + "When this conf is not set, the value from `spark.redaction.string.regex` is used.") + .fallbackConf(org.apache.spark.internal.config.STRING_REDACTION_PATTERN) + object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } @@ -1173,6 +1182,8 @@ class SQLConf extends Serializable with Logging { def escapedStringLiterals: Boolean = getConf(ESCAPED_STRING_LITERALS) + def stringRedationPattern: Option[Regex] = SQL_STRING_REDACTION_PATTERN.readFrom(reader) + /** * Returns the [[Resolver]] for the current configuration, which can be used to determine if two * identifiers are equal. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 747749bc72e66..27c7dc3ee4533 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -69,7 +69,7 @@ trait DataSourceScanExec extends LeafExecNode with CodegenSupport { * Shorthand for calling redactString() without specifying redacting rules */ private def redact(text: String): String = { - Utils.redact(SparkSession.getActiveSession.map(_.sparkContext.conf).orNull, text) + Utils.redact(sqlContext.sessionState.conf.stringRedationPattern, text) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 946475a1e9751..8bfe3eff0c3b3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -194,13 +194,13 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { } } - def simpleString: String = { + def simpleString: String = withRedaction { s"""== Physical Plan == |${stringOrError(executedPlan.treeString(verbose = false))} """.stripMargin.trim } - override def toString: String = { + override def toString: String = withRedaction { def output = Utils.truncatedString( analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}"), ", ") val analyzedPlan = Seq( @@ -219,7 +219,7 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { """.stripMargin.trim } - def stringWithStats: String = { + def stringWithStats: String = withRedaction { // trigger to compute stats for logical plans optimizedPlan.stats @@ -231,6 +231,13 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) { """.stripMargin.trim } + /** + * Redact the sensitive information in the given string. + */ + private def withRedaction(message: String): String = { + Utils.redact(sparkSession.sessionState.conf.stringRedationPattern, message) + } + /** A special namespace for commands that can be used to debug query execution. 
*/ // scalastyle:off object debug { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala index 423e1288e8dcb..c8d045a32d73c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/DataSourceScanExecRedactionSuite.scala @@ -20,6 +20,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.SparkConf import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext /** @@ -52,4 +53,34 @@ class DataSourceScanExecRedactionSuite extends QueryTest with SharedSQLContext { assert(df.queryExecution.simpleString.contains(replacement)) } } + + private def isIncluded(queryExecution: QueryExecution, msg: String): Boolean = { + queryExecution.toString.contains(msg) || + queryExecution.simpleString.contains(msg) || + queryExecution.stringWithStats.contains(msg) + } + + test("explain is redacted using SQLConf") { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) + val df = spark.read.parquet(basePath) + val replacement = "*********" + + // Respect SparkConf and replace file:/ + assert(isIncluded(df.queryExecution, replacement)) + + assert(isIncluded(df.queryExecution, "FileScan")) + assert(!isIncluded(df.queryExecution, "file:/")) + + withSQLConf(SQLConf.SQL_STRING_REDACTION_PATTERN.key -> "(?i)FileScan") { + // Respect SQLConf and replace FileScan + assert(isIncluded(df.queryExecution, replacement)) + + assert(!isIncluded(df.queryExecution, "FileScan")) + assert(isIncluded(df.queryExecution, "file:/")) + } + } + } + } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index 9e696b2236b68..fa4b2dd6a6c9b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -28,7 +28,7 @@ import com.google.common.util.concurrent.UncheckedExecutionException import org.apache.commons.io.FileUtils import org.apache.hadoop.conf.Configuration -import org.apache.spark.SparkContext +import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.plans.logical.Range @@ -418,6 +418,37 @@ class StreamSuite extends StreamTest { assert(OutputMode.Update === InternalOutputModes.Update) } + override protected def sparkConf: SparkConf = super.sparkConf + .set("spark.redaction.string.regex", "file:/[\\w_]+") + + test("explain - redaction") { + val replacement = "*********" + + val inputData = MemoryStream[String] + val df = inputData.toDS().map(_ + "foo").groupBy("value").agg(count("*")) + // Test StreamingQuery.display + val q = df.writeStream.queryName("memory_explain").outputMode("complete").format("memory") + .start() + .asInstanceOf[StreamingQueryWrapper] + .streamingQuery + try { + inputData.addData("abc") + q.processAllAvailable() + + val explainWithoutExtended = q.explainInternal(false) + assert(explainWithoutExtended.contains(replacement)) + assert(explainWithoutExtended.contains("StateStoreRestore")) + assert(!explainWithoutExtended.contains("file:/")) + + val 
explainWithExtended = q.explainInternal(true) + assert(explainWithExtended.contains(replacement)) + assert(explainWithExtended.contains("StateStoreRestore")) + assert(!explainWithoutExtended.contains("file:/")) + } finally { + q.stop() + } + } + test("explain") { val inputData = MemoryStream[String] val df = inputData.toDS().map(_ + "foo").groupBy("value").agg(count("*")) From b779c93518bd46850d6576bd34ea11f78fc4e01a Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 19 Dec 2017 22:17:04 +0800 Subject: [PATCH 145/356] [SPARK-22815][SQL] Keep PromotePrecision in Optimized Plans ## What changes were proposed in this pull request? We could get incorrect results by running DecimalPrecision twice. This PR resolves the original found in https://github.com/apache/spark/pull/15048 and https://github.com/apache/spark/pull/14797. After this PR, it becomes easier to change it back using `children` instead of using `innerChildren`. ## How was this patch tested? The existing test. Author: gatorsmile Closes #20000 from gatorsmile/keepPromotePrecision. --- .../spark/sql/catalyst/analysis/StreamingJoinHelper.scala | 4 +++- .../spark/sql/catalyst/expressions/decimalExpressions.scala | 2 ++ .../org/apache/spark/sql/catalyst/optimizer/expressions.scala | 1 - 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala index 072dc954879ca..7a0aa08289efa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/StreamingJoinHelper.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.analysis import scala.util.control.NonFatal import org.apache.spark.internal.Logging -import org.apache.spark.sql.catalyst.expressions.{Add, AttributeReference, AttributeSet, Cast, CheckOverflow, Expression, ExpressionSet, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Literal, Multiply, PreciseTimestampConversion, PredicateHelper, Subtract, TimeAdd, TimeSub, UnaryMinus} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys import org.apache.spark.sql.catalyst.plans.logical.{EventTimeWatermark, LogicalPlan} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark._ @@ -238,6 +238,8 @@ object StreamingJoinHelper extends PredicateHelper with Logging { collect(child, !negate) case CheckOverflow(child, _) => collect(child, negate) + case PromotePrecision(child) => + collect(child, negate) case Cast(child, dataType, _) => dataType match { case _: NumericType | _: TimestampType => collect(child, negate) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala index 752dea23e1f7a..db1579ba28671 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala @@ -70,10 +70,12 @@ case class MakeDecimal(child: Expression, precision: Int, scale: Int) extends Un case class PromotePrecision(child: Expression) extends UnaryExpression { override def dataType: DataType = child.dataType override def eval(input: 
InternalRow): Any = child.eval(input) + /** Just a simple pass-through for code generation. */ override def genCode(ctx: CodegenContext): ExprCode = child.genCode(ctx) override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = ev.copy("") override def prettyName: String = "promote_precision" override def sql: String = child.sql + override lazy val canonicalized: Expression = child.canonicalized } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 6305b6c84bae3..85295aff19808 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -614,7 +614,6 @@ object SimplifyCasts extends Rule[LogicalPlan] { object RemoveDispensableExpressions extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { case UnaryPositive(child) => child - case PromotePrecision(child) => child } } From ee56fc3432e7e328c29b88a255f7e2c2a4739754 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 20 Dec 2017 00:10:54 +0800 Subject: [PATCH 146/356] [SPARK-18016][SQL] Code Generation: Constant Pool Limit - reduce entries for mutable state ## What changes were proposed in this pull request? This PR is follow-on of #19518. This PR tries to reduce the number of constant pool entries used for accessing mutable state. There are two directions: 1. Primitive type variables should be allocated at the outer class due to better performance. Otherwise, this PR allocates an array. 2. The length of allocated array is up to 32768 due to avoiding usage of constant pool entry at access (e.g. `mutableStateArray[32767]`). Here are some discussions to determine these directions. 1. [[1]](https://github.com/apache/spark/pull/19518#issuecomment-346690464), [[2]](https://github.com/apache/spark/pull/19518#issuecomment-346690642), [[3]](https://github.com/apache/spark/pull/19518#issuecomment-346828180), [[4]](https://github.com/apache/spark/pull/19518#issuecomment-346831544), [[5]](https://github.com/apache/spark/pull/19518#issuecomment-346857340) 2. [[6]](https://github.com/apache/spark/pull/19518#issuecomment-346729172), [[7]](https://github.com/apache/spark/pull/19518#issuecomment-346798358), [[8]](https://github.com/apache/spark/pull/19518#issuecomment-346870408) This PR modifies `addMutableState` function in the `CodeGenerator` to check if the declared state can be easily initialized compacted into an array. We identify three types of states that cannot compacted: - Primitive type state (ints, booleans, etc) if the number of them does not exceed threshold - Multiple-dimensional array type - `inline = true` When `useFreshName = false`, the given name is used. Many codes were ported from #19518. Many efforts were put here. 
I think this PR should credit to bdrillard With this PR, the following code is generated: ``` /* 005 */ class SpecificMutableProjection extends org.apache.spark.sql.catalyst.expressions.codegen.BaseMutableProjection { /* 006 */ /* 007 */ private Object[] references; /* 008 */ private InternalRow mutableRow; /* 009 */ private boolean isNull_0; /* 010 */ private boolean isNull_1; /* 011 */ private boolean isNull_2; /* 012 */ private int value_2; /* 013 */ private boolean isNull_3; ... /* 10006 */ private int value_4999; /* 10007 */ private boolean isNull_5000; /* 10008 */ private int value_5000; /* 10009 */ private InternalRow[] mutableStateArray = new InternalRow[2]; /* 10010 */ private boolean[] mutableStateArray1 = new boolean[7001]; /* 10011 */ private int[] mutableStateArray2 = new int[1001]; /* 10012 */ private UTF8String[] mutableStateArray3 = new UTF8String[6000]; /* 10013 */ ... /* 107956 */ private void init_176() { /* 107957 */ isNull_4986 = true; /* 107958 */ value_4986 = -1; ... /* 108004 */ } ... ``` ## How was this patch tested? Added a new test case to `GeneratedProjectionSuite` Author: Kazuaki Ishizaki Closes #19811 from kiszk/SPARK-18016. --- .../sql/catalyst/expressions/Expression.scala | 3 +- .../MonotonicallyIncreasingID.scala | 6 +- .../expressions/SparkPartitionID.scala | 3 +- .../sql/catalyst/expressions/arithmetic.scala | 6 +- .../expressions/codegen/CodeGenerator.scala | 157 +++++++++++++++--- .../codegen/GenerateMutableProjection.scala | 40 ++--- .../codegen/GenerateUnsafeProjection.scala | 19 +-- .../expressions/conditionalExpressions.scala | 3 +- .../expressions/datetimeExpressions.scala | 37 ++--- .../sql/catalyst/expressions/generators.scala | 6 +- .../expressions/nullExpressions.scala | 3 +- .../expressions/objects/objects.scala | 60 ++++--- .../expressions/randomExpressions.scala | 6 +- .../expressions/regexpExpressions.scala | 34 ++-- .../expressions/stringExpressions.scala | 23 +-- .../ArithmeticExpressionSuite.scala | 4 +- .../sql/catalyst/expressions/CastSuite.scala | 2 +- .../expressions/CodeGenerationSuite.scala | 29 +++- .../expressions/ComplexTypeSuite.scala | 2 +- .../ConditionalExpressionSuite.scala | 2 +- .../expressions/NullExpressionsSuite.scala | 2 +- .../catalyst/expressions/PredicateSuite.scala | 4 +- .../expressions/RegexpExpressionsSuite.scala | 5 +- .../catalyst/expressions/ScalaUDFSuite.scala | 2 +- .../codegen/GeneratedProjectionSuite.scala | 27 +++ .../optimizer/complexTypesSuite.scala | 2 +- .../sql/execution/ColumnarBatchScan.scala | 26 ++- .../sql/execution/DataSourceScanExec.scala | 6 +- .../apache/spark/sql/execution/SortExec.scala | 18 +- .../sql/execution/WholeStageCodegenExec.scala | 5 +- .../aggregate/HashAggregateExec.scala | 44 +++-- .../aggregate/HashMapGenerator.scala | 6 +- .../execution/basicPhysicalOperators.scala | 71 ++++---- .../columnar/GenerateColumnAccessor.scala | 3 +- .../joins/BroadcastHashJoinExec.scala | 15 +- .../execution/joins/SortMergeJoinExec.scala | 21 ++- .../apache/spark/sql/execution/limit.scala | 6 +- 37 files changed, 404 insertions(+), 304 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 743782a6453e9..4568714933095 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -119,8 +119,7 @@ abstract class 
Expression extends TreeNode[Expression] { // TODO: support whole stage codegen too if (eval.code.trim.length > 1024 && ctx.INPUT_ROW != null && ctx.currentVars == null) { val setIsNull = if (eval.isNull != "false" && eval.isNull != "true") { - val globalIsNull = ctx.freshName("globalIsNull") - ctx.addMutableState(ctx.JAVA_BOOLEAN, globalIsNull) + val globalIsNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "globalIsNull") val localIsNull = eval.isNull eval.isNull = globalIsNull s"$globalIsNull = $localIsNull;" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala index 821d784a01342..784eaf8195194 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala @@ -65,10 +65,8 @@ case class MonotonicallyIncreasingID() extends LeafExpression with Nondeterminis } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val countTerm = ctx.freshName("count") - val partitionMaskTerm = ctx.freshName("partitionMask") - ctx.addMutableState(ctx.JAVA_LONG, countTerm) - ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm) + val countTerm = ctx.addMutableState(ctx.JAVA_LONG, "count") + val partitionMaskTerm = ctx.addMutableState(ctx.JAVA_LONG, "partitionMask") ctx.addPartitionInitializationStatement(s"$countTerm = 0L;") ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala index 4fa18d6b3209b..736ca37c6d54a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala @@ -43,8 +43,7 @@ case class SparkPartitionID() extends LeafExpression with Nondeterministic { override protected def evalInternal(input: InternalRow): Int = partitionId override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val idTerm = ctx.freshName("partitionId") - ctx.addMutableState(ctx.JAVA_INT, idTerm) + val idTerm = ctx.addMutableState(ctx.JAVA_INT, "partitionId") ctx.addPartitionInitializationStatement(s"$idTerm = partitionIndex;") ev.copy(code = s"final ${ctx.javaType(dataType)} ${ev.value} = $idTerm;", isNull = "false") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index 1893eec22b65d..d3a8cb5804717 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -602,8 +602,7 @@ case class Least(children: Seq[Expression]) extends Expression { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val evalChildren = children.map(_.genCode(ctx)) - val tmpIsNull = ctx.freshName("leastTmpIsNull") - ctx.addMutableState(ctx.JAVA_BOOLEAN, tmpIsNull) + val tmpIsNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "leastTmpIsNull") val evals = evalChildren.map(eval => s""" |${eval.code} @@ -683,8 +682,7 @@ case class 
Greatest(children: Seq[Expression]) extends Expression { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val evalChildren = children.map(_.genCode(ctx)) - val tmpIsNull = ctx.freshName("greatestTmpIsNull") - ctx.addMutableState(ctx.JAVA_BOOLEAN, tmpIsNull) + val tmpIsNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "greatestTmpIsNull") val evals = evalChildren.map(eval => s""" |${eval.code} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 3a03a65e1af92..41a920ba3d677 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -137,22 +137,63 @@ class CodegenContext { var currentVars: Seq[ExprCode] = null /** - * Holding expressions' mutable states like `MonotonicallyIncreasingID.count` as a - * 3-tuple: java type, variable name, code to init it. - * As an example, ("int", "count", "count = 0;") will produce code: + * Holding expressions' inlined mutable states like `MonotonicallyIncreasingID.count` as a + * 2-tuple: java type, variable name. + * As an example, ("int", "count") will produce code: * {{{ * private int count; * }}} - * as a member variable, and add - * {{{ - * count = 0; - * }}} - * to the constructor. + * as a member variable * * They will be kept as member variables in generated classes like `SpecificProjection`. */ - val mutableStates: mutable.ArrayBuffer[(String, String, String)] = - mutable.ArrayBuffer.empty[(String, String, String)] + val inlinedMutableStates: mutable.ArrayBuffer[(String, String)] = + mutable.ArrayBuffer.empty[(String, String)] + + /** + * The mapping between mutable state types and corrseponding compacted arrays. + * The keys are java type string. The values are [[MutableStateArrays]] which encapsulates + * the compacted arrays for the mutable states with the same java type. + */ + val arrayCompactedMutableStates: mutable.Map[String, MutableStateArrays] = + mutable.Map.empty[String, MutableStateArrays] + + // An array holds the code that will initialize each state + val mutableStateInitCode: mutable.ArrayBuffer[String] = + mutable.ArrayBuffer.empty[String] + + /** + * This class holds a set of names of mutableStateArrays that is used for compacting mutable + * states for a certain type, and holds the next available slot of the current compacted array. + */ + class MutableStateArrays { + val arrayNames = mutable.ListBuffer.empty[String] + createNewArray() + + private[this] var currentIndex = 0 + + private def createNewArray() = arrayNames.append(freshName("mutableStateArray")) + + def getCurrentIndex: Int = currentIndex + + /** + * Returns the reference of next available slot in current compacted array. The size of each + * compacted array is controlled by the config `CodeGenerator.MUTABLESTATEARRAY_SIZE_LIMIT`. + * Once reaching the threshold, new compacted array is created. + */ + def getNextSlot(): String = { + if (currentIndex < CodeGenerator.MUTABLESTATEARRAY_SIZE_LIMIT) { + val res = s"${arrayNames.last}[$currentIndex]" + currentIndex += 1 + res + } else { + createNewArray() + currentIndex = 1 + s"${arrayNames.last}[0]" + } + } + + } /** * Add a mutable state as a field to the generated class. c.f. the comments above. @@ -163,11 +204,52 @@ class CodegenContext { * the list of default imports available. 
* Also, generic type arguments are accepted but ignored. * @param variableName Name of the field. - * @param initCode The statement(s) to put into the init() method to initialize this field. + * @param initFunc Function includes statement(s) to put into the init() method to initialize + * this field. The argument is the name of the mutable state variable. * If left blank, the field will be default-initialized. + * @param forceInline whether the declaration and initialization code may be inlined rather than + * compacted. Please set `true` into forceInline for one of the followings: + * 1. use the original name of the status + * 2. expect to non-frequently generate the status + * (e.g. not much sort operators in one stage) + * @param useFreshName If this is false and the mutable state ends up inlining in the outer + * class, the name is not changed + * @return the name of the mutable state variable, which is the original name or fresh name if + * the variable is inlined to the outer class, or an array access if the variable is to + * be stored in an array of variables of the same type. + * A variable will be inlined into the outer class when one of the following conditions + * are satisfied: + * 1. forceInline is true + * 2. its type is primitive type and the total number of the inlined mutable variables + * is less than `CodeGenerator.OUTER_CLASS_VARIABLES_THRESHOLD` + * 3. its type is multi-dimensional array + * When a variable is compacted into an array, the max size of the array for compaction + * is given by `CodeGenerator.MUTABLESTATEARRAY_SIZE_LIMIT`. */ - def addMutableState(javaType: String, variableName: String, initCode: String = ""): Unit = { - mutableStates += ((javaType, variableName, initCode)) + def addMutableState( + javaType: String, + variableName: String, + initFunc: String => String = _ => "", + forceInline: Boolean = false, + useFreshName: Boolean = true): String = { + + // want to put a primitive type variable at outerClass for performance + val canInlinePrimitive = isPrimitiveType(javaType) && + (inlinedMutableStates.length < CodeGenerator.OUTER_CLASS_VARIABLES_THRESHOLD) + if (forceInline || canInlinePrimitive || javaType.contains("[][]")) { + val varName = if (useFreshName) freshName(variableName) else variableName + val initCode = initFunc(varName) + inlinedMutableStates += ((javaType, varName)) + mutableStateInitCode += initCode + varName + } else { + val arrays = arrayCompactedMutableStates.getOrElseUpdate(javaType, new MutableStateArrays) + val element = arrays.getNextSlot() + + val initCode = initFunc(element) + mutableStateInitCode += initCode + element + } } /** @@ -176,8 +258,7 @@ class CodegenContext { * data types like: UTF8String, ArrayData, MapData & InternalRow. */ def addBufferedState(dataType: DataType, variableName: String, initCode: String): ExprCode = { - val value = freshName(variableName) - addMutableState(javaType(dataType), value, "") + val value = addMutableState(javaType(dataType), variableName) val code = dataType match { case StringType => s"$value = $initCode.clone();" case _: StructType | _: ArrayType | _: MapType => s"$value = $initCode.copy();" @@ -189,15 +270,37 @@ class CodegenContext { def declareMutableStates(): String = { // It's possible that we add same mutable state twice, e.g. the `mergeExpressions` in // `TypedAggregateExpression`, we should call `distinct` here to remove the duplicated ones. 
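For reference, a minimal sketch of the intended call-site migration (the `AddMutableStateSketch` object and `declareCounter` method below are illustrative names, not part of this patch; only `CodegenContext.addMutableState` comes from the change above):
```
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext

// Hypothetical caller illustrating the new calling convention.
object AddMutableStateSketch {
  // Before this patch a caller typically wrote:
  //   val counter = ctx.freshName("counter")
  //   ctx.addMutableState(ctx.JAVA_INT, counter, s"$counter = 0;")
  // and kept using `counter` as a plain field name.
  //
  // Now the returned name must be used instead, because the state may be
  // compacted into an array and then reads as e.g. `mutableStateArray2[7]`.
  // The init function receives that resolved name, so the same init code
  // works whether the state is inlined or compacted.
  def declareCounter(ctx: CodegenContext): String =
    ctx.addMutableState(ctx.JAVA_INT, "counter", v => s"$v = 0;")
}
```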
- mutableStates.distinct.map { case (javaType, variableName, _) => + val inlinedStates = inlinedMutableStates.distinct.map { case (javaType, variableName) => s"private $javaType $variableName;" - }.mkString("\n") + } + + val arrayStates = arrayCompactedMutableStates.flatMap { case (javaType, mutableStateArrays) => + val numArrays = mutableStateArrays.arrayNames.size + mutableStateArrays.arrayNames.zipWithIndex.map { case (arrayName, index) => + val length = if (index + 1 == numArrays) { + mutableStateArrays.getCurrentIndex + } else { + CodeGenerator.MUTABLESTATEARRAY_SIZE_LIMIT + } + if (javaType.contains("[]")) { + // initializer had an one-dimensional array variable + val baseType = javaType.substring(0, javaType.length - 2) + s"private $javaType[] $arrayName = new $baseType[$length][];" + } else { + // initializer had a scalar variable + s"private $javaType[] $arrayName = new $javaType[$length];" + } + } + } + + (inlinedStates ++ arrayStates).mkString("\n") } def initMutableStates(): String = { // It's possible that we add same mutable state twice, e.g. the `mergeExpressions` in // `TypedAggregateExpression`, we should call `distinct` here to remove the duplicated ones. - val initCodes = mutableStates.distinct.map(_._3 + "\n") + val initCodes = mutableStateInitCode.distinct + // The generated initialization code may exceed 64kb function size limit in JVM if there are too // many mutable states, so split it into multiple functions. splitExpressions(expressions = initCodes, funcName = "init", arguments = Nil) @@ -1011,9 +1114,9 @@ class CodegenContext { val commonExprs = equivalentExpressions.getAllEquivalentExprs.filter(_.size > 1) commonExprs.foreach { e => val expr = e.head - val fnName = freshName("evalExpr") - val isNull = s"${fnName}IsNull" - val value = s"${fnName}Value" + val fnName = freshName("subExpr") + val isNull = addMutableState(JAVA_BOOLEAN, "subExprIsNull") + val value = addMutableState(javaType(expr.dataType), "subExprValue") // Generate the code for this expression tree and wrap it in a function. val eval = expr.genCode(this) @@ -1039,9 +1142,6 @@ class CodegenContext { // 2. Less code. // Currently, we will do this for all non-leaf only expression trees (i.e. expr trees with // at least two nodes) as the cost of doing it is expected to be low. - addMutableState(JAVA_BOOLEAN, isNull, s"$isNull = false;") - addMutableState(javaType(expr.dataType), value, - s"$value = ${defaultValue(expr.dataType)};") subexprFunctions += s"${addNewFunction(fnName, fn)}($INPUT_ROW);" val state = SubExprEliminationState(isNull, value) @@ -1165,6 +1265,15 @@ object CodeGenerator extends Logging { // class. val GENERATED_CLASS_SIZE_THRESHOLD = 1000000 + // This is the threshold for the number of global variables, whose types are primitive type or + // complex type (e.g. more than one-dimensional array), that will be placed at the outer class + val OUTER_CLASS_VARIABLES_THRESHOLD = 10000 + + // This is the maximum number of array elements to keep global variables in one Java array + // 32767 is the maximum integer value that does not require a constant pool entry in a Java + // bytecode instruction + val MUTABLESTATEARRAY_SIZE_LIMIT = 32768 + /** * Compile the Java source code into a Java class, using Janino. 
* diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala index bd8312eb8b7fe..b53c0087e7e2d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateMutableProjection.scala @@ -57,41 +57,37 @@ object GenerateMutableProjection extends CodeGenerator[Seq[Expression], MutableP case _ => true }.unzip val exprVals = ctx.generateExpressions(validExpr, useSubexprElimination) - val projectionCodes = exprVals.zip(index).map { + + // 4-tuples: (code for projection, isNull variable name, value variable name, column index) + val projectionCodes: Seq[(String, String, String, Int)] = exprVals.zip(index).map { case (ev, i) => val e = expressions(i) + val value = ctx.addMutableState(ctx.javaType(e.dataType), "value") if (e.nullable) { - val isNull = s"isNull_$i" - val value = s"value_$i" - ctx.addMutableState(ctx.JAVA_BOOLEAN, isNull, s"$isNull = true;") - ctx.addMutableState(ctx.javaType(e.dataType), value, - s"$value = ${ctx.defaultValue(e.dataType)};") - s""" - ${ev.code} - $isNull = ${ev.isNull}; - $value = ${ev.value}; - """ + val isNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "isNull") + (s""" + |${ev.code} + |$isNull = ${ev.isNull}; + |$value = ${ev.value}; + """.stripMargin, isNull, value, i) } else { - val value = s"value_$i" - ctx.addMutableState(ctx.javaType(e.dataType), value, - s"$value = ${ctx.defaultValue(e.dataType)};") - s""" - ${ev.code} - $value = ${ev.value}; - """ + (s""" + |${ev.code} + |$value = ${ev.value}; + """.stripMargin, ev.isNull, value, i) } } // Evaluate all the subexpressions. 
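To make the two thresholds concrete, a rough sketch in the spirit of the new `CodeGenerationSuite` test (the `CompactionSketch` object and `demo` method are illustrative names):
```
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenerator, CodegenContext}

object CompactionSketch {
  def demo(): Unit = {
    val ctx = new CodegenContext
    // Primitive states are declared as individual fields until
    // OUTER_CLASS_VARIABLES_THRESHOLD is reached; the overflow is packed into
    // shared int[] arrays of at most MUTABLESTATEARRAY_SIZE_LIMIT slots each.
    for (i <- 1 to CodeGenerator.OUTER_CLASS_VARIABLES_THRESHOLD + 10) {
      ctx.addMutableState(ctx.JAVA_INT, "i", v => s"$v = $i;")
    }
    // Non-primitive states skip inlining and go straight to a per-type array.
    for (_ <- 1 to 5) {
      ctx.addMutableState("InternalRow[]", "r")
    }
    // Emits both the inlined fields and the backing arrays.
    println(ctx.declareMutableStates())
    // Emits the collected init statements, split into init() functions.
    println(ctx.initMutableStates())
  }
}
```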
val evalSubexpr = ctx.subexprFunctions.mkString("\n") - val updates = validExpr.zip(index).map { - case (e, i) => - val ev = ExprCode("", s"isNull_$i", s"value_$i") + val updates = validExpr.zip(projectionCodes).map { + case (e, (_, isNull, value, i)) => + val ev = ExprCode("", isNull, value) ctx.updateColumn("mutableRow", e.dataType, i, ev, e.nullable) } - val allProjections = ctx.splitExpressionsWithCurrentInputs(projectionCodes) + val allProjections = ctx.splitExpressionsWithCurrentInputs(projectionCodes.map(_._1)) val allUpdates = ctx.splitExpressionsWithCurrentInputs(updates) val codeBody = s""" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index b022457865d50..36ffa8dcdd2b6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -73,9 +73,8 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro bufferHolder: String, isTopLevel: Boolean = false): String = { val rowWriterClass = classOf[UnsafeRowWriter].getName - val rowWriter = ctx.freshName("rowWriter") - ctx.addMutableState(rowWriterClass, rowWriter, - s"$rowWriter = new $rowWriterClass($bufferHolder, ${inputs.length});") + val rowWriter = ctx.addMutableState(rowWriterClass, "rowWriter", + v => s"$v = new $rowWriterClass($bufferHolder, ${inputs.length});") val resetWriter = if (isTopLevel) { // For top level row writer, it always writes to the beginning of the global buffer holder, @@ -186,9 +185,8 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro // Puts `input` in a local variable to avoid to re-evaluate it if it's a statement. 
val tmpInput = ctx.freshName("tmpInput") val arrayWriterClass = classOf[UnsafeArrayWriter].getName - val arrayWriter = ctx.freshName("arrayWriter") - ctx.addMutableState(arrayWriterClass, arrayWriter, - s"$arrayWriter = new $arrayWriterClass();") + val arrayWriter = ctx.addMutableState(arrayWriterClass, "arrayWriter", + v => s"$v = new $arrayWriterClass();") val numElements = ctx.freshName("numElements") val index = ctx.freshName("index") @@ -318,13 +316,12 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro case _ => true } - val result = ctx.freshName("result") - ctx.addMutableState("UnsafeRow", result, s"$result = new UnsafeRow(${expressions.length});") + val result = ctx.addMutableState("UnsafeRow", "result", + v => s"$v = new UnsafeRow(${expressions.length});") - val holder = ctx.freshName("holder") val holderClass = classOf[BufferHolder].getName - ctx.addMutableState(holderClass, holder, - s"$holder = new $holderClass($result, ${numVarLenFields * 32});") + val holder = ctx.addMutableState(holderClass, "holder", + v => s"$v = new $holderClass($result, ${numVarLenFields * 32});") val resetBufferHolder = if (numVarLenFields == 0) { "" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala index 53c3b226895ec..1a9b68222a7f4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala @@ -190,8 +190,7 @@ case class CaseWhen( // It is initialized to `NOT_MATCHED`, and if it's set to `HAS_NULL` or `HAS_NONNULL`, // We won't go on anymore on the computation. 
val resultState = ctx.freshName("caseWhenResultState") - val tmpResult = ctx.freshName("caseWhenTmpResult") - ctx.addMutableState(ctx.javaType(dataType), tmpResult) + val tmpResult = ctx.addMutableState(ctx.javaType(dataType), "caseWhenTmpResult") // these blocks are meant to be inside a // do { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 44d54a20844a3..cfec7f82951a7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -442,9 +442,9 @@ case class DayOfWeek(child: Expression) extends UnaryExpression with ImplicitCas override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { nullSafeCodeGen(ctx, ev, time => { val cal = classOf[Calendar].getName - val c = ctx.freshName("cal") val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") - ctx.addMutableState(cal, c, s"""$c = $cal.getInstance($dtu.getTimeZone("UTC"));""") + val c = ctx.addMutableState(cal, "cal", + v => s"""$v = $cal.getInstance($dtu.getTimeZone("UTC"));""") s""" $c.setTimeInMillis($time * 1000L * 3600L * 24L); ${ev.value} = $c.get($cal.DAY_OF_WEEK); @@ -484,18 +484,17 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { nullSafeCodeGen(ctx, ev, time => { val cal = classOf[Calendar].getName - val c = ctx.freshName("cal") val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") - ctx.addMutableState(cal, c, + val c = ctx.addMutableState(cal, "cal", v => s""" - $c = $cal.getInstance($dtu.getTimeZone("UTC")); - $c.setFirstDayOfWeek($cal.MONDAY); - $c.setMinimalDaysInFirstWeek(4); - """) + |$v = $cal.getInstance($dtu.getTimeZone("UTC")); + |$v.setFirstDayOfWeek($cal.MONDAY); + |$v.setMinimalDaysInFirstWeek(4); + """.stripMargin) s""" - $c.setTimeInMillis($time * 1000L * 3600L * 24L); - ${ev.value} = $c.get($cal.WEEK_OF_YEAR); - """ + |$c.setTimeInMillis($time * 1000L * 3600L * 24L); + |${ev.value} = $c.get($cal.WEEK_OF_YEAR); + """.stripMargin }) } } @@ -1014,12 +1013,12 @@ case class FromUTCTimestamp(left: Expression, right: Expression) |long ${ev.value} = 0; """.stripMargin) } else { - val tzTerm = ctx.freshName("tz") - val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") - ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $dtu.getTimeZone("$tz");""") - ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $dtu.getTimeZone("UTC");""") + val tzTerm = ctx.addMutableState(tzClass, "tz", + v => s"""$v = $dtu.getTimeZone("$tz");""") + val utcTerm = ctx.addMutableState(tzClass, "utc", + v => s"""$v = $dtu.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" |${eval.code} @@ -1190,12 +1189,12 @@ case class ToUTCTimestamp(left: Expression, right: Expression) |long ${ev.value} = 0; """.stripMargin) } else { - val tzTerm = ctx.freshName("tz") - val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") - ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $dtu.getTimeZone("$tz");""") - ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $dtu.getTimeZone("UTC");""") + val tzTerm = ctx.addMutableState(tzClass, "tz", + v => 
s"""$v = $dtu.getTimeZone("$tz");""") + val utcTerm = ctx.addMutableState(tzClass, "utc", + v => s"""$v = $dtu.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" |${eval.code} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala index cd38783a731ad..1cd73a92a8635 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala @@ -199,8 +199,8 @@ case class Stack(children: Seq[Expression]) extends Generator { override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // Rows - we write these into an array. - val rowData = ctx.freshName("rows") - ctx.addMutableState("InternalRow[]", rowData, s"$rowData = new InternalRow[$numRows];") + val rowData = ctx.addMutableState("InternalRow[]", "rows", + v => s"$v = new InternalRow[$numRows];") val values = children.tail val dataTypes = values.take(numFields).map(_.dataType) val code = ctx.splitExpressionsWithCurrentInputs(Seq.tabulate(numRows) { row => @@ -217,7 +217,7 @@ case class Stack(children: Seq[Expression]) extends Generator { ctx.addMutableState( s"$wrapperClass", ev.value, - s"${ev.value} = $wrapperClass$$.MODULE$$.make($rowData);") + v => s"$v = $wrapperClass$$.MODULE$$.make($rowData);", useFreshName = false) ev.copy(code = code, isNull = "false") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala index 294cdcb2e9546..b4f895fffda38 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala @@ -72,8 +72,7 @@ case class Coalesce(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tmpIsNull = ctx.freshName("coalesceTmpIsNull") - ctx.addMutableState(ctx.JAVA_BOOLEAN, tmpIsNull) + val tmpIsNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "coalesceTmpIsNull") // all the evals are meant to be in a do { ... 
} while (false); loop val evals = children.map { e => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 4bd395eadcf19..a59aad5be8715 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -62,15 +62,13 @@ trait InvokeLike extends Expression with NonSQLExpression { def prepareArguments(ctx: CodegenContext): (String, String, String) = { val resultIsNull = if (needNullCheck) { - val resultIsNull = ctx.freshName("resultIsNull") - ctx.addMutableState(ctx.JAVA_BOOLEAN, resultIsNull) + val resultIsNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "resultIsNull") resultIsNull } else { "false" } val argValues = arguments.map { e => - val argValue = ctx.freshName("argValue") - ctx.addMutableState(ctx.javaType(e.dataType), argValue) + val argValue = ctx.addMutableState(ctx.javaType(e.dataType), "argValue") argValue } @@ -548,7 +546,7 @@ case class MapObjects private( override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val elementJavaType = ctx.javaType(loopVarDataType) - ctx.addMutableState(elementJavaType, loopValue) + ctx.addMutableState(elementJavaType, loopValue, forceInline = true, useFreshName = false) val genInputData = inputData.genCode(ctx) val genFunction = lambdaFunction.genCode(ctx) val dataLength = ctx.freshName("dataLength") @@ -644,7 +642,7 @@ case class MapObjects private( } val loopNullCheck = if (loopIsNull != "false") { - ctx.addMutableState(ctx.JAVA_BOOLEAN, loopIsNull) + ctx.addMutableState(ctx.JAVA_BOOLEAN, loopIsNull, forceInline = true, useFreshName = false) inputDataType match { case _: ArrayType => s"$loopIsNull = ${genInputData.value}.isNullAt($loopIndex);" case _ => s"$loopIsNull = $loopValue == null;" @@ -808,10 +806,11 @@ case class CatalystToExternalMap private( val mapType = inputDataType(inputData.dataType).asInstanceOf[MapType] val keyElementJavaType = ctx.javaType(mapType.keyType) - ctx.addMutableState(keyElementJavaType, keyLoopValue) + ctx.addMutableState(keyElementJavaType, keyLoopValue, forceInline = true, useFreshName = false) val genKeyFunction = keyLambdaFunction.genCode(ctx) val valueElementJavaType = ctx.javaType(mapType.valueType) - ctx.addMutableState(valueElementJavaType, valueLoopValue) + ctx.addMutableState(valueElementJavaType, valueLoopValue, forceInline = true, + useFreshName = false) val genValueFunction = valueLambdaFunction.genCode(ctx) val genInputData = inputData.genCode(ctx) val dataLength = ctx.freshName("dataLength") @@ -844,7 +843,8 @@ case class CatalystToExternalMap private( val genValueFunctionValue = genFunctionValue(valueLambdaFunction, genValueFunction) val valueLoopNullCheck = if (valueLoopIsNull != "false") { - ctx.addMutableState(ctx.JAVA_BOOLEAN, valueLoopIsNull) + ctx.addMutableState(ctx.JAVA_BOOLEAN, valueLoopIsNull, forceInline = true, + useFreshName = false) s"$valueLoopIsNull = $valueArray.isNullAt($loopIndex);" } else { "" @@ -994,8 +994,8 @@ case class ExternalMapToCatalyst private( val keyElementJavaType = ctx.javaType(keyType) val valueElementJavaType = ctx.javaType(valueType) - ctx.addMutableState(keyElementJavaType, key) - ctx.addMutableState(valueElementJavaType, value) + ctx.addMutableState(keyElementJavaType, key, forceInline = true, useFreshName = false) + ctx.addMutableState(valueElementJavaType, 
value, forceInline = true, useFreshName = false) val (defineEntries, defineKeyValue) = child.dataType match { case ObjectType(cls) if classOf[java.util.Map[_, _]].isAssignableFrom(cls) => @@ -1031,14 +1031,14 @@ case class ExternalMapToCatalyst private( } val keyNullCheck = if (keyIsNull != "false") { - ctx.addMutableState(ctx.JAVA_BOOLEAN, keyIsNull) + ctx.addMutableState(ctx.JAVA_BOOLEAN, keyIsNull, forceInline = true, useFreshName = false) s"$keyIsNull = $key == null;" } else { "" } val valueNullCheck = if (valueIsNull != "false") { - ctx.addMutableState(ctx.JAVA_BOOLEAN, valueIsNull) + ctx.addMutableState(ctx.JAVA_BOOLEAN, valueIsNull, forceInline = true, useFreshName = false) s"$valueIsNull = $value == null;" } else { "" @@ -1148,7 +1148,6 @@ case class EncodeUsingSerializer(child: Expression, kryo: Boolean) override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // Code to initialize the serializer. - val serializer = ctx.freshName("serializer") val (serializerClass, serializerInstanceClass) = { if (kryo) { (classOf[KryoSerializer].getName, classOf[KryoSerializerInstance].getName) @@ -1159,14 +1158,14 @@ case class EncodeUsingSerializer(child: Expression, kryo: Boolean) // try conf from env, otherwise create a new one val env = s"${classOf[SparkEnv].getName}.get()" val sparkConf = s"new ${classOf[SparkConf].getName}()" - val serializerInit = s""" - if ($env == null) { - $serializer = ($serializerInstanceClass) new $serializerClass($sparkConf).newInstance(); - } else { - $serializer = ($serializerInstanceClass) new $serializerClass($env.conf()).newInstance(); - } - """ - ctx.addMutableState(serializerInstanceClass, serializer, serializerInit) + val serializer = ctx.addMutableState(serializerInstanceClass, "serializerForEncode", v => + s""" + |if ($env == null) { + | $v = ($serializerInstanceClass) new $serializerClass($sparkConf).newInstance(); + |} else { + | $v = ($serializerInstanceClass) new $serializerClass($env.conf()).newInstance(); + |} + """.stripMargin) // Code to serialize. val input = child.genCode(ctx) @@ -1194,7 +1193,6 @@ case class DecodeUsingSerializer[T](child: Expression, tag: ClassTag[T], kryo: B override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // Code to initialize the serializer. - val serializer = ctx.freshName("serializer") val (serializerClass, serializerInstanceClass) = { if (kryo) { (classOf[KryoSerializer].getName, classOf[KryoSerializerInstance].getName) @@ -1205,14 +1203,14 @@ case class DecodeUsingSerializer[T](child: Expression, tag: ClassTag[T], kryo: B // try conf from env, otherwise create a new one val env = s"${classOf[SparkEnv].getName}.get()" val sparkConf = s"new ${classOf[SparkConf].getName}()" - val serializerInit = s""" - if ($env == null) { - $serializer = ($serializerInstanceClass) new $serializerClass($sparkConf).newInstance(); - } else { - $serializer = ($serializerInstanceClass) new $serializerClass($env.conf()).newInstance(); - } - """ - ctx.addMutableState(serializerInstanceClass, serializer, serializerInit) + val serializer = ctx.addMutableState(serializerInstanceClass, "serializerForDecode", v => + s""" + |if ($env == null) { + | $v = ($serializerInstanceClass) new $serializerClass($sparkConf).newInstance(); + |} else { + | $v = ($serializerInstanceClass) new $serializerClass($env.conf()).newInstance(); + |} + """.stripMargin) // Code to deserialize. 
val input = child.genCode(ctx) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala index b4aefe6cff73e..8bc936fcbfc31 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/randomExpressions.scala @@ -77,9 +77,8 @@ case class Rand(child: Expression) extends RDG { override protected def evalInternal(input: InternalRow): Double = rng.nextDouble() override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val rngTerm = ctx.freshName("rng") val className = classOf[XORShiftRandom].getName - ctx.addMutableState(className, rngTerm) + val rngTerm = ctx.addMutableState(className, "rng") ctx.addPartitionInitializationStatement( s"$rngTerm = new $className(${seed}L + partitionIndex);") ev.copy(code = s""" @@ -112,9 +111,8 @@ case class Randn(child: Expression) extends RDG { override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian() override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val rngTerm = ctx.freshName("rng") val className = classOf[XORShiftRandom].getName - ctx.addMutableState(className, rngTerm) + val rngTerm = ctx.addMutableState(className, "rng") ctx.addPartitionInitializationStatement( s"$rngTerm = new $className(${seed}L + partitionIndex);") ev.copy(code = s""" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 53d7096dd87d3..fa5425c77ebba 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -112,15 +112,15 @@ case class Like(left: Expression, right: Expression) extends StringRegexExpressi override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val patternClass = classOf[Pattern].getName val escapeFunc = StringUtils.getClass.getName.stripSuffix("$") + ".escapeLikeRegex" - val pattern = ctx.freshName("pattern") if (right.foldable) { val rVal = right.eval() if (rVal != null) { val regexStr = StringEscapeUtils.escapeJava(escape(rVal.asInstanceOf[UTF8String].toString())) - ctx.addMutableState(patternClass, pattern, - s"""$pattern = ${patternClass}.compile("$regexStr");""") + // inline mutable state since not many Like operations in a task + val pattern = ctx.addMutableState(patternClass, "patternLike", + v => s"""$v = ${patternClass}.compile("$regexStr");""", forceInline = true) // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. 
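A small sketch of when the `forceInline` and `useFreshName` flags are meant to be used (the `InlineFlagsSketch` object and its methods are illustrative names; the real Like/RLike code additionally escapes the regex string):
```
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext

// Hypothetical callers illustrating the two escape hatches.
object InlineFlagsSketch {
  def compiledPattern(ctx: CodegenContext, regexStr: String): String = {
    // forceInline = true: a Like/RLike pattern is compiled once per expression
    // instance, so keeping it as its own field costs little and avoids an
    // array indirection on every row.
    ctx.addMutableState(classOf[java.util.regex.Pattern].getName, "patternSketch",
      v => s"""$v = java.util.regex.Pattern.compile("$regexStr");""",
      forceInline = true)
  }

  def loopVariable(ctx: CodegenContext, loopValue: String, javaType: String): String = {
    // useFreshName = false (together with forceInline): the caller already
    // references `loopValue` from other generated code, so the field must keep
    // exactly that name rather than a freshened one.
    ctx.addMutableState(javaType, loopValue, forceInline = true, useFreshName = false)
  }
}
```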
val eval = left.genCode(ctx) @@ -139,6 +139,7 @@ case class Like(left: Expression, right: Expression) extends StringRegexExpressi """) } } else { + val pattern = ctx.freshName("pattern") val rightStr = ctx.freshName("rightStr") nullSafeCodeGen(ctx, ev, (eval1, eval2) => { s""" @@ -187,15 +188,15 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val patternClass = classOf[Pattern].getName - val pattern = ctx.freshName("pattern") if (right.foldable) { val rVal = right.eval() if (rVal != null) { val regexStr = StringEscapeUtils.escapeJava(rVal.asInstanceOf[UTF8String].toString()) - ctx.addMutableState(patternClass, pattern, - s"""$pattern = ${patternClass}.compile("$regexStr");""") + // inline mutable state since not many RLike operations in a task + val pattern = ctx.addMutableState(patternClass, "patternRLike", + v => s"""$v = ${patternClass}.compile("$regexStr");""", forceInline = true) // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. val eval = left.genCode(ctx) @@ -215,6 +216,7 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress } } else { val rightStr = ctx.freshName("rightStr") + val pattern = ctx.freshName("pattern") nullSafeCodeGen(ctx, ev, (eval1, eval2) => { s""" String $rightStr = ${eval2}.toString(); @@ -316,11 +318,6 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio override def prettyName: String = "regexp_replace" override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val termLastRegex = ctx.freshName("lastRegex") - val termPattern = ctx.freshName("pattern") - - val termLastReplacement = ctx.freshName("lastReplacement") - val termLastReplacementInUTF8 = ctx.freshName("lastReplacementInUTF8") val termResult = ctx.freshName("termResult") val classNamePattern = classOf[Pattern].getCanonicalName @@ -328,11 +325,10 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio val matcher = ctx.freshName("matcher") - ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;") - ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;") - ctx.addMutableState("String", termLastReplacement, s"${termLastReplacement} = null;") - ctx.addMutableState("UTF8String", - termLastReplacementInUTF8, s"${termLastReplacementInUTF8} = null;") + val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex") + val termPattern = ctx.addMutableState(classNamePattern, "pattern") + val termLastReplacement = ctx.addMutableState("String", "lastReplacement") + val termLastReplacementInUTF8 = ctx.addMutableState("UTF8String", "lastReplacementInUTF8") val setEvNotNull = if (nullable) { s"${ev.isNull} = false;" @@ -414,14 +410,12 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio override def prettyName: String = "regexp_extract" override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val termLastRegex = ctx.freshName("lastRegex") - val termPattern = ctx.freshName("pattern") val classNamePattern = classOf[Pattern].getCanonicalName val matcher = ctx.freshName("matcher") val matchResult = ctx.freshName("matchResult") - ctx.addMutableState("UTF8String", termLastRegex, s"${termLastRegex} = null;") - ctx.addMutableState(classNamePattern, termPattern, s"${termPattern} = null;") + val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex") + val 
termPattern = ctx.addMutableState(classNamePattern, "pattern") val setEvNotNull = if (nullable) { s"${ev.isNull} = false;" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 8c4d2fd686be5..c02c41db1668e 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -291,8 +291,7 @@ case class Elt(children: Seq[Expression]) val indexVal = ctx.freshName("index") val indexMatched = ctx.freshName("eltIndexMatched") - val stringVal = ctx.freshName("stringVal") - ctx.addMutableState(ctx.javaType(dataType), stringVal) + val stringVal = ctx.addMutableState(ctx.javaType(dataType), "stringVal") val assignStringValue = strings.zipWithIndex.map { case (eval, index) => s""" @@ -532,14 +531,11 @@ case class StringTranslate(srcExpr: Expression, matchingExpr: Expression, replac } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val termLastMatching = ctx.freshName("lastMatching") - val termLastReplace = ctx.freshName("lastReplace") - val termDict = ctx.freshName("dict") val classNameDict = classOf[JMap[Character, Character]].getCanonicalName - ctx.addMutableState("UTF8String", termLastMatching, s"$termLastMatching = null;") - ctx.addMutableState("UTF8String", termLastReplace, s"$termLastReplace = null;") - ctx.addMutableState(classNameDict, termDict, s"$termDict = null;") + val termLastMatching = ctx.addMutableState("UTF8String", "lastMatching") + val termLastReplace = ctx.addMutableState("UTF8String", "lastReplace") + val termDict = ctx.addMutableState(classNameDict, "dict") nullSafeCodeGen(ctx, ev, (src, matching, replace) => { val check = if (matchingExpr.foldable && replaceExpr.foldable) { @@ -2065,15 +2061,12 @@ case class FormatNumber(x: Expression, d: Expression) // SPARK-13515: US Locale configures the DecimalFormat object to use a dot ('.') // as a decimal separator. 
val usLocale = "US" - val lastDValue = ctx.freshName("lastDValue") - val pattern = ctx.freshName("pattern") - val numberFormat = ctx.freshName("numberFormat") val i = ctx.freshName("i") val dFormat = ctx.freshName("dFormat") - ctx.addMutableState(ctx.JAVA_INT, lastDValue, s"$lastDValue = -100;") - ctx.addMutableState(sb, pattern, s"$pattern = new $sb();") - ctx.addMutableState(df, numberFormat, - s"""$numberFormat = new $df("", new $dfs($l.$usLocale));""") + val lastDValue = ctx.addMutableState(ctx.JAVA_INT, "lastDValue", v => s"$v = -100;") + val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") + val numberFormat = ctx.addMutableState(df, "numberFormat", + v => s"""$v = new $df("", new $dfs($l.$usLocale));""") s""" if ($d >= 0) { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala index be638d80e45d8..6edb4348f8309 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ArithmeticExpressionSuite.scala @@ -348,10 +348,10 @@ class ArithmeticExpressionSuite extends SparkFunSuite with ExpressionEvalHelper test("SPARK-22704: Least and greatest use less global variables") { val ctx1 = new CodegenContext() Least(Seq(Literal(1), Literal(1))).genCode(ctx1) - assert(ctx1.mutableStates.size == 1) + assert(ctx1.inlinedMutableStates.size == 1) val ctx2 = new CodegenContext() Greatest(Seq(Literal(1), Literal(1))).genCode(ctx2) - assert(ctx2.mutableStates.size == 1) + assert(ctx2.inlinedMutableStates.size == 1) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 65617be05a434..1dd040e4696a1 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -851,6 +851,6 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { val ctx = new CodegenContext cast("1", IntegerType).genCode(ctx) cast("2", LongType).genCode(ctx) - assert(ctx.mutableStates.length == 0) + assert(ctx.inlinedMutableStates.length == 0) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index a969811019161..b1a44528e64d7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -385,20 +385,43 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { val ctx = new CodegenContext val schema = new StructType().add("a", IntegerType).add("b", StringType) CreateExternalRow(Seq(Literal(1), Literal("x")), schema).genCode(ctx) - assert(ctx.mutableStates.isEmpty) + assert(ctx.inlinedMutableStates.isEmpty) } test("SPARK-22696: InitializeJavaBean should not use global variables") { val ctx = new CodegenContext InitializeJavaBean(Literal.fromObject(new java.util.LinkedList[Int]), Map("add" -> Literal(1))).genCode(ctx) - assert(ctx.mutableStates.isEmpty) + assert(ctx.inlinedMutableStates.isEmpty) } test("SPARK-22716: 
addReferenceObj should not add mutable states") { val ctx = new CodegenContext val foo = new Object() ctx.addReferenceObj("foo", foo) - assert(ctx.mutableStates.isEmpty) + assert(ctx.inlinedMutableStates.isEmpty) + } + + test("SPARK-18016: define mutable states by using an array") { + val ctx1 = new CodegenContext + for (i <- 1 to CodeGenerator.OUTER_CLASS_VARIABLES_THRESHOLD + 10) { + ctx1.addMutableState(ctx1.JAVA_INT, "i", v => s"$v = $i;") + } + assert(ctx1.inlinedMutableStates.size == CodeGenerator.OUTER_CLASS_VARIABLES_THRESHOLD) + // When the number of primitive type mutable states is over the threshold, others are + // allocated into an array + assert(ctx1.arrayCompactedMutableStates.get(ctx1.JAVA_INT).get.arrayNames.size == 1) + assert(ctx1.mutableStateInitCode.size == CodeGenerator.OUTER_CLASS_VARIABLES_THRESHOLD + 10) + + val ctx2 = new CodegenContext + for (i <- 1 to CodeGenerator.MUTABLESTATEARRAY_SIZE_LIMIT + 10) { + ctx2.addMutableState("InternalRow[]", "r", v => s"$v = new InternalRow[$i];") + } + // When the number of non-primitive type mutable states is over the threshold, others are + // allocated into a new array + assert(ctx2.inlinedMutableStates.isEmpty) + assert(ctx2.arrayCompactedMutableStates.get("InternalRow[]").get.arrayNames.size == 2) + assert(ctx2.arrayCompactedMutableStates("InternalRow[]").getCurrentIndex == 10) + assert(ctx2.mutableStateInitCode.size == CodeGenerator.MUTABLESTATEARRAY_SIZE_LIMIT + 10) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala index 6dfca7d73a3df..84190f0bd5f7d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ComplexTypeSuite.scala @@ -304,6 +304,6 @@ class ComplexTypeSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-22693: CreateNamedStruct should not use global variables") { val ctx = new CodegenContext CreateNamedStruct(Seq("a", "x", "b", 2.0)).genCode(ctx) - assert(ctx.mutableStates.isEmpty) + assert(ctx.inlinedMutableStates.isEmpty) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala index 60d84aae1fa3f..a099119732e25 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ConditionalExpressionSuite.scala @@ -150,6 +150,6 @@ class ConditionalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper test("SPARK-22705: case when should use less global variables") { val ctx = new CodegenContext() CaseWhen(Seq((Literal.create(false, BooleanType), Literal(1))), Literal(-1)).genCode(ctx) - assert(ctx.mutableStates.size == 1) + assert(ctx.inlinedMutableStates.size == 1) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala index a23cd95632770..cc6c15cb2c909 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala @@ 
-159,7 +159,7 @@ class NullExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-22705: Coalesce should use less global variables") { val ctx = new CodegenContext() Coalesce(Seq(Literal("a"), Literal("b"))).genCode(ctx) - assert(ctx.mutableStates.size == 1) + assert(ctx.inlinedMutableStates.size == 1) } test("AtLeastNNonNulls should not throw 64kb exception") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala index 15cb0bea08f17..8a8f8e10225fa 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala @@ -249,7 +249,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-22705: In should use less global variables") { val ctx = new CodegenContext() In(Literal(1.0D), Seq(Literal(1.0D), Literal(2.0D))).genCode(ctx) - assert(ctx.mutableStates.isEmpty) + assert(ctx.inlinedMutableStates.isEmpty) } test("INSET") { @@ -440,6 +440,6 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-22693: InSet should not use global variables") { val ctx = new CodegenContext InSet(Literal(1), Set(1, 2, 3, 4)).genCode(ctx) - assert(ctx.mutableStates.isEmpty) + assert(ctx.inlinedMutableStates.isEmpty) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 4fa61fbaf66c2..2a0a42c65b086 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -183,8 +183,9 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val ctx = new CodegenContext RegExpReplace(Literal("100"), Literal("(\\d+)"), Literal("num")).genCode(ctx) // four global variables (lastRegex, pattern, lastReplacement, and lastReplacementInUTF8) - // are always required - assert(ctx.mutableStates.length == 4) + // are always required, which are allocated in type-based global array + assert(ctx.inlinedMutableStates.length == 0) + assert(ctx.mutableStateInitCode.length == 4) } test("RegexExtract") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala index 70dea4b39d55d..10e3ffd0dff97 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala @@ -51,6 +51,6 @@ class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-22695: ScalaUDF should not use global variables") { val ctx = new CodegenContext ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil).genCode(ctx) - assert(ctx.mutableStates.isEmpty) + assert(ctx.inlinedMutableStates.isEmpty) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala index 6031bdf19e957..2c45b3b0c73d1 100644 --- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GeneratedProjectionSuite.scala @@ -219,4 +219,31 @@ class GeneratedProjectionSuite extends SparkFunSuite { // - one is the mutableRow assert(globalVariables.length == 3) } + + test("SPARK-18016: generated projections on wider table requiring state compaction") { + val N = 6000 + val wideRow1 = new GenericInternalRow(new Array[Any](N)) + val schema1 = StructType((1 to N).map(i => StructField("", IntegerType))) + val wideRow2 = new GenericInternalRow( + Array.tabulate[Any](N)(i => UTF8String.fromString(i.toString))) + val schema2 = StructType((1 to N).map(i => StructField("", StringType))) + val joined = new JoinedRow(wideRow1, wideRow2) + val joinedSchema = StructType(schema1 ++ schema2) + val nested = new JoinedRow(InternalRow(joined, joined), joined) + val nestedSchema = StructType( + Seq(StructField("", joinedSchema), StructField("", joinedSchema)) ++ joinedSchema) + + val safeProj = FromUnsafeProjection(nestedSchema) + val result = safeProj(nested) + + // test generated MutableProjection + val exprs = nestedSchema.fields.zipWithIndex.map { case (f, i) => + BoundReference(i, f.dataType, true) + } + val mutableProj = GenerateMutableProjection.generate(exprs) + val row1 = mutableProj(result) + assert(result === row1) + val row2 = mutableProj(result) + assert(result === row2) + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala index e3675367d78e4..0d11958876ce9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/complexTypesSuite.scala @@ -168,7 +168,7 @@ class ComplexTypesSuite extends PlanTest{ test("SPARK-22570: CreateArray should not create a lot of global variables") { val ctx = new CodegenContext CreateArray(Seq(Literal(1))).genCode(ctx) - assert(ctx.mutableStates.length == 0) + assert(ctx.inlinedMutableStates.length == 0) } test("simplify map ops") { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index a9bfb634fbdea..782cec5e292ba 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -68,30 +68,26 @@ private[sql] trait ColumnarBatchScan extends CodegenSupport { */ // TODO: return ColumnarBatch.Rows instead override protected def doProduce(ctx: CodegenContext): String = { - val input = ctx.freshName("input") // PhysicalRDD always just has one input - ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];") + val input = ctx.addMutableState("scala.collection.Iterator", "input", + v => s"$v = inputs[0];") // metrics val numOutputRows = metricTerm(ctx, "numOutputRows") val scanTimeMetric = metricTerm(ctx, "scanTime") - val scanTimeTotalNs = ctx.freshName("scanTime") - ctx.addMutableState(ctx.JAVA_LONG, scanTimeTotalNs, s"$scanTimeTotalNs = 0;") + val scanTimeTotalNs = ctx.addMutableState(ctx.JAVA_LONG, "scanTime") // init as scanTime = 0 val columnarBatchClz = classOf[ColumnarBatch].getName - val batch = ctx.freshName("batch") - 
ctx.addMutableState(columnarBatchClz, batch, s"$batch = null;") + val batch = ctx.addMutableState(columnarBatchClz, "batch") - val idx = ctx.freshName("batchIdx") - ctx.addMutableState(ctx.JAVA_INT, idx, s"$idx = 0;") - val colVars = output.indices.map(i => ctx.freshName("colInstance" + i)) + val idx = ctx.addMutableState(ctx.JAVA_INT, "batchIdx") // init as batchIdx = 0 val columnVectorClzs = vectorTypes.getOrElse( - Seq.fill(colVars.size)(classOf[ColumnVector].getName)) - val columnAssigns = colVars.zip(columnVectorClzs).zipWithIndex.map { - case ((name, columnVectorClz), i) => - ctx.addMutableState(columnVectorClz, name, s"$name = null;") - s"$name = ($columnVectorClz) $batch.column($i);" - } + Seq.fill(output.indices.size)(classOf[ColumnVector].getName)) + val (colVars, columnAssigns) = columnVectorClzs.zipWithIndex.map { + case (columnVectorClz, i) => + val name = ctx.addMutableState(columnVectorClz, s"colInstance$i") + (name, s"$name = ($columnVectorClz) $batch.column($i);") + }.unzip val nextBatch = ctx.freshName("nextBatch") val nextBatchFuncName = ctx.addNewFunction(nextBatch, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala index 27c7dc3ee4533..d1ff82c7c06bc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala @@ -110,8 +110,7 @@ case class RowDataSourceScanExec( override protected def doProduce(ctx: CodegenContext): String = { val numOutputRows = metricTerm(ctx, "numOutputRows") // PhysicalRDD always just has one input - val input = ctx.freshName("input") - ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];") + val input = ctx.addMutableState("scala.collection.Iterator", "input", v => s"$v = inputs[0];") val exprRows = output.zipWithIndex.map{ case (a, i) => BoundReference(i, a.dataType, a.nullable) } @@ -353,8 +352,7 @@ case class FileSourceScanExec( } val numOutputRows = metricTerm(ctx, "numOutputRows") // PhysicalRDD always just has one input - val input = ctx.freshName("input") - ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];") + val input = ctx.addMutableState("scala.collection.Iterator", "input", v => s"$v = inputs[0];") val row = ctx.freshName("row") ctx.INPUT_ROW = row diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala index c0e21343ae623..daff3c49e7517 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala @@ -133,20 +133,18 @@ case class SortExec( override def needStopCheck: Boolean = false override protected def doProduce(ctx: CodegenContext): String = { - val needToSort = ctx.freshName("needToSort") - ctx.addMutableState(ctx.JAVA_BOOLEAN, needToSort, s"$needToSort = true;") + val needToSort = ctx.addMutableState(ctx.JAVA_BOOLEAN, "needToSort", v => s"$v = true;") // Initialize the class member variables. This includes the instance of the Sorter and // the iterator to return sorted rows. 
val thisPlan = ctx.addReferenceObj("plan", this) - sorterVariable = ctx.freshName("sorter") - ctx.addMutableState(classOf[UnsafeExternalRowSorter].getName, sorterVariable, - s"$sorterVariable = $thisPlan.createSorter();") - val metrics = ctx.freshName("metrics") - ctx.addMutableState(classOf[TaskMetrics].getName, metrics, - s"$metrics = org.apache.spark.TaskContext.get().taskMetrics();") - val sortedIterator = ctx.freshName("sortedIter") - ctx.addMutableState("scala.collection.Iterator", sortedIterator, "") + // inline mutable state since not many Sort operations in a task + sorterVariable = ctx.addMutableState(classOf[UnsafeExternalRowSorter].getName, "sorter", + v => s"$v = $thisPlan.createSorter();", forceInline = true) + val metrics = ctx.addMutableState(classOf[TaskMetrics].getName, "metrics", + v => s"$v = org.apache.spark.TaskContext.get().taskMetrics();", forceInline = true) + val sortedIterator = ctx.addMutableState("scala.collection.Iterator", "sortedIter", + forceInline = true) val addToSorter = ctx.freshName("addToSorter") val addToSorterFuncName = ctx.addNewFunction(addToSorter, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 7166b7771e4db..9e7008d1e0c31 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -282,9 +282,10 @@ case class InputAdapter(child: SparkPlan) extends UnaryExecNode with CodegenSupp } override def doProduce(ctx: CodegenContext): String = { - val input = ctx.freshName("input") // Right now, InputAdapter is only used when there is one input RDD. - ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];") + // inline mutable state since an inputAdaptor in a task + val input = ctx.addMutableState("scala.collection.Iterator", "input", v => s"$v = inputs[0];", + forceInline = true) val row = ctx.freshName("row") s""" | while ($input.hasNext() && !stopEarly()) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 9cadd13999e72..b1af360d85095 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -178,8 +178,7 @@ case class HashAggregateExec( private var bufVars: Seq[ExprCode] = _ private def doProduceWithoutKeys(ctx: CodegenContext): String = { - val initAgg = ctx.freshName("initAgg") - ctx.addMutableState(ctx.JAVA_BOOLEAN, initAgg, s"$initAgg = false;") + val initAgg = ctx.addMutableState(ctx.JAVA_BOOLEAN, "initAgg") // The generated function doesn't have input row in the code context. 
ctx.INPUT_ROW = null @@ -187,10 +186,8 @@ case class HashAggregateExec( val functions = aggregateExpressions.map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate]) val initExpr = functions.flatMap(f => f.initialValues) bufVars = initExpr.map { e => - val isNull = ctx.freshName("bufIsNull") - val value = ctx.freshName("bufValue") - ctx.addMutableState(ctx.JAVA_BOOLEAN, isNull) - ctx.addMutableState(ctx.javaType(e.dataType), value) + val isNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "bufIsNull") + val value = ctx.addMutableState(ctx.javaType(e.dataType), "bufValue") // The initial expression should not access any column val ev = e.genCode(ctx) val initVars = s""" @@ -568,8 +565,7 @@ case class HashAggregateExec( } private def doProduceWithKeys(ctx: CodegenContext): String = { - val initAgg = ctx.freshName("initAgg") - ctx.addMutableState(ctx.JAVA_BOOLEAN, initAgg, s"$initAgg = false;") + val initAgg = ctx.addMutableState(ctx.JAVA_BOOLEAN, "initAgg") if (sqlContext.conf.enableTwoLevelAggMap) { enableTwoLevelHashMap(ctx) } else { @@ -583,42 +579,41 @@ case class HashAggregateExec( val thisPlan = ctx.addReferenceObj("plan", this) // Create a name for the iterator from the fast hash map. - val iterTermForFastHashMap = ctx.freshName("fastHashMapIter") - if (isFastHashMapEnabled) { + val iterTermForFastHashMap = if (isFastHashMapEnabled) { // Generates the fast hash map class and creates the fash hash map term. - fastHashMapTerm = ctx.freshName("fastHashMap") val fastHashMapClassName = ctx.freshName("FastHashMap") if (isVectorizedHashMapEnabled) { val generatedMap = new VectorizedHashMapGenerator(ctx, aggregateExpressions, fastHashMapClassName, groupingKeySchema, bufferSchema).generate() ctx.addInnerClass(generatedMap) - ctx.addMutableState(fastHashMapClassName, fastHashMapTerm, - s"$fastHashMapTerm = new $fastHashMapClassName();") - ctx.addMutableState(s"java.util.Iterator", iterTermForFastHashMap) + fastHashMapTerm = ctx.addMutableState(fastHashMapClassName, "vectorizedHastHashMap", + v => s"$v = new $fastHashMapClassName();") + ctx.addMutableState(s"java.util.Iterator", "vectorizedFastHashMapIter") } else { val generatedMap = new RowBasedHashMapGenerator(ctx, aggregateExpressions, fastHashMapClassName, groupingKeySchema, bufferSchema).generate() ctx.addInnerClass(generatedMap) - ctx.addMutableState(fastHashMapClassName, fastHashMapTerm, - s"$fastHashMapTerm = new $fastHashMapClassName(" + + fastHashMapTerm = ctx.addMutableState(fastHashMapClassName, "fastHashMap", + v => s"$v = new $fastHashMapClassName(" + s"$thisPlan.getTaskMemoryManager(), $thisPlan.getEmptyAggregationBuffer());") ctx.addMutableState( "org.apache.spark.unsafe.KVIterator", - iterTermForFastHashMap) + "fastHashMapIter") } } // Create a name for the iterator from the regular hash map. 
- val iterTerm = ctx.freshName("mapIter") - ctx.addMutableState(classOf[KVIterator[UnsafeRow, UnsafeRow]].getName, iterTerm) + // inline mutable state since not many aggregation operations in a task + val iterTerm = ctx.addMutableState(classOf[KVIterator[UnsafeRow, UnsafeRow]].getName, + "mapIter", forceInline = true) // create hashMap - hashMapTerm = ctx.freshName("hashMap") val hashMapClassName = classOf[UnsafeFixedWidthAggregationMap].getName - ctx.addMutableState(hashMapClassName, hashMapTerm, s"$hashMapTerm = $thisPlan.createHashMap();") - sorterTerm = ctx.freshName("sorter") - ctx.addMutableState(classOf[UnsafeKVExternalSorter].getName, sorterTerm) + hashMapTerm = ctx.addMutableState(hashMapClassName, "hashMap", + v => s"$v = $thisPlan.createHashMap();") + sorterTerm = ctx.addMutableState(classOf[UnsafeKVExternalSorter].getName, "sorter", + forceInline = true) val doAgg = ctx.freshName("doAggregateWithKeys") val peakMemory = metricTerm(ctx, "peakMemory") @@ -758,8 +753,7 @@ case class HashAggregateExec( val (checkFallbackForGeneratedHashMap, checkFallbackForBytesToBytesMap, resetCounter, incCounter) = if (testFallbackStartsAt.isDefined) { - val countTerm = ctx.freshName("fallbackCounter") - ctx.addMutableState(ctx.JAVA_INT, countTerm, s"$countTerm = 0;") + val countTerm = ctx.addMutableState(ctx.JAVA_INT, "fallbackCounter") (s"$countTerm < ${testFallbackStartsAt.get._1}", s"$countTerm < ${testFallbackStartsAt.get._2}", s"$countTerm = 0;", s"$countTerm += 1;") } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashMapGenerator.scala index 85b4529501ea8..1c613b19c4ab1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashMapGenerator.scala @@ -46,10 +46,8 @@ abstract class HashMapGenerator( val functions = aggregateExpressions.map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate]) val initExpr = functions.flatMap(f => f.initialValues) initExpr.map { e => - val isNull = ctx.freshName("bufIsNull") - val value = ctx.freshName("bufValue") - ctx.addMutableState(ctx.JAVA_BOOLEAN, isNull) - ctx.addMutableState(ctx.javaType(e.dataType), value) + val isNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "bufIsNull") + val value = ctx.addMutableState(ctx.javaType(e.dataType), "bufValue") val ev = e.genCode(ctx) val initVars = s""" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index c9a15147e30d0..78137d3f97cfc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -279,29 +279,30 @@ case class SampleExec( override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { val numOutput = metricTerm(ctx, "numOutputRows") - val sampler = ctx.freshName("sampler") if (withReplacement) { val samplerClass = classOf[PoissonSampler[UnsafeRow]].getName val initSampler = ctx.freshName("initSampler") - val initSamplerFuncName = ctx.addNewFunction(initSampler, - s""" - | private void $initSampler() { - | $sampler = new $samplerClass($upperBound - $lowerBound, false); - | java.util.Random random = new java.util.Random(${seed}L); - | long randomSeed = random.nextLong(); - | 
int loopCount = 0; - | while (loopCount < partitionIndex) { - | randomSeed = random.nextLong(); - | loopCount += 1; - | } - | $sampler.setSeed(randomSeed); - | } - """.stripMargin.trim) - - ctx.addMutableState(s"$samplerClass", sampler, - s"$initSamplerFuncName();") + // inline mutable state since not many Sample operations in a task + val sampler = ctx.addMutableState(s"$samplerClass", "sampleReplace", + v => { + val initSamplerFuncName = ctx.addNewFunction(initSampler, + s""" + | private void $initSampler() { + | $v = new $samplerClass($upperBound - $lowerBound, false); + | java.util.Random random = new java.util.Random(${seed}L); + | long randomSeed = random.nextLong(); + | int loopCount = 0; + | while (loopCount < partitionIndex) { + | randomSeed = random.nextLong(); + | loopCount += 1; + | } + | $v.setSeed(randomSeed); + | } + """.stripMargin.trim) + s"$initSamplerFuncName();" + }, forceInline = true) val samplingCount = ctx.freshName("samplingCount") s""" @@ -313,10 +314,10 @@ case class SampleExec( """.stripMargin.trim } else { val samplerClass = classOf[BernoulliCellSampler[UnsafeRow]].getName - ctx.addMutableState(s"$samplerClass", sampler, - s""" - | $sampler = new $samplerClass($lowerBound, $upperBound, false); - | $sampler.setSeed(${seed}L + partitionIndex); + val sampler = ctx.addMutableState(s"$samplerClass", "sampler", + v => s""" + | $v = new $samplerClass($lowerBound, $upperBound, false); + | $v.setSeed(${seed}L + partitionIndex); """.stripMargin.trim) s""" @@ -363,20 +364,18 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) protected override def doProduce(ctx: CodegenContext): String = { val numOutput = metricTerm(ctx, "numOutputRows") - val initTerm = ctx.freshName("initRange") - ctx.addMutableState(ctx.JAVA_BOOLEAN, initTerm, s"$initTerm = false;") - val number = ctx.freshName("number") - ctx.addMutableState(ctx.JAVA_LONG, number, s"$number = 0L;") + val initTerm = ctx.addMutableState(ctx.JAVA_BOOLEAN, "initRange") + val number = ctx.addMutableState(ctx.JAVA_LONG, "number") val value = ctx.freshName("value") val ev = ExprCode("", "false", value) val BigInt = classOf[java.math.BigInteger].getName - val taskContext = ctx.freshName("taskContext") - ctx.addMutableState("TaskContext", taskContext, s"$taskContext = TaskContext.get();") - val inputMetrics = ctx.freshName("inputMetrics") - ctx.addMutableState("InputMetrics", inputMetrics, - s"$inputMetrics = $taskContext.taskMetrics().inputMetrics();") + // inline mutable state since not many Range operations in a task + val taskContext = ctx.addMutableState("TaskContext", "taskContext", + v => s"$v = TaskContext.get();", forceInline = true) + val inputMetrics = ctx.addMutableState("InputMetrics", "inputMetrics", + v => s"$v = $taskContext.taskMetrics().inputMetrics();", forceInline = true) // In order to periodically update the metrics without inflicting performance penalty, this // operator produces elements in batches. After a batch is complete, the metrics are updated @@ -386,12 +385,10 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) // the metrics. // Once number == batchEnd, it's time to progress to the next batch. - val batchEnd = ctx.freshName("batchEnd") - ctx.addMutableState(ctx.JAVA_LONG, batchEnd, s"$batchEnd = 0;") + val batchEnd = ctx.addMutableState(ctx.JAVA_LONG, "batchEnd") // How many values should still be generated by this range operator. 
- val numElementsTodo = ctx.freshName("numElementsTodo") - ctx.addMutableState(ctx.JAVA_LONG, numElementsTodo, s"$numElementsTodo = 0L;") + val numElementsTodo = ctx.addMutableState(ctx.JAVA_LONG, "numElementsTodo") // How many values should be generated in the next batch. val nextBatchTodo = ctx.freshName("nextBatchTodo") @@ -440,10 +437,6 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) | } """.stripMargin) - val input = ctx.freshName("input") - // Right now, Range is only used when there is one upstream. - ctx.addMutableState("scala.collection.Iterator", input, s"$input = inputs[0];") - val localIdx = ctx.freshName("localIdx") val localEnd = ctx.freshName("localEnd") val range = ctx.freshName("range") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala index ff5dd707f0b38..4f28eeb725cbb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/GenerateColumnAccessor.scala @@ -70,7 +70,6 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera val ctx = newCodeGenContext() val numFields = columnTypes.size val (initializeAccessors, extractors) = columnTypes.zipWithIndex.map { case (dt, index) => - val accessorName = ctx.freshName("accessor") val accessorCls = dt match { case NullType => classOf[NullColumnAccessor].getName case BooleanType => classOf[BooleanColumnAccessor].getName @@ -89,7 +88,7 @@ object GenerateColumnAccessor extends CodeGenerator[Seq[DataType], ColumnarItera case array: ArrayType => classOf[ArrayColumnAccessor].getName case t: MapType => classOf[MapColumnAccessor].getName } - ctx.addMutableState(accessorCls, accessorName) + val accessorName = ctx.addMutableState(accessorCls, "accessor") val createCode = dt match { case t if ctx.isPrimitiveType(dt) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala index c96ed6ef41016..ee763e23415cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala @@ -134,19 +134,18 @@ case class BroadcastHashJoinExec( // create a name for HashedRelation val broadcastRelation = buildPlan.executeBroadcast[HashedRelation]() val broadcast = ctx.addReferenceObj("broadcast", broadcastRelation) - val relationTerm = ctx.freshName("relation") val clsName = broadcastRelation.value.getClass.getName // At the end of the task, we update the avg hash probe. 
val avgHashProbe = metricTerm(ctx, "avgHashProbe") - val addTaskListener = genTaskListener(avgHashProbe, relationTerm) - ctx.addMutableState(clsName, relationTerm, - s""" - | $relationTerm = (($clsName) $broadcast.value()).asReadOnlyCopy(); - | incPeakExecutionMemory($relationTerm.estimatedSize()); - | $addTaskListener - """.stripMargin) + // inline mutable state since not many join operations in a task + val relationTerm = ctx.addMutableState(clsName, "relation", + v => s""" + | $v = (($clsName) $broadcast.value()).asReadOnlyCopy(); + | incPeakExecutionMemory($v.estimatedSize()); + | ${genTaskListener(avgHashProbe, v)} + """.stripMargin, forceInline = true) (broadcastRelation, relationTerm) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 554b73181116c..073730462a75f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -422,10 +422,9 @@ case class SortMergeJoinExec( */ private def genScanner(ctx: CodegenContext): (String, String) = { // Create class member for next row from both sides. - val leftRow = ctx.freshName("leftRow") - ctx.addMutableState("InternalRow", leftRow) - val rightRow = ctx.freshName("rightRow") - ctx.addMutableState("InternalRow", rightRow, s"$rightRow = null;") + // inline mutable state since not many join operations in a task + val leftRow = ctx.addMutableState("InternalRow", "leftRow", forceInline = true) + val rightRow = ctx.addMutableState("InternalRow", "rightRow", forceInline = true) // Create variables for join keys from both sides. val leftKeyVars = createJoinKey(ctx, leftRow, leftKeys, left.output) @@ -436,14 +435,13 @@ case class SortMergeJoinExec( val rightKeyVars = copyKeys(ctx, rightKeyTmpVars) // A list to hold all matched rows from right side. - val matches = ctx.freshName("matches") val clsName = classOf[ExternalAppendOnlyUnsafeRowArray].getName val spillThreshold = getSpillThreshold val inMemoryThreshold = getInMemoryThreshold - ctx.addMutableState(clsName, matches, - s"$matches = new $clsName($inMemoryThreshold, $spillThreshold);") + val matches = ctx.addMutableState(clsName, "matches", + v => s"$v = new $clsName($inMemoryThreshold, $spillThreshold);") // Copy the left keys as class members so they could be used in next function call. 
val matchedKeyVars = copyKeys(ctx, leftKeyVars) @@ -578,10 +576,11 @@ case class SortMergeJoinExec( override def needCopyResult: Boolean = true override def doProduce(ctx: CodegenContext): String = { - val leftInput = ctx.freshName("leftInput") - ctx.addMutableState("scala.collection.Iterator", leftInput, s"$leftInput = inputs[0];") - val rightInput = ctx.freshName("rightInput") - ctx.addMutableState("scala.collection.Iterator", rightInput, s"$rightInput = inputs[1];") + // inline mutable state since not many join operations in a task + val leftInput = ctx.addMutableState("scala.collection.Iterator", "leftInput", + v => s"$v = inputs[0];", forceInline = true) + val rightInput = ctx.addMutableState("scala.collection.Iterator", "rightInput", + v => s"$v = inputs[1];", forceInline = true) val (leftRow, matches) = genScanner(ctx) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala index a8556f6ba107a..cccee63bc0680 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/limit.scala @@ -71,8 +71,7 @@ trait BaseLimitExec extends UnaryExecNode with CodegenSupport { } override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { - val stopEarly = ctx.freshName("stopEarly") - ctx.addMutableState(ctx.JAVA_BOOLEAN, stopEarly, s"$stopEarly = false;") + val stopEarly = ctx.addMutableState(ctx.JAVA_BOOLEAN, "stopEarly") // init as stopEarly = false ctx.addNewFunction("stopEarly", s""" @Override @@ -80,8 +79,7 @@ trait BaseLimitExec extends UnaryExecNode with CodegenSupport { return $stopEarly; } """, inlineToOuterClass = true) - val countTerm = ctx.freshName("count") - ctx.addMutableState(ctx.JAVA_INT, countTerm, s"$countTerm = 0;") + val countTerm = ctx.addMutableState(ctx.JAVA_INT, "count") // init as count = 0 s""" | if ($countTerm < $limit) { | $countTerm += 1; From ef10f452e62c77d0434e80f7266f6685eb1bcb2c Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 19 Dec 2017 09:05:47 -0800 Subject: [PATCH 147/356] [SPARK-21652][SQL][FOLLOW-UP] Fix rule conflict caused by InferFiltersFromConstraints ## What changes were proposed in this pull request? The optimizer rule `InferFiltersFromConstraints` could cause the batch `Operator Optimizations` to exceed the max iteration limit (i.e., 100), so that the final plan might not be properly optimized. The rule `InferFiltersFromConstraints` can conflict with the other Filter/Join predicate reduction rules. Thus, we need to separate `InferFiltersFromConstraints` from the other rules. This PR separates `InferFiltersFromConstraints` from the main batch `Operator Optimizations`. ## How was this patch tested? The existing test cases. Author: gatorsmile Closes #19149 from gatorsmile/inferFilterRule.
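In short, the fix splits the operator-optimization rule set into three batches, applying the problematic rule exactly once in the middle. A minimal sketch of the resulting layout (assuming the `Batch`, `fixedPoint`, and `Once` helpers that appear in the diff below; `allOperatorRules` is a placeholder for the `operatorOptimizationRuleSet` defined there):

```
// Sketch only, not the full Optimizer code: run every operator-optimization rule except
// InferFiltersFromConstraints to a fixed point, apply InferFiltersFromConstraints once,
// then run the remaining rules to a fixed point again.
val rulesWithoutInfer = allOperatorRules.filterNot(_ == InferFiltersFromConstraints)

val operatorOptimizationBatch =
  Batch("Operator Optimization before Inferring Filters", fixedPoint, rulesWithoutInfer: _*) ::
  Batch("Infer Filters", Once, InferFiltersFromConstraints) ::
  Batch("Operator Optimization after Inferring Filters", fixedPoint, rulesWithoutInfer: _*) :: Nil
```

Because the one-shot `Infer Filters` batch sits between two converging fixed-point batches, the inferred filters are still simplified and pushed down, but they can no longer keep the main batch from reaching a fixed point within the iteration limit.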
--- .../sql/catalyst/optimizer/Optimizer.scala | 115 ++++++++++-------- 1 file changed, 64 insertions(+), 51 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 5acadf8cf330e..6a4d1e997c3c5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -47,7 +47,62 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) protected def fixedPoint = FixedPoint(SQLConf.get.optimizerMaxIterations) def batches: Seq[Batch] = { - Batch("Eliminate Distinct", Once, EliminateDistinct) :: + val operatorOptimizationRuleSet = + Seq( + // Operator push down + PushProjectionThroughUnion, + ReorderJoin, + EliminateOuterJoin, + PushPredicateThroughJoin, + PushDownPredicate, + LimitPushDown, + ColumnPruning, + InferFiltersFromConstraints, + // Operator combine + CollapseRepartition, + CollapseProject, + CollapseWindow, + CombineFilters, + CombineLimits, + CombineUnions, + // Constant folding and strength reduction + NullPropagation, + ConstantPropagation, + FoldablePropagation, + OptimizeIn, + ConstantFolding, + ReorderAssociativeOperator, + LikeSimplification, + BooleanSimplification, + SimplifyConditionals, + RemoveDispensableExpressions, + SimplifyBinaryComparison, + PruneFilters, + EliminateSorts, + SimplifyCasts, + SimplifyCaseConversionExpressions, + RewriteCorrelatedScalarSubquery, + EliminateSerialization, + RemoveRedundantAliases, + RemoveRedundantProject, + SimplifyCreateStructOps, + SimplifyCreateArrayOps, + SimplifyCreateMapOps, + CombineConcats) ++ + extendedOperatorOptimizationRules + + val operatorOptimizationBatch: Seq[Batch] = { + val rulesWithoutInferFiltersFromConstraints = + operatorOptimizationRuleSet.filterNot(_ == InferFiltersFromConstraints) + Batch("Operator Optimization before Inferring Filters", fixedPoint, + rulesWithoutInferFiltersFromConstraints: _*) :: + Batch("Infer Filters", Once, + InferFiltersFromConstraints) :: + Batch("Operator Optimization after Inferring Filters", fixedPoint, + rulesWithoutInferFiltersFromConstraints: _*) :: Nil + } + + (Batch("Eliminate Distinct", Once, EliminateDistinct) :: // Technically some of the rules in Finish Analysis are not optimizer rules and belong more // in the analyzer, because they are needed for correctness (e.g. ComputeCurrentTime). 
// However, because we also use the analyzer to canonicalized queries (for view definition), @@ -81,68 +136,26 @@ abstract class Optimizer(sessionCatalog: SessionCatalog) ReplaceDistinctWithAggregate) :: Batch("Aggregate", fixedPoint, RemoveLiteralFromGroupExpressions, - RemoveRepetitionFromGroupExpressions) :: - Batch("Operator Optimizations", fixedPoint, Seq( - // Operator push down - PushProjectionThroughUnion, - ReorderJoin, - EliminateOuterJoin, - InferFiltersFromConstraints, - BooleanSimplification, - PushPredicateThroughJoin, - PushDownPredicate, - LimitPushDown, - ColumnPruning, - // Operator combine - CollapseRepartition, - CollapseProject, - CollapseWindow, - CombineFilters, - CombineLimits, - CombineUnions, - // Constant folding and strength reduction - NullPropagation, - ConstantPropagation, - FoldablePropagation, - OptimizeIn, - ConstantFolding, - ReorderAssociativeOperator, - LikeSimplification, - BooleanSimplification, - SimplifyConditionals, - RemoveDispensableExpressions, - SimplifyBinaryComparison, - PruneFilters, - EliminateSorts, - SimplifyCasts, - SimplifyCaseConversionExpressions, - RewriteCorrelatedScalarSubquery, - EliminateSerialization, - RemoveRedundantAliases, - RemoveRedundantProject, - SimplifyCreateStructOps, - SimplifyCreateArrayOps, - SimplifyCreateMapOps, - CombineConcats) ++ - extendedOperatorOptimizationRules: _*) :: + RemoveRepetitionFromGroupExpressions) :: Nil ++ + operatorOptimizationBatch) :+ Batch("Join Reorder", Once, - CostBasedJoinReorder) :: + CostBasedJoinReorder) :+ Batch("Decimal Optimizations", fixedPoint, - DecimalAggregates) :: + DecimalAggregates) :+ Batch("Object Expressions Optimization", fixedPoint, EliminateMapObjects, - CombineTypedFilters) :: + CombineTypedFilters) :+ Batch("LocalRelation", fixedPoint, ConvertToLocalRelation, - PropagateEmptyRelation) :: + PropagateEmptyRelation) :+ // The following batch should be executed after batch "Join Reorder" and "LocalRelation". Batch("Check Cartesian Products", Once, - CheckCartesianProducts) :: + CheckCartesianProducts) :+ Batch("RewriteSubquery", Once, RewritePredicateSubquery, ColumnPruning, CollapseProject, - RemoveRedundantProject) } /** From 6129ffa11ea62437a25844455e87a1e4c21b030f Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 19 Dec 2017 11:56:22 -0800 Subject: [PATCH 148/356] [SPARK-22821][TEST] Basic tests for WidenSetOperationTypes, BooleanEquality, StackCoercion and Division ## What changes were proposed in this pull request? Test coverage for `WidenSetOperationTypes`, `BooleanEquality`, `StackCoercion` and `Division`; this is a sub-task of [SPARK-22722](https://issues.apache.org/jira/browse/SPARK-22722). ## How was this patch tested? N/A Author: Yuming Wang Closes #20006 from wangyum/SPARK-22821.
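The new golden files below are generated by `SQLQueryTestSuite`: each query is recorded together with its analyzed schema and output, so the coercion applied by each rule is visible directly in the schema column names. As a rough illustration (a sketch run in a spark-shell, not part of the patch), the same coercion can be observed on an ad-hoc DataFrame:

```
// Sketch only: mirrors what the booleanEquality golden file records for its first query.
// Assumes the temporary view created by the test: CREATE TEMPORARY VIEW t AS SELECT 1.
val df = spark.sql("SELECT true = cast(1 as tinyint) FROM t")
df.schema.head.name // e.g. "(CAST(true AS TINYINT) = CAST(1 AS TINYINT))", matching the golden schema
df.collect()        // Array([true])
```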
--- .../typeCoercion/native/booleanEquality.sql | 122 ++ .../inputs/typeCoercion/native/division.sql | 174 +++ .../native/widenSetOperationTypes.sql | 175 +++ .../native/booleanEquality.sql.out | 802 ++++++++++ .../typeCoercion/native/division.sql.out | 1242 ++++++++++++++++ .../native/widenSetOperationTypes.sql.out | 1305 +++++++++++++++++ 6 files changed, 3820 insertions(+) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/booleanEquality.sql create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/division.sql create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/widenSetOperationTypes.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/booleanEquality.sql.out create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/division.sql.out create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/widenSetOperationTypes.sql.out diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/booleanEquality.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/booleanEquality.sql new file mode 100644 index 0000000000000..442f2355f8e3a --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/booleanEquality.sql @@ -0,0 +1,122 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +CREATE TEMPORARY VIEW t AS SELECT 1; + +SELECT true = cast(1 as tinyint) FROM t; +SELECT true = cast(1 as smallint) FROM t; +SELECT true = cast(1 as int) FROM t; +SELECT true = cast(1 as bigint) FROM t; +SELECT true = cast(1 as float) FROM t; +SELECT true = cast(1 as double) FROM t; +SELECT true = cast(1 as decimal(10, 0)) FROM t; +SELECT true = cast(1 as string) FROM t; +SELECT true = cast('1' as binary) FROM t; +SELECT true = cast(1 as boolean) FROM t; +SELECT true = cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT true = cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT true <=> cast(1 as tinyint) FROM t; +SELECT true <=> cast(1 as smallint) FROM t; +SELECT true <=> cast(1 as int) FROM t; +SELECT true <=> cast(1 as bigint) FROM t; +SELECT true <=> cast(1 as float) FROM t; +SELECT true <=> cast(1 as double) FROM t; +SELECT true <=> cast(1 as decimal(10, 0)) FROM t; +SELECT true <=> cast(1 as string) FROM t; +SELECT true <=> cast('1' as binary) FROM t; +SELECT true <=> cast(1 as boolean) FROM t; +SELECT true <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT true <=> cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) = true FROM t; +SELECT cast(1 as smallint) = true FROM t; +SELECT cast(1 as int) = true FROM t; +SELECT cast(1 as bigint) = true FROM t; +SELECT cast(1 as float) = true FROM t; +SELECT cast(1 as double) = true FROM t; +SELECT cast(1 as decimal(10, 0)) = true FROM t; +SELECT cast(1 as string) = true FROM t; +SELECT cast('1' as binary) = true FROM t; +SELECT cast(1 as boolean) = true FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = true FROM t; +SELECT cast('2017-12-11 09:30:00' as date) = true FROM t; + +SELECT cast(1 as tinyint) <=> true FROM t; +SELECT cast(1 as smallint) <=> true FROM t; +SELECT cast(1 as int) <=> true FROM t; +SELECT cast(1 as bigint) <=> true FROM t; +SELECT cast(1 as float) <=> true FROM t; +SELECT cast(1 as double) <=> true FROM t; +SELECT cast(1 as decimal(10, 0)) <=> true FROM t; +SELECT cast(1 as string) <=> true FROM t; +SELECT cast('1' as binary) <=> true FROM t; +SELECT cast(1 as boolean) <=> true FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> true FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <=> true FROM t; + +SELECT false = cast(0 as tinyint) FROM t; +SELECT false = cast(0 as smallint) FROM t; +SELECT false = cast(0 as int) FROM t; +SELECT false = cast(0 as bigint) FROM t; +SELECT false = cast(0 as float) FROM t; +SELECT false = cast(0 as double) FROM t; +SELECT false = cast(0 as decimal(10, 0)) FROM t; +SELECT false = cast(0 as string) FROM t; +SELECT false = cast('0' as binary) FROM t; +SELECT false = cast(0 as boolean) FROM t; +SELECT false = cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT false = cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT false <=> cast(0 as tinyint) FROM t; +SELECT false <=> cast(0 as smallint) FROM t; +SELECT false <=> cast(0 as int) FROM t; +SELECT false <=> cast(0 as bigint) FROM t; +SELECT false <=> cast(0 as float) FROM t; +SELECT false <=> cast(0 as double) FROM t; +SELECT false <=> cast(0 as decimal(10, 0)) FROM t; +SELECT false <=> cast(0 as string) FROM t; +SELECT false <=> cast('0' as binary) FROM t; +SELECT false <=> cast(0 as boolean) FROM t; +SELECT false <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT false <=> cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(0 as tinyint) = false FROM t; +SELECT cast(0 as smallint) = false FROM t; +SELECT cast(0 as int) = false FROM t; 
+SELECT cast(0 as bigint) = false FROM t; +SELECT cast(0 as float) = false FROM t; +SELECT cast(0 as double) = false FROM t; +SELECT cast(0 as decimal(10, 0)) = false FROM t; +SELECT cast(0 as string) = false FROM t; +SELECT cast('0' as binary) = false FROM t; +SELECT cast(0 as boolean) = false FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = false FROM t; +SELECT cast('2017-12-11 09:30:00' as date) = false FROM t; + +SELECT cast(0 as tinyint) <=> false FROM t; +SELECT cast(0 as smallint) <=> false FROM t; +SELECT cast(0 as int) <=> false FROM t; +SELECT cast(0 as bigint) <=> false FROM t; +SELECT cast(0 as float) <=> false FROM t; +SELECT cast(0 as double) <=> false FROM t; +SELECT cast(0 as decimal(10, 0)) <=> false FROM t; +SELECT cast(0 as string) <=> false FROM t; +SELECT cast('0' as binary) <=> false FROM t; +SELECT cast(0 as boolean) <=> false FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> false FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <=> false FROM t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/division.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/division.sql new file mode 100644 index 0000000000000..d669740ddd9ca --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/division.sql @@ -0,0 +1,174 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +CREATE TEMPORARY VIEW t AS SELECT 1; + +SELECT cast(1 as tinyint) / cast(1 as tinyint) FROM t; +SELECT cast(1 as tinyint) / cast(1 as smallint) FROM t; +SELECT cast(1 as tinyint) / cast(1 as int) FROM t; +SELECT cast(1 as tinyint) / cast(1 as bigint) FROM t; +SELECT cast(1 as tinyint) / cast(1 as float) FROM t; +SELECT cast(1 as tinyint) / cast(1 as double) FROM t; +SELECT cast(1 as tinyint) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) / cast(1 as string) FROM t; +SELECT cast(1 as tinyint) / cast('1' as binary) FROM t; +SELECT cast(1 as tinyint) / cast(1 as boolean) FROM t; +SELECT cast(1 as tinyint) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as tinyint) / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as smallint) / cast(1 as tinyint) FROM t; +SELECT cast(1 as smallint) / cast(1 as smallint) FROM t; +SELECT cast(1 as smallint) / cast(1 as int) FROM t; +SELECT cast(1 as smallint) / cast(1 as bigint) FROM t; +SELECT cast(1 as smallint) / cast(1 as float) FROM t; +SELECT cast(1 as smallint) / cast(1 as double) FROM t; +SELECT cast(1 as smallint) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) / cast(1 as string) FROM t; +SELECT cast(1 as smallint) / cast('1' as binary) FROM t; +SELECT cast(1 as smallint) / cast(1 as boolean) FROM t; +SELECT cast(1 as smallint) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as smallint) / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as int) / cast(1 as tinyint) FROM t; +SELECT cast(1 as int) / cast(1 as smallint) FROM t; +SELECT cast(1 as int) / cast(1 as int) FROM t; +SELECT cast(1 as int) / cast(1 as bigint) FROM t; +SELECT cast(1 as int) / cast(1 as float) FROM t; +SELECT cast(1 as int) / cast(1 as double) FROM t; +SELECT cast(1 as int) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) / cast(1 as string) FROM t; +SELECT cast(1 as int) / cast('1' as binary) FROM t; +SELECT cast(1 as int) / cast(1 as boolean) FROM t; +SELECT cast(1 as int) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as int) / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as bigint) / cast(1 as tinyint) FROM t; +SELECT cast(1 as bigint) / cast(1 as smallint) FROM t; +SELECT cast(1 as bigint) / cast(1 as int) FROM t; +SELECT cast(1 as bigint) / cast(1 as bigint) FROM t; +SELECT cast(1 as bigint) / cast(1 as float) FROM t; +SELECT cast(1 as bigint) / cast(1 as double) FROM t; +SELECT cast(1 as bigint) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) / cast(1 as string) FROM t; +SELECT cast(1 as bigint) / cast('1' as binary) FROM t; +SELECT cast(1 as bigint) / cast(1 as boolean) FROM t; +SELECT cast(1 as bigint) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as bigint) / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as float) / cast(1 as tinyint) FROM t; +SELECT cast(1 as float) / cast(1 as smallint) FROM t; +SELECT cast(1 as float) / cast(1 as int) FROM t; +SELECT cast(1 as float) / cast(1 as bigint) FROM t; +SELECT cast(1 as float) / cast(1 as float) FROM t; +SELECT cast(1 as float) / cast(1 as double) FROM t; +SELECT cast(1 as float) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) / cast(1 as string) FROM t; +SELECT cast(1 as float) / cast('1' as binary) FROM t; +SELECT cast(1 as float) / cast(1 as boolean) FROM t; +SELECT cast(1 as float) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as float) / cast('2017-12-11 09:30:00' as date) FROM t; + 
+SELECT cast(1 as double) / cast(1 as tinyint) FROM t; +SELECT cast(1 as double) / cast(1 as smallint) FROM t; +SELECT cast(1 as double) / cast(1 as int) FROM t; +SELECT cast(1 as double) / cast(1 as bigint) FROM t; +SELECT cast(1 as double) / cast(1 as float) FROM t; +SELECT cast(1 as double) / cast(1 as double) FROM t; +SELECT cast(1 as double) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) / cast(1 as string) FROM t; +SELECT cast(1 as double) / cast('1' as binary) FROM t; +SELECT cast(1 as double) / cast(1 as boolean) FROM t; +SELECT cast(1 as double) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as double) / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as decimal(10, 0)) / cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as string) / cast(1 as tinyint) FROM t; +SELECT cast(1 as string) / cast(1 as smallint) FROM t; +SELECT cast(1 as string) / cast(1 as int) FROM t; +SELECT cast(1 as string) / cast(1 as bigint) FROM t; +SELECT cast(1 as string) / cast(1 as float) FROM t; +SELECT cast(1 as string) / cast(1 as double) FROM t; +SELECT cast(1 as string) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as string) / cast(1 as string) FROM t; +SELECT cast(1 as string) / cast('1' as binary) FROM t; +SELECT cast(1 as string) / cast(1 as boolean) FROM t; +SELECT cast(1 as string) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as string) / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast('1' as binary) / cast(1 as tinyint) FROM t; +SELECT cast('1' as binary) / cast(1 as smallint) FROM t; +SELECT cast('1' as binary) / cast(1 as int) FROM t; +SELECT cast('1' as binary) / cast(1 as bigint) FROM t; +SELECT cast('1' as binary) / cast(1 as float) FROM t; +SELECT cast('1' as binary) / cast(1 as double) FROM t; +SELECT cast('1' as binary) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) / cast(1 as string) FROM t; +SELECT cast('1' as binary) / cast('1' as binary) FROM t; +SELECT cast('1' as binary) / cast(1 as boolean) FROM t; +SELECT cast('1' as binary) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast('1' as binary) / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as boolean) / cast(1 as tinyint) FROM t; +SELECT cast(1 as boolean) / cast(1 as smallint) FROM t; +SELECT cast(1 as boolean) / cast(1 as int) FROM t; +SELECT cast(1 as boolean) / cast(1 as bigint) FROM t; +SELECT cast(1 as boolean) / cast(1 as float) FROM t; +SELECT cast(1 as boolean) / cast(1 as double) FROM t; +SELECT cast(1 as boolean) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as boolean) / cast(1 as string) FROM t; +SELECT cast(1 as boolean) / cast('1' as binary) FROM t; +SELECT cast(1 as boolean) / cast(1 as boolean) FROM t; +SELECT cast(1 as boolean) / cast('2017-12-11 09:30:00.0' as 
timestamp) FROM t; +SELECT cast(1 as boolean) / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as tinyint) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as smallint) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as int) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as bigint) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as float) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as double) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as string) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast('1' as binary) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as boolean) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as tinyint) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as smallint) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as int) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as bigint) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as float) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as double) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as string) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast('1' as binary) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as boolean) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) / cast('2017-12-11 09:30:00' as date) FROM t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/widenSetOperationTypes.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/widenSetOperationTypes.sql new file mode 100644 index 0000000000000..66e9689850d93 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/widenSetOperationTypes.sql @@ -0,0 +1,175 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +CREATE TEMPORARY VIEW t AS SELECT 1; + +-- UNION +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as tinyint) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as smallint) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as int) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast(1 as bigint) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT 
cast(1 as bigint) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as bigint) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as float) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as double) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast(1 as 
string) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast(1 as string) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast(1 as string) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as string) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast('1' as binary) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as boolean) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as tinyint) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 
as tinyint) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as smallint) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as int) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as bigint) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as float) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as double) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as string) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast('2' as binary) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as boolean) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/booleanEquality.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/booleanEquality.sql.out new file mode 100644 index 0000000000000..46775d79ff4a2 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/booleanEquality.sql.out @@ -0,0 +1,802 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 97 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT true = cast(1 as tinyint) FROM t +-- !query 1 schema +struct<(CAST(true AS TINYINT) = CAST(1 AS TINYINT)):boolean> +-- !query 1 output +true + + +-- !query 2 +SELECT true = cast(1 as smallint) FROM t +-- !query 2 schema +struct<(CAST(true AS SMALLINT) = CAST(1 AS SMALLINT)):boolean> +-- !query 2 output +true + + +-- !query 3 +SELECT true = cast(1 as int) FROM t +-- !query 3 schema +struct<(CAST(true AS INT) = CAST(1 AS INT)):boolean> +-- !query 3 output +true + + +-- !query 4 +SELECT true = cast(1 as bigint) FROM t +-- !query 4 schema +struct<(CAST(true AS BIGINT) = CAST(1 AS BIGINT)):boolean> +-- !query 4 output +true + + +-- !query 5 +SELECT true = cast(1 as float) FROM t +-- !query 5 schema +struct<(CAST(true AS FLOAT) = CAST(1 AS FLOAT)):boolean> +-- !query 5 output +true + + +-- !query 6 +SELECT true = cast(1 as double) FROM t +-- !query 6 schema +struct<(CAST(true AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 6 output +true + + +-- !query 7 +SELECT true = cast(1 as decimal(10, 0)) FROM t +-- !query 7 schema +struct<(CAST(true AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 7 output +true + + +-- !query 8 +SELECT true = cast(1 as string) FROM t +-- !query 8 schema +struct<(true = CAST(CAST(1 AS STRING) AS BOOLEAN)):boolean> +-- !query 8 output +true + + +-- !query 9 +SELECT true = cast('1' as binary) FROM t +-- !query 9 schema +struct<> +-- !query 9 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(true = CAST('1' AS BINARY))' (boolean and binary).; line 1 pos 7 + + +-- !query 10 +SELECT true = cast(1 as boolean) FROM t +-- !query 10 schema +struct<(true = CAST(1 AS BOOLEAN)):boolean> +-- !query 10 output +true + + +-- !query 11 +SELECT true = cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 11 schema +struct<> +-- !query 11 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(true = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(true = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (boolean and timestamp).; line 1 pos 7 + + +-- !query 12 +SELECT true = cast('2017-12-11 09:30:00' as date) FROM t +-- !query 12 schema +struct<> +-- !query 12 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(true = CAST('2017-12-11 09:30:00' AS DATE))' (boolean and date).; line 1 pos 7 + + +-- !query 13 +SELECT true <=> cast(1 as tinyint) FROM t +-- !query 13 schema +struct<(CAST(true AS TINYINT) <=> CAST(1 AS TINYINT)):boolean> +-- !query 13 output +true + + +-- !query 14 +SELECT true <=> cast(1 as smallint) FROM t +-- !query 14 schema +struct<(CAST(true AS SMALLINT) <=> CAST(1 AS SMALLINT)):boolean> +-- !query 14 output +true + + +-- !query 15 +SELECT true <=> cast(1 as int) FROM t +-- !query 15 schema +struct<(CAST(true AS INT) <=> CAST(1 AS INT)):boolean> +-- !query 15 output +true + + +-- !query 16 +SELECT true <=> cast(1 as bigint) FROM t +-- !query 16 schema +struct<(CAST(true AS BIGINT) <=> CAST(1 AS BIGINT)):boolean> +-- !query 16 output +true + + +-- !query 17 +SELECT true <=> cast(1 as float) FROM t +-- !query 17 schema +struct<(CAST(true AS FLOAT) <=> CAST(1 AS FLOAT)):boolean> +-- !query 17 output +true + + +-- !query 18 +SELECT true <=> cast(1 as double) FROM t +-- !query 18 schema +struct<(CAST(true AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> +-- !query 18 output +true + + +-- !query 19 +SELECT true <=> cast(1 as decimal(10, 0)) FROM t +-- !query 19 schema +struct<(CAST(true AS DECIMAL(10,0)) <=> CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 19 output +true + + +-- !query 20 +SELECT true <=> cast(1 as string) FROM t +-- !query 20 schema +struct<(true <=> CAST(CAST(1 AS STRING) AS BOOLEAN)):boolean> +-- !query 20 output +true + + +-- !query 21 +SELECT true <=> cast('1' as binary) FROM t +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true <=> CAST('1' AS BINARY))' due to data type mismatch: differing types in '(true <=> CAST('1' AS BINARY))' (boolean and binary).; line 1 pos 7 + + +-- !query 22 +SELECT true <=> cast(1 as boolean) FROM t +-- !query 22 schema +struct<(true <=> CAST(1 AS BOOLEAN)):boolean> +-- !query 22 output +true + + +-- !query 23 +SELECT true <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 23 schema +struct<> +-- !query 23 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(true <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (boolean and timestamp).; line 1 pos 7 + + +-- !query 24 +SELECT true <=> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 24 schema +struct<> +-- !query 24 output +org.apache.spark.sql.AnalysisException +cannot resolve '(true <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(true <=> CAST('2017-12-11 09:30:00' AS DATE))' (boolean and date).; line 1 pos 7 + + +-- !query 25 +SELECT cast(1 as tinyint) = true FROM t +-- !query 25 schema +struct<(CAST(1 AS TINYINT) = CAST(true AS TINYINT)):boolean> +-- !query 25 output +true + + +-- !query 26 +SELECT cast(1 as smallint) = true FROM t +-- !query 26 schema +struct<(CAST(1 AS SMALLINT) = CAST(true AS SMALLINT)):boolean> +-- !query 26 output 
+true + + +-- !query 27 +SELECT cast(1 as int) = true FROM t +-- !query 27 schema +struct<(CAST(1 AS INT) = CAST(true AS INT)):boolean> +-- !query 27 output +true + + +-- !query 28 +SELECT cast(1 as bigint) = true FROM t +-- !query 28 schema +struct<(CAST(1 AS BIGINT) = CAST(true AS BIGINT)):boolean> +-- !query 28 output +true + + +-- !query 29 +SELECT cast(1 as float) = true FROM t +-- !query 29 schema +struct<(CAST(1 AS FLOAT) = CAST(true AS FLOAT)):boolean> +-- !query 29 output +true + + +-- !query 30 +SELECT cast(1 as double) = true FROM t +-- !query 30 schema +struct<(CAST(1 AS DOUBLE) = CAST(true AS DOUBLE)):boolean> +-- !query 30 output +true + + +-- !query 31 +SELECT cast(1 as decimal(10, 0)) = true FROM t +-- !query 31 schema +struct<(CAST(1 AS DECIMAL(10,0)) = CAST(true AS DECIMAL(10,0))):boolean> +-- !query 31 output +true + + +-- !query 32 +SELECT cast(1 as string) = true FROM t +-- !query 32 schema +struct<(CAST(CAST(1 AS STRING) AS BOOLEAN) = true):boolean> +-- !query 32 output +true + + +-- !query 33 +SELECT cast('1' as binary) = true FROM t +-- !query 33 schema +struct<> +-- !query 33 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) = true)' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = true)' (binary and boolean).; line 1 pos 7 + + +-- !query 34 +SELECT cast(1 as boolean) = true FROM t +-- !query 34 schema +struct<(CAST(1 AS BOOLEAN) = true):boolean> +-- !query 34 output +true + + +-- !query 35 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = true FROM t +-- !query 35 schema +struct<> +-- !query 35 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = true)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = true)' (timestamp and boolean).; line 1 pos 7 + + +-- !query 36 +SELECT cast('2017-12-11 09:30:00' as date) = true FROM t +-- !query 36 schema +struct<> +-- !query 36 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = true)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = true)' (date and boolean).; line 1 pos 7 + + +-- !query 37 +SELECT cast(1 as tinyint) <=> true FROM t +-- !query 37 schema +struct<(CAST(1 AS TINYINT) <=> CAST(true AS TINYINT)):boolean> +-- !query 37 output +true + + +-- !query 38 +SELECT cast(1 as smallint) <=> true FROM t +-- !query 38 schema +struct<(CAST(1 AS SMALLINT) <=> CAST(true AS SMALLINT)):boolean> +-- !query 38 output +true + + +-- !query 39 +SELECT cast(1 as int) <=> true FROM t +-- !query 39 schema +struct<(CAST(1 AS INT) <=> CAST(true AS INT)):boolean> +-- !query 39 output +true + + +-- !query 40 +SELECT cast(1 as bigint) <=> true FROM t +-- !query 40 schema +struct<(CAST(1 AS BIGINT) <=> CAST(true AS BIGINT)):boolean> +-- !query 40 output +true + + +-- !query 41 +SELECT cast(1 as float) <=> true FROM t +-- !query 41 schema +struct<(CAST(1 AS FLOAT) <=> CAST(true AS FLOAT)):boolean> +-- !query 41 output +true + + +-- !query 42 +SELECT cast(1 as double) <=> true FROM t +-- !query 42 schema +struct<(CAST(1 AS DOUBLE) <=> CAST(true AS DOUBLE)):boolean> +-- !query 42 output +true + + +-- !query 43 +SELECT cast(1 as decimal(10, 0)) <=> true FROM t +-- !query 43 schema +struct<(CAST(1 AS DECIMAL(10,0)) <=> CAST(true AS DECIMAL(10,0))):boolean> +-- !query 43 output +true + + +-- !query 44 +SELECT cast(1 as string) <=> true FROM t +-- !query 44 schema +struct<(CAST(CAST(1 AS STRING) AS BOOLEAN) 
<=> true):boolean> +-- !query 44 output +true + + +-- !query 45 +SELECT cast('1' as binary) <=> true FROM t +-- !query 45 schema +struct<> +-- !query 45 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) <=> true)' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <=> true)' (binary and boolean).; line 1 pos 7 + + +-- !query 46 +SELECT cast(1 as boolean) <=> true FROM t +-- !query 46 schema +struct<(CAST(1 AS BOOLEAN) <=> true):boolean> +-- !query 46 output +true + + +-- !query 47 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> true FROM t +-- !query 47 schema +struct<> +-- !query 47 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> true)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> true)' (timestamp and boolean).; line 1 pos 7 + + +-- !query 48 +SELECT cast('2017-12-11 09:30:00' as date) <=> true FROM t +-- !query 48 schema +struct<> +-- !query 48 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> true)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> true)' (date and boolean).; line 1 pos 7 + + +-- !query 49 +SELECT false = cast(0 as tinyint) FROM t +-- !query 49 schema +struct<(CAST(false AS TINYINT) = CAST(0 AS TINYINT)):boolean> +-- !query 49 output +true + + +-- !query 50 +SELECT false = cast(0 as smallint) FROM t +-- !query 50 schema +struct<(CAST(false AS SMALLINT) = CAST(0 AS SMALLINT)):boolean> +-- !query 50 output +true + + +-- !query 51 +SELECT false = cast(0 as int) FROM t +-- !query 51 schema +struct<(CAST(false AS INT) = CAST(0 AS INT)):boolean> +-- !query 51 output +true + + +-- !query 52 +SELECT false = cast(0 as bigint) FROM t +-- !query 52 schema +struct<(CAST(false AS BIGINT) = CAST(0 AS BIGINT)):boolean> +-- !query 52 output +true + + +-- !query 53 +SELECT false = cast(0 as float) FROM t +-- !query 53 schema +struct<(CAST(false AS FLOAT) = CAST(0 AS FLOAT)):boolean> +-- !query 53 output +true + + +-- !query 54 +SELECT false = cast(0 as double) FROM t +-- !query 54 schema +struct<(CAST(false AS DOUBLE) = CAST(0 AS DOUBLE)):boolean> +-- !query 54 output +true + + +-- !query 55 +SELECT false = cast(0 as decimal(10, 0)) FROM t +-- !query 55 schema +struct<(CAST(false AS DECIMAL(10,0)) = CAST(0 AS DECIMAL(10,0))):boolean> +-- !query 55 output +true + + +-- !query 56 +SELECT false = cast(0 as string) FROM t +-- !query 56 schema +struct<(false = CAST(CAST(0 AS STRING) AS BOOLEAN)):boolean> +-- !query 56 output +true + + +-- !query 57 +SELECT false = cast('0' as binary) FROM t +-- !query 57 schema +struct<> +-- !query 57 output +org.apache.spark.sql.AnalysisException +cannot resolve '(false = CAST('0' AS BINARY))' due to data type mismatch: differing types in '(false = CAST('0' AS BINARY))' (boolean and binary).; line 1 pos 7 + + +-- !query 58 +SELECT false = cast(0 as boolean) FROM t +-- !query 58 schema +struct<(false = CAST(0 AS BOOLEAN)):boolean> +-- !query 58 output +true + + +-- !query 59 +SELECT false = cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 59 schema +struct<> +-- !query 59 output +org.apache.spark.sql.AnalysisException +cannot resolve '(false = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(false = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (boolean and timestamp).; line 1 pos 7 + + +-- !query 60 +SELECT false = cast('2017-12-11 09:30:00' 
as date) FROM t +-- !query 60 schema +struct<> +-- !query 60 output +org.apache.spark.sql.AnalysisException +cannot resolve '(false = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(false = CAST('2017-12-11 09:30:00' AS DATE))' (boolean and date).; line 1 pos 7 + + +-- !query 61 +SELECT false <=> cast(0 as tinyint) FROM t +-- !query 61 schema +struct<(CAST(false AS TINYINT) <=> CAST(0 AS TINYINT)):boolean> +-- !query 61 output +true + + +-- !query 62 +SELECT false <=> cast(0 as smallint) FROM t +-- !query 62 schema +struct<(CAST(false AS SMALLINT) <=> CAST(0 AS SMALLINT)):boolean> +-- !query 62 output +true + + +-- !query 63 +SELECT false <=> cast(0 as int) FROM t +-- !query 63 schema +struct<(CAST(false AS INT) <=> CAST(0 AS INT)):boolean> +-- !query 63 output +true + + +-- !query 64 +SELECT false <=> cast(0 as bigint) FROM t +-- !query 64 schema +struct<(CAST(false AS BIGINT) <=> CAST(0 AS BIGINT)):boolean> +-- !query 64 output +true + + +-- !query 65 +SELECT false <=> cast(0 as float) FROM t +-- !query 65 schema +struct<(CAST(false AS FLOAT) <=> CAST(0 AS FLOAT)):boolean> +-- !query 65 output +true + + +-- !query 66 +SELECT false <=> cast(0 as double) FROM t +-- !query 66 schema +struct<(CAST(false AS DOUBLE) <=> CAST(0 AS DOUBLE)):boolean> +-- !query 66 output +true + + +-- !query 67 +SELECT false <=> cast(0 as decimal(10, 0)) FROM t +-- !query 67 schema +struct<(CAST(false AS DECIMAL(10,0)) <=> CAST(0 AS DECIMAL(10,0))):boolean> +-- !query 67 output +true + + +-- !query 68 +SELECT false <=> cast(0 as string) FROM t +-- !query 68 schema +struct<(false <=> CAST(CAST(0 AS STRING) AS BOOLEAN)):boolean> +-- !query 68 output +true + + +-- !query 69 +SELECT false <=> cast('0' as binary) FROM t +-- !query 69 schema +struct<> +-- !query 69 output +org.apache.spark.sql.AnalysisException +cannot resolve '(false <=> CAST('0' AS BINARY))' due to data type mismatch: differing types in '(false <=> CAST('0' AS BINARY))' (boolean and binary).; line 1 pos 7 + + +-- !query 70 +SELECT false <=> cast(0 as boolean) FROM t +-- !query 70 schema +struct<(false <=> CAST(0 AS BOOLEAN)):boolean> +-- !query 70 output +true + + +-- !query 71 +SELECT false <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 71 schema +struct<> +-- !query 71 output +org.apache.spark.sql.AnalysisException +cannot resolve '(false <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(false <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (boolean and timestamp).; line 1 pos 7 + + +-- !query 72 +SELECT false <=> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 72 schema +struct<> +-- !query 72 output +org.apache.spark.sql.AnalysisException +cannot resolve '(false <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(false <=> CAST('2017-12-11 09:30:00' AS DATE))' (boolean and date).; line 1 pos 7 + + +-- !query 73 +SELECT cast(0 as tinyint) = false FROM t +-- !query 73 schema +struct<(CAST(0 AS TINYINT) = CAST(false AS TINYINT)):boolean> +-- !query 73 output +true + + +-- !query 74 +SELECT cast(0 as smallint) = false FROM t +-- !query 74 schema +struct<(CAST(0 AS SMALLINT) = CAST(false AS SMALLINT)):boolean> +-- !query 74 output +true + + +-- !query 75 +SELECT cast(0 as int) = false FROM t +-- !query 75 schema +struct<(CAST(0 AS INT) = CAST(false AS INT)):boolean> +-- !query 75 output +true + + +-- !query 76 +SELECT cast(0 as bigint) = false FROM t +-- !query 76 schema +struct<(CAST(0 AS BIGINT) = 
CAST(false AS BIGINT)):boolean> +-- !query 76 output +true + + +-- !query 77 +SELECT cast(0 as float) = false FROM t +-- !query 77 schema +struct<(CAST(0 AS FLOAT) = CAST(false AS FLOAT)):boolean> +-- !query 77 output +true + + +-- !query 78 +SELECT cast(0 as double) = false FROM t +-- !query 78 schema +struct<(CAST(0 AS DOUBLE) = CAST(false AS DOUBLE)):boolean> +-- !query 78 output +true + + +-- !query 79 +SELECT cast(0 as decimal(10, 0)) = false FROM t +-- !query 79 schema +struct<(CAST(0 AS DECIMAL(10,0)) = CAST(false AS DECIMAL(10,0))):boolean> +-- !query 79 output +true + + +-- !query 80 +SELECT cast(0 as string) = false FROM t +-- !query 80 schema +struct<(CAST(CAST(0 AS STRING) AS BOOLEAN) = false):boolean> +-- !query 80 output +true + + +-- !query 81 +SELECT cast('0' as binary) = false FROM t +-- !query 81 schema +struct<> +-- !query 81 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('0' AS BINARY) = false)' due to data type mismatch: differing types in '(CAST('0' AS BINARY) = false)' (binary and boolean).; line 1 pos 7 + + +-- !query 82 +SELECT cast(0 as boolean) = false FROM t +-- !query 82 schema +struct<(CAST(0 AS BOOLEAN) = false):boolean> +-- !query 82 output +true + + +-- !query 83 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = false FROM t +-- !query 83 schema +struct<> +-- !query 83 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = false)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = false)' (timestamp and boolean).; line 1 pos 7 + + +-- !query 84 +SELECT cast('2017-12-11 09:30:00' as date) = false FROM t +-- !query 84 schema +struct<> +-- !query 84 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = false)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = false)' (date and boolean).; line 1 pos 7 + + +-- !query 85 +SELECT cast(0 as tinyint) <=> false FROM t +-- !query 85 schema +struct<(CAST(0 AS TINYINT) <=> CAST(false AS TINYINT)):boolean> +-- !query 85 output +true + + +-- !query 86 +SELECT cast(0 as smallint) <=> false FROM t +-- !query 86 schema +struct<(CAST(0 AS SMALLINT) <=> CAST(false AS SMALLINT)):boolean> +-- !query 86 output +true + + +-- !query 87 +SELECT cast(0 as int) <=> false FROM t +-- !query 87 schema +struct<(CAST(0 AS INT) <=> CAST(false AS INT)):boolean> +-- !query 87 output +true + + +-- !query 88 +SELECT cast(0 as bigint) <=> false FROM t +-- !query 88 schema +struct<(CAST(0 AS BIGINT) <=> CAST(false AS BIGINT)):boolean> +-- !query 88 output +true + + +-- !query 89 +SELECT cast(0 as float) <=> false FROM t +-- !query 89 schema +struct<(CAST(0 AS FLOAT) <=> CAST(false AS FLOAT)):boolean> +-- !query 89 output +true + + +-- !query 90 +SELECT cast(0 as double) <=> false FROM t +-- !query 90 schema +struct<(CAST(0 AS DOUBLE) <=> CAST(false AS DOUBLE)):boolean> +-- !query 90 output +true + + +-- !query 91 +SELECT cast(0 as decimal(10, 0)) <=> false FROM t +-- !query 91 schema +struct<(CAST(0 AS DECIMAL(10,0)) <=> CAST(false AS DECIMAL(10,0))):boolean> +-- !query 91 output +true + + +-- !query 92 +SELECT cast(0 as string) <=> false FROM t +-- !query 92 schema +struct<(CAST(CAST(0 AS STRING) AS BOOLEAN) <=> false):boolean> +-- !query 92 output +true + + +-- !query 93 +SELECT cast('0' as binary) <=> false FROM t +-- !query 93 schema +struct<> +-- !query 93 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('0' AS 
BINARY) <=> false)' due to data type mismatch: differing types in '(CAST('0' AS BINARY) <=> false)' (binary and boolean).; line 1 pos 7 + + +-- !query 94 +SELECT cast(0 as boolean) <=> false FROM t +-- !query 94 schema +struct<(CAST(0 AS BOOLEAN) <=> false):boolean> +-- !query 94 output +true + + +-- !query 95 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> false FROM t +-- !query 95 schema +struct<> +-- !query 95 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> false)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> false)' (timestamp and boolean).; line 1 pos 7 + + +-- !query 96 +SELECT cast('2017-12-11 09:30:00' as date) <=> false FROM t +-- !query 96 schema +struct<> +-- !query 96 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> false)' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> false)' (date and boolean).; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/division.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/division.sql.out new file mode 100644 index 0000000000000..017e0fea30e90 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/division.sql.out @@ -0,0 +1,1242 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 145 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT cast(1 as tinyint) / cast(1 as tinyint) FROM t +-- !query 1 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 1 output +1.0 + + +-- !query 2 +SELECT cast(1 as tinyint) / cast(1 as smallint) FROM t +-- !query 2 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 2 output +1.0 + + +-- !query 3 +SELECT cast(1 as tinyint) / cast(1 as int) FROM t +-- !query 3 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 3 output +1.0 + + +-- !query 4 +SELECT cast(1 as tinyint) / cast(1 as bigint) FROM t +-- !query 4 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 4 output +1.0 + + +-- !query 5 +SELECT cast(1 as tinyint) / cast(1 as float) FROM t +-- !query 5 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 5 output +1.0 + + +-- !query 6 +SELECT cast(1 as tinyint) / cast(1 as double) FROM t +-- !query 6 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> +-- !query 6 output +1.0 + + +-- !query 7 +SELECT cast(1 as tinyint) / cast(1 as decimal(10, 0)) FROM t +-- !query 7 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,11)> +-- !query 7 output +1 + + +-- !query 8 +SELECT cast(1 as tinyint) / cast(1 as string) FROM t +-- !query 8 schema +struct<(CAST(CAST(1 AS TINYINT) AS DOUBLE) / CAST(CAST(CAST(1 AS STRING) AS DOUBLE) AS DOUBLE)):double> +-- !query 8 output +1.0 + + +-- !query 9 +SELECT cast(1 as tinyint) / cast('1' as binary) FROM t +-- !query 9 schema +struct<> +-- !query 9 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) / CAST('1' AS BINARY))' due to data type 
mismatch: differing types in '(CAST(1 AS TINYINT) / CAST('1' AS BINARY))' (tinyint and binary).; line 1 pos 7 + + +-- !query 10 +SELECT cast(1 as tinyint) / cast(1 as boolean) FROM t +-- !query 10 schema +struct<> +-- !query 10 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) / CAST(1 AS BOOLEAN))' (tinyint and boolean).; line 1 pos 7 + + +-- !query 11 +SELECT cast(1 as tinyint) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 11 schema +struct<> +-- !query 11 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (tinyint and timestamp).; line 1 pos 7 + + +-- !query 12 +SELECT cast(1 as tinyint) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 12 schema +struct<> +-- !query 12 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) / CAST('2017-12-11 09:30:00' AS DATE))' (tinyint and date).; line 1 pos 7 + + +-- !query 13 +SELECT cast(1 as smallint) / cast(1 as tinyint) FROM t +-- !query 13 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 13 output +1.0 + + +-- !query 14 +SELECT cast(1 as smallint) / cast(1 as smallint) FROM t +-- !query 14 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 14 output +1.0 + + +-- !query 15 +SELECT cast(1 as smallint) / cast(1 as int) FROM t +-- !query 15 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 15 output +1.0 + + +-- !query 16 +SELECT cast(1 as smallint) / cast(1 as bigint) FROM t +-- !query 16 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 16 output +1.0 + + +-- !query 17 +SELECT cast(1 as smallint) / cast(1 as float) FROM t +-- !query 17 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 17 output +1.0 + + +-- !query 18 +SELECT cast(1 as smallint) / cast(1 as double) FROM t +-- !query 18 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> +-- !query 18 output +1.0 + + +-- !query 19 +SELECT cast(1 as smallint) / cast(1 as decimal(10, 0)) FROM t +-- !query 19 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,11)> +-- !query 19 output +1 + + +-- !query 20 +SELECT cast(1 as smallint) / cast(1 as string) FROM t +-- !query 20 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DOUBLE) / CAST(CAST(CAST(1 AS STRING) AS DOUBLE) AS DOUBLE)):double> +-- !query 20 output +1.0 + + +-- !query 21 +SELECT cast(1 as smallint) / cast('1' as binary) FROM t +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) / CAST('1' AS BINARY))' (smallint and binary).; line 1 pos 7 + + +-- !query 22 +SELECT cast(1 as smallint) / cast(1 as boolean) FROM t +-- !query 22 schema +struct<> +-- !query 22 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) / CAST(1 AS BOOLEAN))' (smallint and boolean).; line 1 pos 7 + + +-- !query 23 +SELECT cast(1 as smallint) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 23 schema +struct<> +-- !query 23 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (smallint and timestamp).; line 1 pos 7 + + +-- !query 24 +SELECT cast(1 as smallint) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 24 schema +struct<> +-- !query 24 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) / CAST('2017-12-11 09:30:00' AS DATE))' (smallint and date).; line 1 pos 7 + + +-- !query 25 +SELECT cast(1 as int) / cast(1 as tinyint) FROM t +-- !query 25 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 25 output +1.0 + + +-- !query 26 +SELECT cast(1 as int) / cast(1 as smallint) FROM t +-- !query 26 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 26 output +1.0 + + +-- !query 27 +SELECT cast(1 as int) / cast(1 as int) FROM t +-- !query 27 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 27 output +1.0 + + +-- !query 28 +SELECT cast(1 as int) / cast(1 as bigint) FROM t +-- !query 28 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 28 output +1.0 + + +-- !query 29 +SELECT cast(1 as int) / cast(1 as float) FROM t +-- !query 29 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 29 output +1.0 + + +-- !query 30 +SELECT cast(1 as int) / cast(1 as double) FROM t +-- !query 30 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> +-- !query 30 output +1.0 + + +-- !query 31 +SELECT cast(1 as int) / cast(1 as decimal(10, 0)) FROM t +-- !query 31 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) / CAST(1 AS DECIMAL(10,0))):decimal(21,11)> +-- !query 31 output +1 + + +-- !query 32 +SELECT cast(1 as int) / cast(1 as string) FROM t +-- !query 32 schema +struct<(CAST(CAST(1 AS INT) AS DOUBLE) / CAST(CAST(CAST(1 AS STRING) AS DOUBLE) AS DOUBLE)):double> +-- !query 32 output +1.0 + + +-- !query 33 +SELECT cast(1 as int) / cast('1' as binary) FROM t +-- !query 33 schema +struct<> +-- !query 33 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS INT) / CAST('1' AS BINARY))' (int and binary).; line 1 pos 7 + + +-- !query 34 +SELECT cast(1 as int) / cast(1 as boolean) FROM t +-- !query 34 schema +struct<> +-- !query 34 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS INT) / CAST(1 AS BOOLEAN))' (int and boolean).; line 1 pos 7 + + +-- !query 35 +SELECT cast(1 as int) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 35 schema +struct<> +-- !query 35 output +org.apache.spark.sql.AnalysisException 
+cannot resolve '(CAST(1 AS INT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS INT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (int and timestamp).; line 1 pos 7 + + +-- !query 36 +SELECT cast(1 as int) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 36 schema +struct<> +-- !query 36 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS INT) / CAST('2017-12-11 09:30:00' AS DATE))' (int and date).; line 1 pos 7 + + +-- !query 37 +SELECT cast(1 as bigint) / cast(1 as tinyint) FROM t +-- !query 37 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 37 output +1.0 + + +-- !query 38 +SELECT cast(1 as bigint) / cast(1 as smallint) FROM t +-- !query 38 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 38 output +1.0 + + +-- !query 39 +SELECT cast(1 as bigint) / cast(1 as int) FROM t +-- !query 39 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 39 output +1.0 + + +-- !query 40 +SELECT cast(1 as bigint) / cast(1 as bigint) FROM t +-- !query 40 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 40 output +1.0 + + +-- !query 41 +SELECT cast(1 as bigint) / cast(1 as float) FROM t +-- !query 41 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 41 output +1.0 + + +-- !query 42 +SELECT cast(1 as bigint) / cast(1 as double) FROM t +-- !query 42 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> +-- !query 42 output +1.0 + + +-- !query 43 +SELECT cast(1 as bigint) / cast(1 as decimal(10, 0)) FROM t +-- !query 43 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,11)> +-- !query 43 output +1 + + +-- !query 44 +SELECT cast(1 as bigint) / cast(1 as string) FROM t +-- !query 44 schema +struct<(CAST(CAST(1 AS BIGINT) AS DOUBLE) / CAST(CAST(CAST(1 AS STRING) AS DOUBLE) AS DOUBLE)):double> +-- !query 44 output +1.0 + + +-- !query 45 +SELECT cast(1 as bigint) / cast('1' as binary) FROM t +-- !query 45 schema +struct<> +-- !query 45 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) / CAST('1' AS BINARY))' (bigint and binary).; line 1 pos 7 + + +-- !query 46 +SELECT cast(1 as bigint) / cast(1 as boolean) FROM t +-- !query 46 schema +struct<> +-- !query 46 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) / CAST(1 AS BOOLEAN))' (bigint and boolean).; line 1 pos 7 + + +-- !query 47 +SELECT cast(1 as bigint) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 47 schema +struct<> +-- !query 47 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (bigint and timestamp).; line 1 pos 7 + + +-- !query 48 +SELECT cast(1 as bigint) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 48 
schema +struct<> +-- !query 48 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) / CAST('2017-12-11 09:30:00' AS DATE))' (bigint and date).; line 1 pos 7 + + +-- !query 49 +SELECT cast(1 as float) / cast(1 as tinyint) FROM t +-- !query 49 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 49 output +1.0 + + +-- !query 50 +SELECT cast(1 as float) / cast(1 as smallint) FROM t +-- !query 50 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 50 output +1.0 + + +-- !query 51 +SELECT cast(1 as float) / cast(1 as int) FROM t +-- !query 51 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 51 output +1.0 + + +-- !query 52 +SELECT cast(1 as float) / cast(1 as bigint) FROM t +-- !query 52 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 52 output +1.0 + + +-- !query 53 +SELECT cast(1 as float) / cast(1 as float) FROM t +-- !query 53 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 53 output +1.0 + + +-- !query 54 +SELECT cast(1 as float) / cast(1 as double) FROM t +-- !query 54 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(1 AS DOUBLE) AS DOUBLE)):double> +-- !query 54 output +1.0 + + +-- !query 55 +SELECT cast(1 as float) / cast(1 as decimal(10, 0)) FROM t +-- !query 55 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) AS DOUBLE)):double> +-- !query 55 output +1.0 + + +-- !query 56 +SELECT cast(1 as float) / cast(1 as string) FROM t +-- !query 56 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS STRING) AS DOUBLE) AS DOUBLE)):double> +-- !query 56 output +1.0 + + +-- !query 57 +SELECT cast(1 as float) / cast('1' as binary) FROM t +-- !query 57 schema +struct<> +-- !query 57 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) / CAST('1' AS BINARY))' (float and binary).; line 1 pos 7 + + +-- !query 58 +SELECT cast(1 as float) / cast(1 as boolean) FROM t +-- !query 58 schema +struct<> +-- !query 58 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) / CAST(1 AS BOOLEAN))' (float and boolean).; line 1 pos 7 + + +-- !query 59 +SELECT cast(1 as float) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 59 schema +struct<> +-- !query 59 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (float and timestamp).; line 1 pos 7 + + +-- !query 60 +SELECT cast(1 as float) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 60 schema +struct<> +-- !query 60 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) / CAST('2017-12-11 09:30:00' AS DATE))' (float and date).; line 1 pos 7 + + +-- !query 61 +SELECT cast(1 as double) / cast(1 as tinyint) FROM t +-- !query 
61 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 61 output +1.0 + + +-- !query 62 +SELECT cast(1 as double) / cast(1 as smallint) FROM t +-- !query 62 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 62 output +1.0 + + +-- !query 63 +SELECT cast(1 as double) / cast(1 as int) FROM t +-- !query 63 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 63 output +1.0 + + +-- !query 64 +SELECT cast(1 as double) / cast(1 as bigint) FROM t +-- !query 64 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 64 output +1.0 + + +-- !query 65 +SELECT cast(1 as double) / cast(1 as float) FROM t +-- !query 65 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 65 output +1.0 + + +-- !query 66 +SELECT cast(1 as double) / cast(1 as double) FROM t +-- !query 66 schema +struct<(CAST(1 AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 66 output +1.0 + + +-- !query 67 +SELECT cast(1 as double) / cast(1 as decimal(10, 0)) FROM t +-- !query 67 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 67 output +1.0 + + +-- !query 68 +SELECT cast(1 as double) / cast(1 as string) FROM t +-- !query 68 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 68 output +1.0 + + +-- !query 69 +SELECT cast(1 as double) / cast('1' as binary) FROM t +-- !query 69 schema +struct<> +-- !query 69 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) / CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 + + +-- !query 70 +SELECT cast(1 as double) / cast(1 as boolean) FROM t +-- !query 70 schema +struct<> +-- !query 70 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) / CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 + + +-- !query 71 +SELECT cast(1 as double) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 71 schema +struct<> +-- !query 71 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 + + +-- !query 72 +SELECT cast(1 as double) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 72 schema +struct<> +-- !query 72 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 + + +-- !query 73 +SELECT cast(1 as decimal(10, 0)) / cast(1 as tinyint) FROM t +-- !query 73 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(16,6)> +-- !query 73 output +1 + + +-- !query 74 +SELECT cast(1 as decimal(10, 0)) / cast(1 as smallint) FROM t +-- !query 74 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,6)> +-- !query 74 output +1 + + +-- !query 75 +SELECT cast(1 as decimal(10, 0)) / cast(1 as int) FROM 
t +-- !query 75 schema +struct<(CAST(1 AS DECIMAL(10,0)) / CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(21,11)> +-- !query 75 output +1 + + +-- !query 76 +SELECT cast(1 as decimal(10, 0)) / cast(1 as bigint) FROM t +-- !query 76 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,21)> +-- !query 76 output +1 + + +-- !query 77 +SELECT cast(1 as decimal(10, 0)) / cast(1 as float) FROM t +-- !query 77 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 77 output +1.0 + + +-- !query 78 +SELECT cast(1 as decimal(10, 0)) / cast(1 as double) FROM t +-- !query 78 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 78 output +1.0 + + +-- !query 79 +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(10, 0)) FROM t +-- !query 79 schema +struct<(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS DECIMAL(10,0))):decimal(21,11)> +-- !query 79 output +1 + + +-- !query 80 +SELECT cast(1 as decimal(10, 0)) / cast(1 as string) FROM t +-- !query 80 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 80 output +1.0 + + +-- !query 81 +SELECT cast(1 as decimal(10, 0)) / cast('1' as binary) FROM t +-- !query 81 schema +struct<> +-- !query 81 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 82 +SELECT cast(1 as decimal(10, 0)) / cast(1 as boolean) FROM t +-- !query 82 schema +struct<> +-- !query 82 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 83 +SELECT cast(1 as decimal(10, 0)) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 83 schema +struct<> +-- !query 83 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 84 +SELECT cast(1 as decimal(10, 0)) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 84 schema +struct<> +-- !query 84 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 85 +SELECT cast(1 as string) / cast(1 as tinyint) FROM t +-- !query 85 schema +struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS TINYINT) AS DOUBLE)):double> +-- !query 85 output +1.0 + + +-- !query 86 +SELECT cast(1 as string) / cast(1 as smallint) FROM t +-- !query 86 schema +struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS SMALLINT) AS DOUBLE)):double> +-- !query 86 output +1.0 + + +-- !query 87 +SELECT cast(1 as string) / cast(1 as int) FROM t +-- !query 87 schema +struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS INT) AS DOUBLE)):double> +-- !query 87 output +1.0 + + +-- !query 88 +SELECT cast(1 as string) / 
cast(1 as bigint) FROM t +-- !query 88 schema +struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS BIGINT) AS DOUBLE)):double> +-- !query 88 output +1.0 + + +-- !query 89 +SELECT cast(1 as string) / cast(1 as float) FROM t +-- !query 89 schema +struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 89 output +1.0 + + +-- !query 90 +SELECT cast(1 as string) / cast(1 as double) FROM t +-- !query 90 schema +struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 90 output +1.0 + + +-- !query 91 +SELECT cast(1 as string) / cast(1 as decimal(10, 0)) FROM t +-- !query 91 schema +struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 91 output +1.0 + + +-- !query 92 +SELECT cast(1 as string) / cast(1 as string) FROM t +-- !query 92 schema +struct<(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 92 output +1.0 + + +-- !query 93 +SELECT cast(1 as string) / cast('1' as binary) FROM t +-- !query 93 schema +struct<> +-- !query 93 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('1' AS BINARY))' (double and binary).; line 1 pos 7 + + +-- !query 94 +SELECT cast(1 as string) / cast(1 as boolean) FROM t +-- !query 94 schema +struct<> +-- !query 94 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST(1 AS BOOLEAN))' (double and boolean).; line 1 pos 7 + + +-- !query 95 +SELECT cast(1 as string) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 95 schema +struct<> +-- !query 95 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (double and timestamp).; line 1 pos 7 + + +-- !query 96 +SELECT cast(1 as string) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 96 schema +struct<> +-- !query 96 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(CAST(1 AS STRING) AS DOUBLE) / CAST('2017-12-11 09:30:00' AS DATE))' (double and date).; line 1 pos 7 + + +-- !query 97 +SELECT cast('1' as binary) / cast(1 as tinyint) FROM t +-- !query 97 schema +struct<> +-- !query 97 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS TINYINT))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS TINYINT))' (binary and tinyint).; line 1 pos 7 + + +-- !query 98 +SELECT cast('1' as binary) / cast(1 as smallint) FROM t +-- !query 98 schema +struct<> +-- !query 98 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS SMALLINT))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS SMALLINT))' (binary and smallint).; line 1 pos 7 + + +-- !query 99 +SELECT cast('1' as binary) / cast(1 as int) FROM t +-- !query 99 schema +struct<> +-- !query 99 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS 
INT))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS INT))' (binary and int).; line 1 pos 7 + + +-- !query 100 +SELECT cast('1' as binary) / cast(1 as bigint) FROM t +-- !query 100 schema +struct<> +-- !query 100 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS BIGINT))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS BIGINT))' (binary and bigint).; line 1 pos 7 + + +-- !query 101 +SELECT cast('1' as binary) / cast(1 as float) FROM t +-- !query 101 schema +struct<> +-- !query 101 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS FLOAT))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS FLOAT))' (binary and float).; line 1 pos 7 + + +-- !query 102 +SELECT cast('1' as binary) / cast(1 as double) FROM t +-- !query 102 schema +struct<> +-- !query 102 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DOUBLE))' (binary and double).; line 1 pos 7 + + +-- !query 103 +SELECT cast('1' as binary) / cast(1 as decimal(10, 0)) FROM t +-- !query 103 schema +struct<> +-- !query 103 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 104 +SELECT cast('1' as binary) / cast(1 as string) FROM t +-- !query 104 schema +struct<> +-- !query 104 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(CAST(1 AS STRING) AS DOUBLE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(CAST(1 AS STRING) AS DOUBLE))' (binary and double).; line 1 pos 7 + + +-- !query 105 +SELECT cast('1' as binary) / cast('1' as binary) FROM t +-- !query 105 schema +struct<> +-- !query 105 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST('1' AS BINARY))' due to data type mismatch: '(CAST('1' AS BINARY) / CAST('1' AS BINARY))' requires (double or decimal) type, not binary; line 1 pos 7 + + +-- !query 106 +SELECT cast('1' as binary) / cast(1 as boolean) FROM t +-- !query 106 schema +struct<> +-- !query 106 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS BOOLEAN))' (binary and boolean).; line 1 pos 7 + + +-- !query 107 +SELECT cast('1' as binary) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 107 schema +struct<> +-- !query 107 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (binary and timestamp).; line 1 pos 7 + + +-- !query 108 +SELECT cast('1' as binary) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 108 schema +struct<> +-- !query 108 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST('2017-12-11 09:30:00' AS DATE))' (binary and date).; line 1 pos 7 + + +-- !query 109 +SELECT cast(1 as boolean) / 
cast(1 as tinyint) FROM t +-- !query 109 schema +struct<> +-- !query 109 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS TINYINT))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS TINYINT))' (boolean and tinyint).; line 1 pos 7 + + +-- !query 110 +SELECT cast(1 as boolean) / cast(1 as smallint) FROM t +-- !query 110 schema +struct<> +-- !query 110 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS SMALLINT))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS SMALLINT))' (boolean and smallint).; line 1 pos 7 + + +-- !query 111 +SELECT cast(1 as boolean) / cast(1 as int) FROM t +-- !query 111 schema +struct<> +-- !query 111 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS INT))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS INT))' (boolean and int).; line 1 pos 7 + + +-- !query 112 +SELECT cast(1 as boolean) / cast(1 as bigint) FROM t +-- !query 112 schema +struct<> +-- !query 112 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS BIGINT))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS BIGINT))' (boolean and bigint).; line 1 pos 7 + + +-- !query 113 +SELECT cast(1 as boolean) / cast(1 as float) FROM t +-- !query 113 schema +struct<> +-- !query 113 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS FLOAT))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS FLOAT))' (boolean and float).; line 1 pos 7 + + +-- !query 114 +SELECT cast(1 as boolean) / cast(1 as double) FROM t +-- !query 114 schema +struct<> +-- !query 114 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS DOUBLE))' (boolean and double).; line 1 pos 7 + + +-- !query 115 +SELECT cast(1 as boolean) / cast(1 as decimal(10, 0)) FROM t +-- !query 115 schema +struct<> +-- !query 115 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(1 AS DECIMAL(10,0)))' (boolean and decimal(10,0)).; line 1 pos 7 + + +-- !query 116 +SELECT cast(1 as boolean) / cast(1 as string) FROM t +-- !query 116 schema +struct<> +-- !query 116 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST(CAST(1 AS STRING) AS DOUBLE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST(CAST(1 AS STRING) AS DOUBLE))' (boolean and double).; line 1 pos 7 + + +-- !query 117 +SELECT cast(1 as boolean) / cast('1' as binary) FROM t +-- !query 117 schema +struct<> +-- !query 117 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST('1' AS BINARY))' (boolean and binary).; line 1 pos 7 + + +-- !query 118 +SELECT cast(1 as boolean) / cast(1 as boolean) FROM t +-- !query 118 schema +struct<> +-- !query 118 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST(1 AS BOOLEAN))' due to data type mismatch: '(CAST(1 AS BOOLEAN) / CAST(1 AS BOOLEAN))' requires (double or decimal) type, not boolean; line 1 pos 7 + + +-- !query 119 
+SELECT cast(1 as boolean) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 119 schema +struct<> +-- !query 119 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (boolean and timestamp).; line 1 pos 7 + + +-- !query 120 +SELECT cast(1 as boolean) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 120 schema +struct<> +-- !query 120 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) / CAST('2017-12-11 09:30:00' AS DATE))' (boolean and date).; line 1 pos 7 + + +-- !query 121 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as tinyint) FROM t +-- !query 121 schema +struct<> +-- !query 121 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS TINYINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS TINYINT))' (timestamp and tinyint).; line 1 pos 7 + + +-- !query 122 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as smallint) FROM t +-- !query 122 schema +struct<> +-- !query 122 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS SMALLINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS SMALLINT))' (timestamp and smallint).; line 1 pos 7 + + +-- !query 123 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as int) FROM t +-- !query 123 schema +struct<> +-- !query 123 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS INT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS INT))' (timestamp and int).; line 1 pos 7 + + +-- !query 124 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as bigint) FROM t +-- !query 124 schema +struct<> +-- !query 124 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS BIGINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS BIGINT))' (timestamp and bigint).; line 1 pos 7 + + +-- !query 125 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as float) FROM t +-- !query 125 schema +struct<> +-- !query 125 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS FLOAT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS FLOAT))' (timestamp and float).; line 1 pos 7 + + +-- !query 126 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as double) FROM t +-- !query 126 schema +struct<> +-- !query 126 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DOUBLE))' (timestamp and double).; line 1 pos 7 + + +-- !query 127 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as decimal(10, 0)) FROM t +-- !query 127 schema +struct<> +-- !query 127 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 128 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as string) FROM t +-- !query 128 schema +struct<> +-- !query 128 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(CAST(1 AS STRING) AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(CAST(1 AS STRING) AS DOUBLE))' (timestamp and double).; line 1 pos 7 + + +-- !query 129 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast('1' as binary) FROM t +-- !query 129 schema +struct<> +-- !query 129 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('1' AS BINARY))' (timestamp and binary).; line 1 pos 7 + + +-- !query 130 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast(1 as boolean) FROM t +-- !query 130 schema +struct<> +-- !query 130 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS BOOLEAN))' (timestamp and boolean).; line 1 pos 7 + + +-- !query 131 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 131 schema +struct<> +-- !query 131 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' requires (double or decimal) type, not timestamp; line 1 pos 7 + + +-- !query 132 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 132 schema +struct<> +-- !query 132 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) / CAST('2017-12-11 09:30:00' AS DATE))' (timestamp and date).; line 1 pos 7 + + +-- !query 133 +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as tinyint) FROM t +-- !query 133 schema +struct<> +-- !query 133 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS TINYINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS TINYINT))' (date and tinyint).; line 1 pos 7 + + +-- !query 134 +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as smallint) FROM t +-- !query 134 schema +struct<> +-- !query 134 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS SMALLINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS SMALLINT))' (date and smallint).; line 1 pos 7 + + +-- !query 135 +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as int) FROM t +-- !query 135 schema +struct<> +-- !query 135 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS INT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS INT))' (date and int).; line 1 pos 7 + + +-- !query 136 +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as bigint) FROM t +-- !query 136 schema +struct<> +-- !query 136 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS BIGINT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS BIGINT))' (date and bigint).; line 1 pos 7 + + +-- !query 137 +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as float) FROM t +-- !query 137 schema +struct<> +-- !query 137 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS FLOAT))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS FLOAT))' (date and float).; line 1 pos 7 + + +-- !query 138 +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as double) FROM t +-- !query 138 schema +struct<> +-- !query 138 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS DOUBLE))' (date and double).; line 1 pos 7 + + +-- !query 139 +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as decimal(10, 0)) FROM t +-- !query 139 schema +struct<> +-- !query 139 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 140 +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as string) FROM t +-- !query 140 schema +struct<> +-- !query 140 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(CAST(1 AS STRING) AS DOUBLE))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(CAST(1 AS STRING) AS DOUBLE))' (date and double).; line 1 pos 7 + + +-- !query 141 +SELECT cast('2017-12-11 09:30:00' as date) / cast('1' as binary) FROM t +-- !query 141 schema +struct<> +-- !query 141 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('1' AS BINARY))' (date and binary).; line 1 pos 7 + + +-- !query 142 +SELECT cast('2017-12-11 09:30:00' as date) / cast(1 as boolean) FROM t +-- !query 142 schema +struct<> +-- !query 142 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST(1 AS BOOLEAN))' (date and boolean).; line 1 pos 7 + + +-- !query 143 +SELECT cast('2017-12-11 09:30:00' as date) / cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 143 schema +struct<> +-- !query 143 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (date 
and timestamp).; line 1 pos 7 + + +-- !query 144 +SELECT cast('2017-12-11 09:30:00' as date) / cast('2017-12-11 09:30:00' as date) FROM t +-- !query 144 schema +struct<> +-- !query 144 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: '(CAST('2017-12-11 09:30:00' AS DATE) / CAST('2017-12-11 09:30:00' AS DATE))' requires (double or decimal) type, not date; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/widenSetOperationTypes.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/widenSetOperationTypes.sql.out new file mode 100644 index 0000000000000..20a9e47217238 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/widenSetOperationTypes.sql.out @@ -0,0 +1,1305 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 145 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 1 schema +struct +-- !query 1 output +1 +2 + + +-- !query 2 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 2 schema +struct +-- !query 2 output +1 +2 + + +-- !query 3 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 3 schema +struct +-- !query 3 output +1 +2 + + +-- !query 4 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 4 schema +struct +-- !query 4 output +1 +2 + + +-- !query 5 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 5 schema +struct +-- !query 5 output +1.0 +2.0 + + +-- !query 6 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 6 schema +struct +-- !query 6 output +1.0 +2.0 + + +-- !query 7 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 7 schema +struct +-- !query 7 output +1 +2 + + +-- !query 8 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 8 schema +struct +-- !query 8 output +1 +2 + + +-- !query 9 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 9 schema +struct<> +-- !query 9 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> tinyint at the first column of the second table; + + +-- !query 10 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 10 schema +struct<> +-- !query 10 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. boolean <> tinyint at the first column of the second table; + + +-- !query 11 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 11 schema +struct<> +-- !query 11 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. timestamp <> tinyint at the first column of the second table; + + +-- !query 12 +SELECT cast(1 as tinyint) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 12 schema +struct<> +-- !query 12 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
date <> tinyint at the first column of the second table; + + +-- !query 13 +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 13 schema +struct +-- !query 13 output +1 +2 + + +-- !query 14 +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 14 schema +struct +-- !query 14 output +1 +2 + + +-- !query 15 +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 15 schema +struct +-- !query 15 output +1 +2 + + +-- !query 16 +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 16 schema +struct +-- !query 16 output +1 +2 + + +-- !query 17 +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 17 schema +struct +-- !query 17 output +1.0 +2.0 + + +-- !query 18 +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 18 schema +struct +-- !query 18 output +1.0 +2.0 + + +-- !query 19 +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 19 schema +struct +-- !query 19 output +1 +2 + + +-- !query 20 +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 20 schema +struct +-- !query 20 output +1 +2 + + +-- !query 21 +SELECT cast(1 as smallint) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> smallint at the first column of the second table; + + +-- !query 22 +SELECT cast(1 as smallint) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 22 schema +struct<> +-- !query 22 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. boolean <> smallint at the first column of the second table; + + +-- !query 23 +SELECT cast(1 as smallint) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 23 schema +struct<> +-- !query 23 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. timestamp <> smallint at the first column of the second table; + + +-- !query 24 +SELECT cast(1 as smallint) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 24 schema +struct<> +-- !query 24 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
date <> smallint at the first column of the second table; + + +-- !query 25 +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 25 schema +struct +-- !query 25 output +1 +2 + + +-- !query 26 +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 26 schema +struct +-- !query 26 output +1 +2 + + +-- !query 27 +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 27 schema +struct +-- !query 27 output +1 +2 + + +-- !query 28 +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 28 schema +struct +-- !query 28 output +1 +2 + + +-- !query 29 +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 29 schema +struct +-- !query 29 output +1.0 +2.0 + + +-- !query 30 +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 30 schema +struct +-- !query 30 output +1.0 +2.0 + + +-- !query 31 +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 31 schema +struct +-- !query 31 output +1 +2 + + +-- !query 32 +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 32 schema +struct +-- !query 32 output +1 +2 + + +-- !query 33 +SELECT cast(1 as int) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 33 schema +struct<> +-- !query 33 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> int at the first column of the second table; + + +-- !query 34 +SELECT cast(1 as int) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 34 schema +struct<> +-- !query 34 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. boolean <> int at the first column of the second table; + + +-- !query 35 +SELECT cast(1 as int) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 35 schema +struct<> +-- !query 35 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. timestamp <> int at the first column of the second table; + + +-- !query 36 +SELECT cast(1 as int) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 36 schema +struct<> +-- !query 36 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
date <> int at the first column of the second table; + + +-- !query 37 +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 37 schema +struct +-- !query 37 output +1 +2 + + +-- !query 38 +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 38 schema +struct +-- !query 38 output +1 +2 + + +-- !query 39 +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 39 schema +struct +-- !query 39 output +1 +2 + + +-- !query 40 +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 40 schema +struct +-- !query 40 output +1 +2 + + +-- !query 41 +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 41 schema +struct +-- !query 41 output +1.0 +2.0 + + +-- !query 42 +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 42 schema +struct +-- !query 42 output +1.0 +2.0 + + +-- !query 43 +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 43 schema +struct +-- !query 43 output +1 +2 + + +-- !query 44 +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 44 schema +struct +-- !query 44 output +1 +2 + + +-- !query 45 +SELECT cast(1 as bigint) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 45 schema +struct<> +-- !query 45 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> bigint at the first column of the second table; + + +-- !query 46 +SELECT cast(1 as bigint) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 46 schema +struct<> +-- !query 46 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. boolean <> bigint at the first column of the second table; + + +-- !query 47 +SELECT cast(1 as bigint) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 47 schema +struct<> +-- !query 47 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. timestamp <> bigint at the first column of the second table; + + +-- !query 48 +SELECT cast(1 as bigint) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 48 schema +struct<> +-- !query 48 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
date <> bigint at the first column of the second table; + + +-- !query 49 +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 49 schema +struct +-- !query 49 output +1.0 +2.0 + + +-- !query 50 +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 50 schema +struct +-- !query 50 output +1.0 +2.0 + + +-- !query 51 +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 51 schema +struct +-- !query 51 output +1.0 +2.0 + + +-- !query 52 +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 52 schema +struct +-- !query 52 output +1.0 +2.0 + + +-- !query 53 +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 53 schema +struct +-- !query 53 output +1.0 +2.0 + + +-- !query 54 +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 54 schema +struct +-- !query 54 output +1.0 +2.0 + + +-- !query 55 +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 55 schema +struct +-- !query 55 output +1.0 +2.0 + + +-- !query 56 +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 56 schema +struct +-- !query 56 output +1.0 +2 + + +-- !query 57 +SELECT cast(1 as float) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 57 schema +struct<> +-- !query 57 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> float at the first column of the second table; + + +-- !query 58 +SELECT cast(1 as float) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 58 schema +struct<> +-- !query 58 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. boolean <> float at the first column of the second table; + + +-- !query 59 +SELECT cast(1 as float) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 59 schema +struct<> +-- !query 59 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. timestamp <> float at the first column of the second table; + + +-- !query 60 +SELECT cast(1 as float) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 60 schema +struct<> +-- !query 60 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
date <> float at the first column of the second table; + + +-- !query 61 +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 61 schema +struct +-- !query 61 output +1.0 +2.0 + + +-- !query 62 +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 62 schema +struct +-- !query 62 output +1.0 +2.0 + + +-- !query 63 +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 63 schema +struct +-- !query 63 output +1.0 +2.0 + + +-- !query 64 +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 64 schema +struct +-- !query 64 output +1.0 +2.0 + + +-- !query 65 +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 65 schema +struct +-- !query 65 output +1.0 +2.0 + + +-- !query 66 +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 66 schema +struct +-- !query 66 output +1.0 +2.0 + + +-- !query 67 +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 67 schema +struct +-- !query 67 output +1.0 +2.0 + + +-- !query 68 +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 68 schema +struct +-- !query 68 output +1.0 +2 + + +-- !query 69 +SELECT cast(1 as double) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 69 schema +struct<> +-- !query 69 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> double at the first column of the second table; + + +-- !query 70 +SELECT cast(1 as double) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 70 schema +struct<> +-- !query 70 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. boolean <> double at the first column of the second table; + + +-- !query 71 +SELECT cast(1 as double) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 71 schema +struct<> +-- !query 71 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. timestamp <> double at the first column of the second table; + + +-- !query 72 +SELECT cast(1 as double) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 72 schema +struct<> +-- !query 72 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
date <> double at the first column of the second table; + + +-- !query 73 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 73 schema +struct +-- !query 73 output +1 +2 + + +-- !query 74 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 74 schema +struct +-- !query 74 output +1 +2 + + +-- !query 75 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 75 schema +struct +-- !query 75 output +1 +2 + + +-- !query 76 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 76 schema +struct +-- !query 76 output +1 +2 + + +-- !query 77 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 77 schema +struct +-- !query 77 output +1.0 +2.0 + + +-- !query 78 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 78 schema +struct +-- !query 78 output +1.0 +2.0 + + +-- !query 79 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 79 schema +struct +-- !query 79 output +1 +2 + + +-- !query 80 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 80 schema +struct +-- !query 80 output +1 +2 + + +-- !query 81 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 81 schema +struct<> +-- !query 81 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> decimal(10,0) at the first column of the second table; + + +-- !query 82 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 82 schema +struct<> +-- !query 82 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. boolean <> decimal(10,0) at the first column of the second table; + + +-- !query 83 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 83 schema +struct<> +-- !query 83 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. timestamp <> decimal(10,0) at the first column of the second table; + + +-- !query 84 +SELECT cast(1 as decimal(10, 0)) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 84 schema +struct<> +-- !query 84 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
date <> decimal(10,0) at the first column of the second table; + + +-- !query 85 +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 85 schema +struct +-- !query 85 output +1 +2 + + +-- !query 86 +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 86 schema +struct +-- !query 86 output +1 +2 + + +-- !query 87 +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 87 schema +struct +-- !query 87 output +1 +2 + + +-- !query 88 +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 88 schema +struct +-- !query 88 output +1 +2 + + +-- !query 89 +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 89 schema +struct +-- !query 89 output +1 +2.0 + + +-- !query 90 +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 90 schema +struct +-- !query 90 output +1 +2.0 + + +-- !query 91 +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 91 schema +struct +-- !query 91 output +1 +2 + + +-- !query 92 +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 92 schema +struct +-- !query 92 output +1 +2 + + +-- !query 93 +SELECT cast(1 as string) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 93 schema +struct<> +-- !query 93 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> string at the first column of the second table; + + +-- !query 94 +SELECT cast(1 as string) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 94 schema +struct<> +-- !query 94 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. boolean <> string at the first column of the second table; + + +-- !query 95 +SELECT cast(1 as string) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 95 schema +struct +-- !query 95 output +1 +2017-12-11 09:30:00 + + +-- !query 96 +SELECT cast(1 as string) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 96 schema +struct +-- !query 96 output +1 +2017-12-11 + + +-- !query 97 +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 97 schema +struct<> +-- !query 97 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. tinyint <> binary at the first column of the second table; + + +-- !query 98 +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 98 schema +struct<> +-- !query 98 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. smallint <> binary at the first column of the second table; + + +-- !query 99 +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 99 schema +struct<> +-- !query 99 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. int <> binary at the first column of the second table; + + +-- !query 100 +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 100 schema +struct<> +-- !query 100 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
bigint <> binary at the first column of the second table; + + +-- !query 101 +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 101 schema +struct<> +-- !query 101 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. float <> binary at the first column of the second table; + + +-- !query 102 +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 102 schema +struct<> +-- !query 102 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. double <> binary at the first column of the second table; + + +-- !query 103 +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 103 schema +struct<> +-- !query 103 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. decimal(10,0) <> binary at the first column of the second table; + + +-- !query 104 +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 104 schema +struct<> +-- !query 104 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. string <> binary at the first column of the second table; + + +-- !query 105 +SELECT cast('1' as binary) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 105 schema +struct +-- !query 105 output +1 +2 + + +-- !query 106 +SELECT cast('1' as binary) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 106 schema +struct<> +-- !query 106 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. boolean <> binary at the first column of the second table; + + +-- !query 107 +SELECT cast('1' as binary) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 107 schema +struct<> +-- !query 107 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. timestamp <> binary at the first column of the second table; + + +-- !query 108 +SELECT cast('1' as binary) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 108 schema +struct<> +-- !query 108 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. date <> binary at the first column of the second table; + + +-- !query 109 +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 109 schema +struct<> +-- !query 109 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. tinyint <> boolean at the first column of the second table; + + +-- !query 110 +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 110 schema +struct<> +-- !query 110 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. smallint <> boolean at the first column of the second table; + + +-- !query 111 +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 111 schema +struct<> +-- !query 111 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
int <> boolean at the first column of the second table; + + +-- !query 112 +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 112 schema +struct<> +-- !query 112 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. bigint <> boolean at the first column of the second table; + + +-- !query 113 +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 113 schema +struct<> +-- !query 113 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. float <> boolean at the first column of the second table; + + +-- !query 114 +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 114 schema +struct<> +-- !query 114 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. double <> boolean at the first column of the second table; + + +-- !query 115 +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 115 schema +struct<> +-- !query 115 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. decimal(10,0) <> boolean at the first column of the second table; + + +-- !query 116 +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 116 schema +struct<> +-- !query 116 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. string <> boolean at the first column of the second table; + + +-- !query 117 +SELECT cast(1 as boolean) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 117 schema +struct<> +-- !query 117 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> boolean at the first column of the second table; + + +-- !query 118 +SELECT cast(1 as boolean) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 118 schema +struct +-- !query 118 output +true + + +-- !query 119 +SELECT cast(1 as boolean) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 119 schema +struct<> +-- !query 119 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. timestamp <> boolean at the first column of the second table; + + +-- !query 120 +SELECT cast(1 as boolean) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 120 schema +struct<> +-- !query 120 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. date <> boolean at the first column of the second table; + + +-- !query 121 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 121 schema +struct<> +-- !query 121 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. tinyint <> timestamp at the first column of the second table; + + +-- !query 122 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 122 schema +struct<> +-- !query 122 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
smallint <> timestamp at the first column of the second table; + + +-- !query 123 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 123 schema +struct<> +-- !query 123 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. int <> timestamp at the first column of the second table; + + +-- !query 124 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 124 schema +struct<> +-- !query 124 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. bigint <> timestamp at the first column of the second table; + + +-- !query 125 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 125 schema +struct<> +-- !query 125 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. float <> timestamp at the first column of the second table; + + +-- !query 126 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 126 schema +struct<> +-- !query 126 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. double <> timestamp at the first column of the second table; + + +-- !query 127 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 127 schema +struct<> +-- !query 127 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. decimal(10,0) <> timestamp at the first column of the second table; + + +-- !query 128 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 128 schema +struct +-- !query 128 output +2 +2017-12-12 09:30:00 + + +-- !query 129 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 129 schema +struct<> +-- !query 129 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> timestamp at the first column of the second table; + + +-- !query 130 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 130 schema +struct<> +-- !query 130 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. boolean <> timestamp at the first column of the second table; + + +-- !query 131 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 131 schema +struct +-- !query 131 output +2017-12-11 09:30:00 +2017-12-12 09:30:00 + + +-- !query 132 +SELECT cast('2017-12-12 09:30:00.0' as timestamp) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 132 schema +struct +-- !query 132 output +2017-12-11 00:00:00 +2017-12-12 09:30:00 + + +-- !query 133 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as tinyint) FROM t +-- !query 133 schema +struct<> +-- !query 133 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
tinyint <> date at the first column of the second table; + + +-- !query 134 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as smallint) FROM t +-- !query 134 schema +struct<> +-- !query 134 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. smallint <> date at the first column of the second table; + + +-- !query 135 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as int) FROM t +-- !query 135 schema +struct<> +-- !query 135 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. int <> date at the first column of the second table; + + +-- !query 136 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as bigint) FROM t +-- !query 136 schema +struct<> +-- !query 136 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. bigint <> date at the first column of the second table; + + +-- !query 137 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as float) FROM t +-- !query 137 schema +struct<> +-- !query 137 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. float <> date at the first column of the second table; + + +-- !query 138 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as double) FROM t +-- !query 138 schema +struct<> +-- !query 138 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. double <> date at the first column of the second table; + + +-- !query 139 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as decimal(10, 0)) FROM t +-- !query 139 schema +struct<> +-- !query 139 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. decimal(10,0) <> date at the first column of the second table; + + +-- !query 140 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as string) FROM t +-- !query 140 schema +struct +-- !query 140 output +2 +2017-12-12 + + +-- !query 141 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast('2' as binary) FROM t +-- !query 141 schema +struct<> +-- !query 141 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. binary <> date at the first column of the second table; + + +-- !query 142 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast(2 as boolean) FROM t +-- !query 142 schema +struct<> +-- !query 142 output +org.apache.spark.sql.AnalysisException +Union can only be performed on tables with the compatible column types. 
boolean <> date at the first column of the second table; + + +-- !query 143 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 143 schema +struct +-- !query 143 output +2017-12-11 09:30:00 +2017-12-12 00:00:00 + + +-- !query 144 +SELECT cast('2017-12-12 09:30:00' as date) FROM t UNION SELECT cast('2017-12-11 09:30:00' as date) FROM t +-- !query 144 schema +struct +-- !query 144 output +2017-12-11 +2017-12-12 From 3a7494dfee714510f6a62d5533023f3e0d5ccdcd Mon Sep 17 00:00:00 2001 From: Sital Kedia Date: Wed, 20 Dec 2017 12:21:00 +0800 Subject: [PATCH 149/356] [SPARK-22827][CORE] Avoid throwing OutOfMemoryError in case of exception in spill ## What changes were proposed in this pull request? Currently, the task memory manager throws an OutOfMemoryError when an IO exception happens in spill() - https://github.com/apache/spark/blob/master/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java#L194. Similarly, there are many other places in the code where, if a task is not able to acquire memory due to an exception, we throw an OutOfMemoryError, which kills the entire executor and hence fails all the tasks running on that executor instead of failing just that single task. ## How was this patch tested? Unit tests Author: Sital Kedia Closes #20014 from sitalkedia/skedia/upstream_SPARK-22827. --- .../apache/spark/memory/MemoryConsumer.java | 4 +-- .../spark/memory/SparkOutOfMemoryError.java | 36 +++++++++++++++++++ .../spark/memory/TaskMemoryManager.java | 4 +-- .../shuffle/sort/ShuffleExternalSorter.java | 3 +- .../unsafe/sort/UnsafeExternalSorter.java | 3 +- .../unsafe/sort/UnsafeInMemorySorter.java | 3 +- .../org/apache/spark/executor/Executor.scala | 5 ++- .../TungstenAggregationIterator.scala | 3 +- 8 files changed, 50 insertions(+), 11 deletions(-) create mode 100644 core/src/main/java/org/apache/spark/memory/SparkOutOfMemoryError.java diff --git a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java index 2dff241900e82..a7bd4b3799a25 100644 --- a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java +++ b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java @@ -88,7 +88,7 @@ public void spill() throws IOException { * `LongArray` is too large to fit in a single page. The caller side should take care of these * two exceptions, or make sure the `size` is small enough that won't trigger exceptions. * - * @throws OutOfMemoryError + * @throws SparkOutOfMemoryError * @throws TooLargePageException */ public LongArray allocateArray(long size) { @@ -154,6 +154,6 @@ private void throwOom(final MemoryBlock page, final long required) { taskMemoryManager.freePage(page, this); } taskMemoryManager.showMemoryUsage(); - throw new OutOfMemoryError("Unable to acquire " + required + " bytes of memory, got " + got); + throw new SparkOutOfMemoryError("Unable to acquire " + required + " bytes of memory, got " + got); } } diff --git a/core/src/main/java/org/apache/spark/memory/SparkOutOfMemoryError.java b/core/src/main/java/org/apache/spark/memory/SparkOutOfMemoryError.java new file mode 100644 index 0000000000000..ca00ca58e9713 --- /dev/null +++ b/core/src/main/java/org/apache/spark/memory/SparkOutOfMemoryError.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.memory; + +import org.apache.spark.annotation.Private; + +/** + * This exception is thrown when a task can not acquire memory from the Memory manager. + * Instead of throwing {@link OutOfMemoryError}, which kills the executor, + * we should use throw this exception, which just kills the current task. + */ +@Private +public final class SparkOutOfMemoryError extends OutOfMemoryError { + + public SparkOutOfMemoryError(String s) { + super(s); + } + + public SparkOutOfMemoryError(OutOfMemoryError e) { + super(e.getMessage()); + } +} diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java index f6b5ea3c0ad26..e8d3730daa7a4 100644 --- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java +++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java @@ -192,7 +192,7 @@ public long acquireExecutionMemory(long required, MemoryConsumer consumer) { throw new RuntimeException(e.getMessage()); } catch (IOException e) { logger.error("error while calling spill() on " + c, e); - throw new OutOfMemoryError("error while calling spill() on " + c + " : " + throw new SparkOutOfMemoryError("error while calling spill() on " + c + " : " + e.getMessage()); } } @@ -213,7 +213,7 @@ public long acquireExecutionMemory(long required, MemoryConsumer consumer) { throw new RuntimeException(e.getMessage()); } catch (IOException e) { logger.error("error while calling spill() on " + consumer, e); - throw new OutOfMemoryError("error while calling spill() on " + consumer + " : " + throw new SparkOutOfMemoryError("error while calling spill() on " + consumer + " : " + e.getMessage()); } } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java index e80f9734ecf7b..c3a07b2abf896 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/ShuffleExternalSorter.java @@ -33,6 +33,7 @@ import org.apache.spark.executor.ShuffleWriteMetrics; import org.apache.spark.internal.config.package$; import org.apache.spark.memory.MemoryConsumer; +import org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.memory.TooLargePageException; import org.apache.spark.serializer.DummySerializerInstance; @@ -337,7 +338,7 @@ private void growPointerArrayIfNecessary() throws IOException { // The pointer array is too big to fix in a single page, spill. 
spill(); return; - } catch (OutOfMemoryError e) { + } catch (SparkOutOfMemoryError e) { // should have trigger spilling if (!inMemSorter.hasSpaceForAnotherRecord()) { logger.error("Unable to grow the pointer array"); diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index 8b8e15e3f78ed..66118f454159b 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -25,6 +25,7 @@ import java.util.function.Supplier; import com.google.common.annotations.VisibleForTesting; +import org.apache.spark.memory.SparkOutOfMemoryError; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -349,7 +350,7 @@ private void growPointerArrayIfNecessary() throws IOException { // The pointer array is too big to fix in a single page, spill. spill(); return; - } catch (OutOfMemoryError e) { + } catch (SparkOutOfMemoryError e) { // should have trigger spilling if (!inMemSorter.hasSpaceForAnotherRecord()) { logger.error("Unable to grow the pointer array"); diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 3bb87a6ed653d..951d076420ee6 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -24,6 +24,7 @@ import org.apache.spark.TaskContext; import org.apache.spark.memory.MemoryConsumer; +import org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.UnsafeAlignedOffset; @@ -212,7 +213,7 @@ public boolean hasSpaceForAnotherRecord() { public void expandPointerArray(LongArray newArray) { if (newArray.size() < array.size()) { - throw new OutOfMemoryError("Not enough memory to grow pointer array"); + throw new SparkOutOfMemoryError("Not enough memory to grow pointer array"); } Platform.copyMemory( array.getBaseObject(), diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index af0a0ab656564..2c3a8ef74800b 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -35,7 +35,7 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging -import org.apache.spark.memory.TaskMemoryManager +import org.apache.spark.memory.{SparkOutOfMemoryError, TaskMemoryManager} import org.apache.spark.rpc.RpcTimeout import org.apache.spark.scheduler.{DirectTaskResult, IndirectTaskResult, Task, TaskDescription} import org.apache.spark.shuffle.FetchFailedException @@ -553,10 +553,9 @@ private[spark] class Executor( // Don't forcibly exit unless the exception was inherently fatal, to avoid // stopping other tasks unnecessarily. 
- if (Utils.isFatalError(t)) { + if (!t.isInstanceOf[SparkOutOfMemoryError] && Utils.isFatalError(t)) { uncaughtExceptionHandler.uncaughtException(Thread.currentThread(), t) } - } finally { runningTasks.remove(taskId) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala index 756eeb642e2d0..9dc334c1ead3c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.TaskContext import org.apache.spark.internal.Logging +import org.apache.spark.memory.SparkOutOfMemoryError import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ @@ -205,7 +206,7 @@ class TungstenAggregationIterator( buffer = hashMap.getAggregationBufferFromUnsafeRow(groupingKey) if (buffer == null) { // failed to allocate the first page - throw new OutOfMemoryError("No enough memory for aggregation") + throw new SparkOutOfMemoryError("No enough memory for aggregation") } } processRow(buffer, newInput) From 6e36d8d56279a2c5c92c8df8e89ee99b514817e7 Mon Sep 17 00:00:00 2001 From: Youngbin Kim Date: Tue, 19 Dec 2017 20:22:33 -0800 Subject: [PATCH 150/356] [SPARK-22829] Add new built-in function date_trunc() ## What changes were proposed in this pull request? Adding date_trunc() as a built-in function. `date_trunc` is common in other databases, but neither Spark nor Hive supports it. `date_trunc` is commonly used by data scientists and business intelligence applications such as Superset (https://github.com/apache/incubator-superset). We do have `trunc`, but it only works at the 'MONTH' and 'YEAR' levels on DateType input. date_trunc() in other databases: AWS Redshift: http://docs.aws.amazon.com/redshift/latest/dg/r_DATE_TRUNC.html PostgreSQL: https://www.postgresql.org/docs/9.1/static/functions-datetime.html Presto: https://prestodb.io/docs/current/functions/datetime.html ## How was this patch tested? Unit tests Author: Youngbin Kim Closes #20015 from youngbink/date_trunc. --- python/pyspark/sql/functions.py | 20 ++- .../catalyst/analysis/FunctionRegistry.scala | 1 + .../expressions/datetimeExpressions.scala | 170 ++++++++++++++---- .../sql/catalyst/util/DateTimeUtils.scala | 102 +++++++++-- .../expressions/DateExpressionsSuite.scala | 73 +++++++- .../catalyst/util/DateTimeUtilsSuite.scala | 70 ++++++++ .../org/apache/spark/sql/functions.scala | 15 ++ .../apache/spark/sql/DateFunctionsSuite.scala | 46 +++++ 8 files changed, 445 insertions(+), 52 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 4e0faddb1c0df..54530055dfa85 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1099,7 +1099,7 @@ def trunc(date, format): """ Returns date truncated to the unit specified by the format.
- :param format: 'year', 'YYYY', 'yy' or 'month', 'mon', 'mm' + :param format: 'year', 'yyyy', 'yy' or 'month', 'mon', 'mm' >>> df = spark.createDataFrame([('1997-02-28',)], ['d']) >>> df.select(trunc(df.d, 'year').alias('year')).collect() @@ -1111,6 +1111,24 @@ def trunc(date, format): return Column(sc._jvm.functions.trunc(_to_java_column(date), format)) +@since(2.3) +def date_trunc(format, timestamp): + """ + Returns timestamp truncated to the unit specified by the format. + + :param format: 'year', 'yyyy', 'yy', 'month', 'mon', 'mm', + 'day', 'dd', 'hour', 'minute', 'second', 'week', 'quarter' + + >>> df = spark.createDataFrame([('1997-02-28 05:02:11',)], ['t']) + >>> df.select(date_trunc('year', df.t).alias('year')).collect() + [Row(year=datetime.datetime(1997, 1, 1, 0, 0))] + >>> df.select(date_trunc('mon', df.t).alias('month')).collect() + [Row(month=datetime.datetime(1997, 2, 1, 0, 0))] + """ + sc = SparkContext._active_spark_context + return Column(sc._jvm.functions.date_trunc(format, _to_java_column(timestamp))) + + @since(1.5) def next_day(date, dayOfWeek): """ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 11538bd31b4fd..5ddb39822617d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -392,6 +392,7 @@ object FunctionRegistry { expression[ToUnixTimestamp]("to_unix_timestamp"), expression[ToUTCTimestamp]("to_utc_timestamp"), expression[TruncDate]("trunc"), + expression[TruncTimestamp]("date_trunc"), expression[UnixTimestamp]("unix_timestamp"), expression[DayOfWeek]("dayofweek"), expression[WeekOfYear]("weekofyear"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index cfec7f82951a7..59c3e3d9947a3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -1294,80 +1294,79 @@ case class ParseToTimestamp(left: Expression, format: Option[Expression], child: override def dataType: DataType = TimestampType } -/** - * Returns date truncated to the unit specified by the format. 
- */ -// scalastyle:off line.size.limit -@ExpressionDescription( - usage = "_FUNC_(date, fmt) - Returns `date` with the time portion of the day truncated to the unit specified by the format model `fmt`.", - examples = """ - Examples: - > SELECT _FUNC_('2009-02-12', 'MM'); - 2009-02-01 - > SELECT _FUNC_('2015-10-27', 'YEAR'); - 2015-01-01 - """, - since = "1.5.0") -// scalastyle:on line.size.limit -case class TruncDate(date: Expression, format: Expression) - extends BinaryExpression with ImplicitCastInputTypes { - override def left: Expression = date - override def right: Expression = format - - override def inputTypes: Seq[AbstractDataType] = Seq(DateType, StringType) - override def dataType: DataType = DateType +trait TruncInstant extends BinaryExpression with ImplicitCastInputTypes { + val instant: Expression + val format: Expression override def nullable: Boolean = true - override def prettyName: String = "trunc" private lazy val truncLevel: Int = DateTimeUtils.parseTruncLevel(format.eval().asInstanceOf[UTF8String]) - override def eval(input: InternalRow): Any = { + /** + * @param input internalRow (time) + * @param maxLevel Maximum level that can be used for truncation (e.g MONTH for Date input) + * @param truncFunc function: (time, level) => time + */ + protected def evalHelper(input: InternalRow, maxLevel: Int)( + truncFunc: (Any, Int) => Any): Any = { val level = if (format.foldable) { truncLevel } else { DateTimeUtils.parseTruncLevel(format.eval().asInstanceOf[UTF8String]) } - if (level == -1) { - // unknown format + if (level == DateTimeUtils.TRUNC_INVALID || level > maxLevel) { + // unknown format or too large level null } else { - val d = date.eval(input) - if (d == null) { + val t = instant.eval(input) + if (t == null) { null } else { - DateTimeUtils.truncDate(d.asInstanceOf[Int], level) + truncFunc(t, level) } } } - override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + protected def codeGenHelper( + ctx: CodegenContext, + ev: ExprCode, + maxLevel: Int, + orderReversed: Boolean = false)( + truncFunc: (String, String) => String) + : ExprCode = { val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") if (format.foldable) { - if (truncLevel == -1) { + if (truncLevel == DateTimeUtils.TRUNC_INVALID || truncLevel > maxLevel) { ev.copy(code = s""" boolean ${ev.isNull} = true; ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)};""") } else { - val d = date.genCode(ctx) + val t = instant.genCode(ctx) + val truncFuncStr = truncFunc(t.value, truncLevel.toString) ev.copy(code = s""" - ${d.code} - boolean ${ev.isNull} = ${d.isNull}; + ${t.code} + boolean ${ev.isNull} = ${t.isNull}; ${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; if (!${ev.isNull}) { - ${ev.value} = $dtu.truncDate(${d.value}, $truncLevel); + ${ev.value} = $dtu.$truncFuncStr; }""") } } else { - nullSafeCodeGen(ctx, ev, (dateVal, fmt) => { + nullSafeCodeGen(ctx, ev, (left, right) => { val form = ctx.freshName("form") + val (dateVal, fmt) = if (orderReversed) { + (right, left) + } else { + (left, right) + } + val truncFuncStr = truncFunc(dateVal, form) s""" int $form = $dtu.parseTruncLevel($fmt); - if ($form == -1) { + if ($form == -1 || $form > $maxLevel) { ${ev.isNull} = true; } else { - ${ev.value} = $dtu.truncDate($dateVal, $form); + ${ev.value} = $dtu.$truncFuncStr } """ }) @@ -1375,6 +1374,101 @@ case class TruncDate(date: Expression, format: Expression) } } +/** + * Returns date truncated to the unit specified by the format. 
+ */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(date, fmt) - Returns `date` with the time portion of the day truncated to the unit specified by the format model `fmt`. + `fmt` should be one of ["year", "yyyy", "yy", "mon", "month", "mm"] + """, + examples = """ + Examples: + > SELECT _FUNC_('2009-02-12', 'MM'); + 2009-02-01 + > SELECT _FUNC_('2015-10-27', 'YEAR'); + 2015-01-01 + """, + since = "1.5.0") +// scalastyle:on line.size.limit +case class TruncDate(date: Expression, format: Expression) + extends TruncInstant { + override def left: Expression = date + override def right: Expression = format + + override def inputTypes: Seq[AbstractDataType] = Seq(DateType, StringType) + override def dataType: DataType = DateType + override def prettyName: String = "trunc" + override val instant = date + + override def eval(input: InternalRow): Any = { + evalHelper(input, maxLevel = DateTimeUtils.TRUNC_TO_MONTH) { (d: Any, level: Int) => + DateTimeUtils.truncDate(d.asInstanceOf[Int], level) + } + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + codeGenHelper(ctx, ev, maxLevel = DateTimeUtils.TRUNC_TO_MONTH) { (date: String, fmt: String) => + s"truncDate($date, $fmt);" + } + } +} + +/** + * Returns timestamp truncated to the unit specified by the format. + */ +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(fmt, ts) - Returns timestamp `ts` truncated to the unit specified by the format model `fmt`. + `fmt` should be one of ["YEAR", "YYYY", "YY", "MON", "MONTH", "MM", "DAY", "DD", "HOUR", "MINUTE", "SECOND", "WEEK", "QUARTER"] + """, + examples = """ + Examples: + > SELECT _FUNC_('2015-03-05T09:32:05.359', 'YEAR'); + 2015-01-01T00:00:00 + > SELECT _FUNC_('2015-03-05T09:32:05.359', 'MM'); + 2015-03-01T00:00:00 + > SELECT _FUNC_('2015-03-05T09:32:05.359', 'DD'); + 2015-03-05T00:00:00 + > SELECT _FUNC_('2015-03-05T09:32:05.359', 'HOUR'); + 2015-03-05T09:00:00 + """, + since = "2.3.0") +// scalastyle:on line.size.limit +case class TruncTimestamp( + format: Expression, + timestamp: Expression, + timeZoneId: Option[String] = None) + extends TruncInstant with TimeZoneAwareExpression { + override def left: Expression = format + override def right: Expression = timestamp + + override def inputTypes: Seq[AbstractDataType] = Seq(StringType, TimestampType) + override def dataType: TimestampType = TimestampType + override def prettyName: String = "date_trunc" + override val instant = timestamp + override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression = + copy(timeZoneId = Option(timeZoneId)) + + def this(format: Expression, timestamp: Expression) = this(format, timestamp, None) + + override def eval(input: InternalRow): Any = { + evalHelper(input, maxLevel = DateTimeUtils.TRUNC_TO_SECOND) { (t: Any, level: Int) => + DateTimeUtils.truncTimestamp(t.asInstanceOf[Long], level, timeZone) + } + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val tz = ctx.addReferenceObj("timeZone", timeZone) + codeGenHelper(ctx, ev, maxLevel = DateTimeUtils.TRUNC_TO_SECOND, true) { + (date: String, fmt: String) => + s"truncTimestamp($date, $fmt, $tz);" + } + } +} + /** * Returns the number of days from startDate to endDate. 
*/ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index b1ed25645b36c..fa69b8af62c85 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -45,7 +45,8 @@ object DateTimeUtils { // it's 2440587.5, rounding up to compatible with Hive final val JULIAN_DAY_OF_EPOCH = 2440588 final val SECONDS_PER_DAY = 60 * 60 * 24L - final val MICROS_PER_SECOND = 1000L * 1000L + final val MICROS_PER_MILLIS = 1000L + final val MICROS_PER_SECOND = MICROS_PER_MILLIS * MILLIS_PER_SECOND final val MILLIS_PER_SECOND = 1000L final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L final val MICROS_PER_DAY = MICROS_PER_SECOND * SECONDS_PER_DAY @@ -909,6 +910,15 @@ object DateTimeUtils { math.round(diff * 1e8) / 1e8 } + // Thursday = 0 since 1970/Jan/01 => Thursday + private val SUNDAY = 3 + private val MONDAY = 4 + private val TUESDAY = 5 + private val WEDNESDAY = 6 + private val THURSDAY = 0 + private val FRIDAY = 1 + private val SATURDAY = 2 + /* * Returns day of week from String. Starting from Thursday, marked as 0. * (Because 1970-01-01 is Thursday). @@ -916,13 +926,13 @@ object DateTimeUtils { def getDayOfWeekFromString(string: UTF8String): Int = { val dowString = string.toString.toUpperCase(Locale.ROOT) dowString match { - case "SU" | "SUN" | "SUNDAY" => 3 - case "MO" | "MON" | "MONDAY" => 4 - case "TU" | "TUE" | "TUESDAY" => 5 - case "WE" | "WED" | "WEDNESDAY" => 6 - case "TH" | "THU" | "THURSDAY" => 0 - case "FR" | "FRI" | "FRIDAY" => 1 - case "SA" | "SAT" | "SATURDAY" => 2 + case "SU" | "SUN" | "SUNDAY" => SUNDAY + case "MO" | "MON" | "MONDAY" => MONDAY + case "TU" | "TUE" | "TUESDAY" => TUESDAY + case "WE" | "WED" | "WEDNESDAY" => WEDNESDAY + case "TH" | "THU" | "THURSDAY" => THURSDAY + case "FR" | "FRI" | "FRIDAY" => FRIDAY + case "SA" | "SAT" | "SATURDAY" => SATURDAY case _ => -1 } } @@ -944,9 +954,16 @@ object DateTimeUtils { date + daysToMonthEnd } - private val TRUNC_TO_YEAR = 1 - private val TRUNC_TO_MONTH = 2 - private val TRUNC_INVALID = -1 + // Visible for testing. + private[sql] val TRUNC_TO_YEAR = 1 + private[sql] val TRUNC_TO_MONTH = 2 + private[sql] val TRUNC_TO_QUARTER = 3 + private[sql] val TRUNC_TO_WEEK = 4 + private[sql] val TRUNC_TO_DAY = 5 + private[sql] val TRUNC_TO_HOUR = 6 + private[sql] val TRUNC_TO_MINUTE = 7 + private[sql] val TRUNC_TO_SECOND = 8 + private[sql] val TRUNC_INVALID = -1 /** * Returns the trunc date from original date and trunc level. @@ -964,7 +981,62 @@ object DateTimeUtils { } /** - * Returns the truncate level, could be TRUNC_YEAR, TRUNC_MONTH, or TRUNC_INVALID, + * Returns the trunc date time from original date time and trunc level. 
+ * Trunc level should be generated using `parseTruncLevel()`, should be between 1 and 8 + */ + def truncTimestamp(t: SQLTimestamp, level: Int, timeZone: TimeZone): SQLTimestamp = { + var millis = t / MICROS_PER_MILLIS + val truncated = level match { + case TRUNC_TO_YEAR => + val dDays = millisToDays(millis, timeZone) + daysToMillis(truncDate(dDays, level), timeZone) + case TRUNC_TO_MONTH => + val dDays = millisToDays(millis, timeZone) + daysToMillis(truncDate(dDays, level), timeZone) + case TRUNC_TO_DAY => + val offset = timeZone.getOffset(millis) + millis += offset + millis - millis % (MILLIS_PER_SECOND * SECONDS_PER_DAY) - offset + case TRUNC_TO_HOUR => + val offset = timeZone.getOffset(millis) + millis += offset + millis - millis % (60 * 60 * MILLIS_PER_SECOND) - offset + case TRUNC_TO_MINUTE => + millis - millis % (60 * MILLIS_PER_SECOND) + case TRUNC_TO_SECOND => + millis - millis % MILLIS_PER_SECOND + case TRUNC_TO_WEEK => + val dDays = millisToDays(millis, timeZone) + val prevMonday = getNextDateForDayOfWeek(dDays - 7, MONDAY) + daysToMillis(prevMonday, timeZone) + case TRUNC_TO_QUARTER => + val dDays = millisToDays(millis, timeZone) + millis = daysToMillis(truncDate(dDays, TRUNC_TO_MONTH), timeZone) + val cal = Calendar.getInstance() + cal.setTimeInMillis(millis) + val quarter = getQuarter(dDays) + val month = quarter match { + case 1 => Calendar.JANUARY + case 2 => Calendar.APRIL + case 3 => Calendar.JULY + case 4 => Calendar.OCTOBER + } + cal.set(Calendar.MONTH, month) + cal.getTimeInMillis() + case _ => + // caller make sure that this should never be reached + sys.error(s"Invalid trunc level: $level") + } + truncated * MICROS_PER_MILLIS + } + + def truncTimestamp(d: SQLTimestamp, level: Int): SQLTimestamp = { + truncTimestamp(d, level, defaultTimeZone()) + } + + /** + * Returns the truncate level, could be TRUNC_YEAR, TRUNC_MONTH, TRUNC_TO_DAY, TRUNC_TO_HOUR, + * TRUNC_TO_MINUTE, TRUNC_TO_SECOND, TRUNC_TO_WEEK, TRUNC_TO_QUARTER or TRUNC_INVALID, * TRUNC_INVALID means unsupported truncate level. 
*/ def parseTruncLevel(format: UTF8String): Int = { @@ -974,6 +1046,12 @@ object DateTimeUtils { format.toString.toUpperCase(Locale.ROOT) match { case "YEAR" | "YYYY" | "YY" => TRUNC_TO_YEAR case "MON" | "MONTH" | "MM" => TRUNC_TO_MONTH + case "DAY" | "DD" => TRUNC_TO_DAY + case "HOUR" => TRUNC_TO_HOUR + case "MINUTE" => TRUNC_TO_MINUTE + case "SECOND" => TRUNC_TO_SECOND + case "WEEK" => TRUNC_TO_WEEK + case "QUARTER" => TRUNC_TO_QUARTER case _ => TRUNC_INVALID } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 89d99f9678cda..63f6ceeb21b96 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -527,7 +527,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { NextDay(Literal(Date.valueOf("2015-07-23")), Literal.create(null, StringType)), null) } - test("function trunc") { + test("TruncDate") { def testTrunc(input: Date, fmt: String, expected: Date): Unit = { checkEvaluation(TruncDate(Literal.create(input, DateType), Literal.create(fmt, StringType)), expected) @@ -543,11 +543,82 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { testTrunc(date, fmt, Date.valueOf("2015-07-01")) } testTrunc(date, "DD", null) + testTrunc(date, "SECOND", null) + testTrunc(date, "HOUR", null) testTrunc(date, null, null) testTrunc(null, "MON", null) testTrunc(null, null, null) } + test("TruncTimestamp") { + def testTrunc(input: Timestamp, fmt: String, expected: Timestamp): Unit = { + checkEvaluation( + TruncTimestamp(Literal.create(fmt, StringType), Literal.create(input, TimestampType)), + expected) + checkEvaluation( + TruncTimestamp( + NonFoldableLiteral.create(fmt, StringType), Literal.create(input, TimestampType)), + expected) + } + + withDefaultTimeZone(TimeZoneGMT) { + val inputDate = Timestamp.valueOf("2015-07-22 05:30:06") + + Seq("yyyy", "YYYY", "year", "YEAR", "yy", "YY").foreach { fmt => + testTrunc( + inputDate, fmt, + Timestamp.valueOf("2015-01-01 00:00:00")) + } + + Seq("month", "MONTH", "mon", "MON", "mm", "MM").foreach { fmt => + testTrunc( + inputDate, fmt, + Timestamp.valueOf("2015-07-01 00:00:00")) + } + + Seq("DAY", "day", "DD", "dd").foreach { fmt => + testTrunc( + inputDate, fmt, + Timestamp.valueOf("2015-07-22 00:00:00")) + } + + Seq("HOUR", "hour").foreach { fmt => + testTrunc( + inputDate, fmt, + Timestamp.valueOf("2015-07-22 05:00:00")) + } + + Seq("MINUTE", "minute").foreach { fmt => + testTrunc( + inputDate, fmt, + Timestamp.valueOf("2015-07-22 05:30:00")) + } + + Seq("SECOND", "second").foreach { fmt => + testTrunc( + inputDate, fmt, + Timestamp.valueOf("2015-07-22 05:30:06")) + } + + Seq("WEEK", "week").foreach { fmt => + testTrunc( + inputDate, fmt, + Timestamp.valueOf("2015-07-20 00:00:00")) + } + + Seq("QUARTER", "quarter").foreach { fmt => + testTrunc( + inputDate, fmt, + Timestamp.valueOf("2015-07-01 00:00:00")) + } + + testTrunc(inputDate, "INVALID", null) + testTrunc(inputDate, null, null) + testTrunc(null, "MON", null) + testTrunc(null, null, null) + } + } + test("from_unixtime") { val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index c8cf16d937352..625ff38943fa3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -563,6 +563,76 @@ class DateTimeUtilsSuite extends SparkFunSuite { } } + test("truncTimestamp") { + def testTrunc( + level: Int, + expected: String, + inputTS: SQLTimestamp, + timezone: TimeZone = DateTimeUtils.defaultTimeZone()): Unit = { + val truncated = + DateTimeUtils.truncTimestamp(inputTS, level, timezone) + val expectedTS = + DateTimeUtils.stringToTimestamp(UTF8String.fromString(expected)) + assert(truncated === expectedTS.get) + } + + val defaultInputTS = + DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-05T09:32:05.359")) + val defaultInputTS1 = + DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-31T20:32:05.359")) + val defaultInputTS2 = + DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-04-01T02:32:05.359")) + val defaultInputTS3 = + DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-30T02:32:05.359")) + val defaultInputTS4 = + DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-29T02:32:05.359")) + + testTrunc(DateTimeUtils.TRUNC_TO_YEAR, "2015-01-01T00:00:00", defaultInputTS.get) + testTrunc(DateTimeUtils.TRUNC_TO_MONTH, "2015-03-01T00:00:00", defaultInputTS.get) + testTrunc(DateTimeUtils.TRUNC_TO_DAY, "2015-03-05T00:00:00", defaultInputTS.get) + testTrunc(DateTimeUtils.TRUNC_TO_HOUR, "2015-03-05T09:00:00", defaultInputTS.get) + testTrunc(DateTimeUtils.TRUNC_TO_MINUTE, "2015-03-05T09:32:00", defaultInputTS.get) + testTrunc(DateTimeUtils.TRUNC_TO_SECOND, "2015-03-05T09:32:05", defaultInputTS.get) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-02T00:00:00", defaultInputTS.get) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", defaultInputTS1.get) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", defaultInputTS2.get) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", defaultInputTS3.get) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-23T00:00:00", defaultInputTS4.get) + testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-01-01T00:00:00", defaultInputTS.get) + testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-01-01T00:00:00", defaultInputTS1.get) + testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-04-01T00:00:00", defaultInputTS2.get) + + for (tz <- DateTimeTestUtils.ALL_TIMEZONES) { + DateTimeTestUtils.withDefaultTimeZone(tz) { + val inputTS = + DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-05T09:32:05.359")) + val inputTS1 = + DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-31T20:32:05.359")) + val inputTS2 = + DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-04-01T02:32:05.359")) + val inputTS3 = + DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-30T02:32:05.359")) + val inputTS4 = + DateTimeUtils.stringToTimestamp(UTF8String.fromString("2015-03-29T02:32:05.359")) + + testTrunc(DateTimeUtils.TRUNC_TO_YEAR, "2015-01-01T00:00:00", inputTS.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_MONTH, "2015-03-01T00:00:00", inputTS.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_DAY, "2015-03-05T00:00:00", inputTS.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_HOUR, "2015-03-05T09:00:00", inputTS.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_MINUTE, "2015-03-05T09:32:00", inputTS.get, tz) + 
testTrunc(DateTimeUtils.TRUNC_TO_SECOND, "2015-03-05T09:32:05", inputTS.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-02T00:00:00", inputTS.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", inputTS1.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", inputTS2.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-30T00:00:00", inputTS3.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_WEEK, "2015-03-23T00:00:00", inputTS4.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-01-01T00:00:00", inputTS.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-01-01T00:00:00", inputTS1.get, tz) + testTrunc(DateTimeUtils.TRUNC_TO_QUARTER, "2015-04-01T00:00:00", inputTS2.get, tz) + } + } + } + test("daysToMillis and millisToDays") { val c = Calendar.getInstance(TimeZonePST) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 3e4659b9eae60..052a3f533da5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2797,6 +2797,21 @@ object functions { TruncDate(date.expr, Literal(format)) } + /** + * Returns timestamp truncated to the unit specified by the format. + * + * @param format: 'year', 'yyyy', 'yy' for truncate by year, + * 'month', 'mon', 'mm' for truncate by month, + * 'day', 'dd' for truncate by day, + * Other options are: 'second', 'minute', 'hour', 'week', 'month', 'quarter' + * + * @group datetime_funcs + * @since 2.3.0 + */ + def date_trunc(format: String, timestamp: Column): Column = withExpr { + TruncTimestamp(Literal(format), timestamp.expr) + } + /** * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders * that time as a timestamp in the given time zone. 
For example, 'GMT+1' would yield diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala index 3a8694839bb24..6bbf38516cdf6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala @@ -435,6 +435,52 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext { Seq(Row(Date.valueOf("2015-07-01")), Row(Date.valueOf("2014-12-01")))) } + test("function date_trunc") { + val df = Seq( + (1, Timestamp.valueOf("2015-07-22 10:01:40.523")), + (2, Timestamp.valueOf("2014-12-31 05:29:06.876"))).toDF("i", "t") + + checkAnswer( + df.select(date_trunc("YY", col("t"))), + Seq(Row(Timestamp.valueOf("2015-01-01 00:00:00")), + Row(Timestamp.valueOf("2014-01-01 00:00:00")))) + + checkAnswer( + df.selectExpr("date_trunc('MONTH', t)"), + Seq(Row(Timestamp.valueOf("2015-07-01 00:00:00")), + Row(Timestamp.valueOf("2014-12-01 00:00:00")))) + + checkAnswer( + df.selectExpr("date_trunc('DAY', t)"), + Seq(Row(Timestamp.valueOf("2015-07-22 00:00:00")), + Row(Timestamp.valueOf("2014-12-31 00:00:00")))) + + checkAnswer( + df.selectExpr("date_trunc('HOUR', t)"), + Seq(Row(Timestamp.valueOf("2015-07-22 10:00:00")), + Row(Timestamp.valueOf("2014-12-31 05:00:00")))) + + checkAnswer( + df.selectExpr("date_trunc('MINUTE', t)"), + Seq(Row(Timestamp.valueOf("2015-07-22 10:01:00")), + Row(Timestamp.valueOf("2014-12-31 05:29:00")))) + + checkAnswer( + df.selectExpr("date_trunc('SECOND', t)"), + Seq(Row(Timestamp.valueOf("2015-07-22 10:01:40")), + Row(Timestamp.valueOf("2014-12-31 05:29:06")))) + + checkAnswer( + df.selectExpr("date_trunc('WEEK', t)"), + Seq(Row(Timestamp.valueOf("2015-07-20 00:00:00")), + Row(Timestamp.valueOf("2014-12-29 00:00:00")))) + + checkAnswer( + df.selectExpr("date_trunc('QUARTER', t)"), + Seq(Row(Timestamp.valueOf("2015-07-01 00:00:00")), + Row(Timestamp.valueOf("2014-10-01 00:00:00")))) + } + test("from_unixtime") { val sdf1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.US) val fmt2 = "yyyy-MM-dd HH:mm:ss.SSS" From 13268a58f8f67bf994f0ad5076419774c45daeeb Mon Sep 17 00:00:00 2001 From: Fernando Pereira Date: Tue, 19 Dec 2017 20:47:12 -0800 Subject: [PATCH 151/356] [SPARK-22649][PYTHON][SQL] Adding localCheckpoint to Dataset API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? This change adds local checkpoint support to datasets and respective bind from Python Dataframe API. If reliability requirements can be lowered to favor performance, as in cases of further quick transformations followed by a reliable save, localCheckpoints() fit very well. Furthermore, at the moment Reliable checkpoints still incur double computation (see #9428) In general it makes the API more complete as well. ## How was this patch tested? 
Python land quick use case: ```python >>> from time import sleep >>> from pyspark.sql import types as T >>> from pyspark.sql import functions as F >>> def f(x): sleep(1) return x*2 ...: >>> df1 = spark.range(30, numPartitions=6) >>> df2 = df1.select(F.udf(f, T.LongType())("id")) >>> %time _ = df2.collect() CPU times: user 7.79 ms, sys: 5.84 ms, total: 13.6 ms Wall time: 12.2 s >>> %time df3 = df2.localCheckpoint() CPU times: user 2.38 ms, sys: 2.3 ms, total: 4.68 ms Wall time: 10.3 s >>> %time _ = df3.collect() CPU times: user 5.09 ms, sys: 410 µs, total: 5.5 ms Wall time: 148 ms >>> sc.setCheckpointDir(".") >>> %time df3 = df2.checkpoint() CPU times: user 4.04 ms, sys: 1.63 ms, total: 5.67 ms Wall time: 20.3 s ``` Author: Fernando Pereira Closes #19805 from ferdonline/feature_dataset_localCheckpoint. --- python/pyspark/sql/dataframe.py | 14 +++ .../scala/org/apache/spark/sql/Dataset.scala | 49 +++++++- .../org/apache/spark/sql/DatasetSuite.scala | 107 ++++++++++-------- 3 files changed, 121 insertions(+), 49 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 9864dc98c1f33..75395a754a831 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -368,6 +368,20 @@ def checkpoint(self, eager=True): jdf = self._jdf.checkpoint(eager) return DataFrame(jdf, self.sql_ctx) + @since(2.3) + def localCheckpoint(self, eager=True): + """Returns a locally checkpointed version of this Dataset. Checkpointing can be used to + truncate the logical plan of this DataFrame, which is especially useful in iterative + algorithms where the plan may grow exponentially. Local checkpoints are stored in the + executors using the caching subsystem and therefore they are not reliable. + + :param eager: Whether to checkpoint this DataFrame immediately + + .. note:: Experimental + """ + jdf = self._jdf.localCheckpoint(eager) + return DataFrame(jdf, self.sql_ctx) + @since(2.1) def withWatermark(self, eventTime, delayThreshold): """Defines an event time watermark for this :class:`DataFrame`. A watermark tracks a point diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index c34cf0a7a7718..ef00562672a7e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -527,7 +527,7 @@ class Dataset[T] private[sql]( */ @Experimental @InterfaceStability.Evolving - def checkpoint(): Dataset[T] = checkpoint(eager = true) + def checkpoint(): Dataset[T] = checkpoint(eager = true, reliableCheckpoint = true) /** * Returns a checkpointed version of this Dataset. Checkpointing can be used to truncate the @@ -540,9 +540,52 @@ class Dataset[T] private[sql]( */ @Experimental @InterfaceStability.Evolving - def checkpoint(eager: Boolean): Dataset[T] = { + def checkpoint(eager: Boolean): Dataset[T] = checkpoint(eager = eager, reliableCheckpoint = true) + + /** + * Eagerly locally checkpoints a Dataset and return the new Dataset. Checkpointing can be + * used to truncate the logical plan of this Dataset, which is especially useful in iterative + * algorithms where the plan may grow exponentially. Local checkpoints are written to executor + * storage and despite potentially faster they are unreliable and may compromise job completion. 
+ * + * @group basic + * @since 2.3.0 + */ + @Experimental + @InterfaceStability.Evolving + def localCheckpoint(): Dataset[T] = checkpoint(eager = true, reliableCheckpoint = false) + + /** + * Locally checkpoints a Dataset and return the new Dataset. Checkpointing can be used to truncate + * the logical plan of this Dataset, which is especially useful in iterative algorithms where the + * plan may grow exponentially. Local checkpoints are written to executor storage and despite + * potentially faster they are unreliable and may compromise job completion. + * + * @group basic + * @since 2.3.0 + */ + @Experimental + @InterfaceStability.Evolving + def localCheckpoint(eager: Boolean): Dataset[T] = checkpoint( + eager = eager, + reliableCheckpoint = false + ) + + /** + * Returns a checkpointed version of this Dataset. + * + * @param eager Whether to checkpoint this dataframe immediately + * @param reliableCheckpoint Whether to create a reliable checkpoint saved to files inside the + * checkpoint directory. If false creates a local checkpoint using + * the caching subsystem + */ + private def checkpoint(eager: Boolean, reliableCheckpoint: Boolean): Dataset[T] = { val internalRdd = queryExecution.toRdd.map(_.copy()) - internalRdd.checkpoint() + if (reliableCheckpoint) { + internalRdd.checkpoint() + } else { + internalRdd.localCheckpoint() + } if (eager) { internalRdd.count() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index b02db7721aa7f..bd1e7adefc7a9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1156,67 +1156,82 @@ class DatasetSuite extends QueryTest with SharedSQLContext { } Seq(true, false).foreach { eager => - def testCheckpointing(testName: String)(f: => Unit): Unit = { - test(s"Dataset.checkpoint() - $testName (eager = $eager)") { - withTempDir { dir => - val originalCheckpointDir = spark.sparkContext.checkpointDir - - try { - spark.sparkContext.setCheckpointDir(dir.getCanonicalPath) + Seq(true, false).foreach { reliable => + def testCheckpointing(testName: String)(f: => Unit): Unit = { + test(s"Dataset.checkpoint() - $testName (eager = $eager, reliable = $reliable)") { + if (reliable) { + withTempDir { dir => + val originalCheckpointDir = spark.sparkContext.checkpointDir + + try { + spark.sparkContext.setCheckpointDir(dir.getCanonicalPath) + f + } finally { + // Since the original checkpointDir can be None, we need + // to set the variable directly. + spark.sparkContext.checkpointDir = originalCheckpointDir + } + } + } else { + // Local checkpoints dont require checkpoint_dir f - } finally { - // Since the original checkpointDir can be None, we need - // to set the variable directly. 
- spark.sparkContext.checkpointDir = originalCheckpointDir } } } - } - testCheckpointing("basic") { - val ds = spark.range(10).repartition('id % 2).filter('id > 5).orderBy('id.desc) - val cp = ds.checkpoint(eager) + testCheckpointing("basic") { + val ds = spark.range(10).repartition('id % 2).filter('id > 5).orderBy('id.desc) + val cp = if (reliable) ds.checkpoint(eager) else ds.localCheckpoint(eager) - val logicalRDD = cp.logicalPlan match { - case plan: LogicalRDD => plan - case _ => - val treeString = cp.logicalPlan.treeString(verbose = true) - fail(s"Expecting a LogicalRDD, but got\n$treeString") - } + val logicalRDD = cp.logicalPlan match { + case plan: LogicalRDD => plan + case _ => + val treeString = cp.logicalPlan.treeString(verbose = true) + fail(s"Expecting a LogicalRDD, but got\n$treeString") + } - val dsPhysicalPlan = ds.queryExecution.executedPlan - val cpPhysicalPlan = cp.queryExecution.executedPlan + val dsPhysicalPlan = ds.queryExecution.executedPlan + val cpPhysicalPlan = cp.queryExecution.executedPlan - assertResult(dsPhysicalPlan.outputPartitioning) { logicalRDD.outputPartitioning } - assertResult(dsPhysicalPlan.outputOrdering) { logicalRDD.outputOrdering } + assertResult(dsPhysicalPlan.outputPartitioning) { + logicalRDD.outputPartitioning + } + assertResult(dsPhysicalPlan.outputOrdering) { + logicalRDD.outputOrdering + } - assertResult(dsPhysicalPlan.outputPartitioning) { cpPhysicalPlan.outputPartitioning } - assertResult(dsPhysicalPlan.outputOrdering) { cpPhysicalPlan.outputOrdering } + assertResult(dsPhysicalPlan.outputPartitioning) { + cpPhysicalPlan.outputPartitioning + } + assertResult(dsPhysicalPlan.outputOrdering) { + cpPhysicalPlan.outputOrdering + } - // For a lazy checkpoint() call, the first check also materializes the checkpoint. - checkDataset(cp, (9L to 6L by -1L).map(java.lang.Long.valueOf): _*) + // For a lazy checkpoint() call, the first check also materializes the checkpoint. + checkDataset(cp, (9L to 6L by -1L).map(java.lang.Long.valueOf): _*) - // Reads back from checkpointed data and check again. - checkDataset(cp, (9L to 6L by -1L).map(java.lang.Long.valueOf): _*) - } + // Reads back from checkpointed data and check again. 
+ checkDataset(cp, (9L to 6L by -1L).map(java.lang.Long.valueOf): _*) + } - testCheckpointing("should preserve partitioning information") { - val ds = spark.range(10).repartition('id % 2) - val cp = ds.checkpoint(eager) + testCheckpointing("should preserve partitioning information") { + val ds = spark.range(10).repartition('id % 2) + val cp = if (reliable) ds.checkpoint(eager) else ds.localCheckpoint(eager) - val agg = cp.groupBy('id % 2).agg(count('id)) + val agg = cp.groupBy('id % 2).agg(count('id)) - agg.queryExecution.executedPlan.collectFirst { - case ShuffleExchangeExec(_, _: RDDScanExec, _) => - case BroadcastExchangeExec(_, _: RDDScanExec) => - }.foreach { _ => - fail( - "No Exchange should be inserted above RDDScanExec since the checkpointed Dataset " + - "preserves partitioning information:\n\n" + agg.queryExecution - ) - } + agg.queryExecution.executedPlan.collectFirst { + case ShuffleExchangeExec(_, _: RDDScanExec, _) => + case BroadcastExchangeExec(_, _: RDDScanExec) => + }.foreach { _ => + fail( + "No Exchange should be inserted above RDDScanExec since the checkpointed Dataset " + + "preserves partitioning information:\n\n" + agg.queryExecution + ) + } - checkAnswer(agg, ds.groupBy('id % 2).agg(count('id))) + checkAnswer(agg, ds.groupBy('id % 2).agg(count('id))) + } } } From 9962390af77d8085ce2a475b5983766c9c307031 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 19 Dec 2017 23:50:06 -0800 Subject: [PATCH 152/356] [SPARK-22781][SS] Support creating streaming dataset with ORC files ## What changes were proposed in this pull request? Like `Parquet`, users can use `ORC` with Apache Spark structured streaming. This PR adds `orc()` to `DataStreamReader`(Scala/Python) in order to support creating streaming dataset with ORC file format more easily like the other file formats. Also, this adds a test coverage for ORC data source and updates the document. **BEFORE** ```scala scala> spark.readStream.schema("a int").orc("/tmp/orc_ss").writeStream.format("console").start() :24: error: value orc is not a member of org.apache.spark.sql.streaming.DataStreamReader spark.readStream.schema("a int").orc("/tmp/orc_ss").writeStream.format("console").start() ``` **AFTER** ```scala scala> spark.readStream.schema("a int").orc("/tmp/orc_ss").writeStream.format("console").start() res0: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper678b3746 scala> ------------------------------------------- Batch: 0 ------------------------------------------- +---+ | a| +---+ | 1| +---+ ``` ## How was this patch tested? Pass the newly added test cases. Author: Dongjoon Hyun Closes #19975 from dongjoon-hyun/SPARK-22781. --- .../structured-streaming-programming-guide.md | 2 +- python/pyspark/sql/streaming.py | 17 +++ .../sql/streaming/DataStreamReader.scala | 15 +++ .../sql/streaming/FileStreamSinkSuite.scala | 4 + .../sql/streaming/FileStreamSourceSuite.scala | 111 ++++++++++++++++++ 5 files changed, 148 insertions(+), 1 deletion(-) diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 93bef8d5bb7e2..31fcfabb9cacc 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -493,7 +493,7 @@ returned by `SparkSession.readStream()`. In [R](api/R/read.stream.html), with th #### Input Sources There are a few built-in sources. - - **File source** - Reads files written in a directory as a stream of data. 
Supported file formats are text, csv, json, parquet. See the docs of the DataStreamReader interface for a more up-to-date list, and supported options for each file format. Note that the files must be atomically placed in the given directory, which in most file systems, can be achieved by file move operations. + - **File source** - Reads files written in a directory as a stream of data. Supported file formats are text, csv, json, orc, parquet. See the docs of the DataStreamReader interface for a more up-to-date list, and supported options for each file format. Note that the files must be atomically placed in the given directory, which in most file systems, can be achieved by file move operations. - **Kafka source** - Reads data from Kafka. It's compatible with Kafka broker versions 0.10.0 or higher. See the [Kafka Integration Guide](structured-streaming-kafka-integration.html) for more details. diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 0cf702143c773..d0aba28788ac9 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -490,6 +490,23 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, else: raise TypeError("path can be only a single string") + @since(2.3) + def orc(self, path): + """Loads a ORC file stream, returning the result as a :class:`DataFrame`. + + .. note:: Evolving. + + >>> orc_sdf = spark.readStream.schema(sdf_schema).orc(tempfile.mkdtemp()) + >>> orc_sdf.isStreaming + True + >>> orc_sdf.schema == sdf_schema + True + """ + if isinstance(path, basestring): + return self._df(self._jreader.orc(path)) + else: + raise TypeError("path can be only a single string") + @since(2.0) def parquet(self, path): """Loads a Parquet file stream, returning the result as a :class:`DataFrame`. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index a42e28053a96a..41aa02c2b5e35 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -298,6 +298,21 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo */ def csv(path: String): DataFrame = format("csv").load(path) + /** + * Loads a ORC file stream, returning the result as a `DataFrame`. + * + * You can set the following ORC-specific option(s) for reading ORC files: + *
<ul> + * <li>`maxFilesPerTrigger` (default: no max limit): sets the maximum number of new files to be + * considered in every trigger.</li> + * </ul>
+ * + * @since 2.3.0 + */ + def orc(path: String): DataFrame = { + format("orc").load(path) + } + /** * Loads a Parquet file stream, returning the result as a `DataFrame`. * diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 08db06b94904b..2a2552211857a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala @@ -305,6 +305,10 @@ class FileStreamSinkSuite extends StreamTest { testFormat(Some("parquet")) } + test("orc") { + testFormat(Some("orc")) + } + test("text") { testFormat(Some("text")) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index c5b57bca18313..f4fa7fa7954d6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -87,6 +87,28 @@ abstract class FileStreamSourceTest } } + case class AddOrcFileData(data: DataFrame, src: File, tmp: File) extends AddFileData { + override def addData(source: FileStreamSource): Unit = { + AddOrcFileData.writeToFile(data, src, tmp) + } + } + + object AddOrcFileData { + def apply(seq: Seq[String], src: File, tmp: File): AddOrcFileData = { + AddOrcFileData(seq.toDS().toDF(), src, tmp) + } + + /** Write orc files in a temp dir, and move the individual files to the 'src' dir */ + def writeToFile(df: DataFrame, src: File, tmp: File): Unit = { + val tmpDir = Utils.tempFileWith(new File(tmp, "orc")) + df.write.orc(tmpDir.getCanonicalPath) + src.mkdirs() + tmpDir.listFiles().foreach { f => + f.renameTo(new File(src, s"${f.getName}")) + } + } + } + case class AddParquetFileData(data: DataFrame, src: File, tmp: File) extends AddFileData { override def addData(source: FileStreamSource): Unit = { AddParquetFileData.writeToFile(data, src, tmp) @@ -249,6 +271,42 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } + // =============== ORC file stream schema tests ================ + + test("FileStreamSource schema: orc, existing files, no schema") { + withTempDir { src => + Seq("a", "b", "c").toDS().as("userColumn").toDF().write + .mode(org.apache.spark.sql.SaveMode.Overwrite) + .orc(src.getCanonicalPath) + + // Without schema inference, should throw error + withSQLConf(SQLConf.STREAMING_SCHEMA_INFERENCE.key -> "false") { + intercept[IllegalArgumentException] { + createFileStreamSourceAndGetSchema( + format = Some("orc"), path = Some(src.getCanonicalPath), schema = None) + } + } + + // With schema inference, should infer correct schema + withSQLConf(SQLConf.STREAMING_SCHEMA_INFERENCE.key -> "true") { + val schema = createFileStreamSourceAndGetSchema( + format = Some("orc"), path = Some(src.getCanonicalPath), schema = None) + assert(schema === new StructType().add("value", StringType)) + } + } + } + + test("FileStreamSource schema: orc, existing files, schema") { + withTempPath { src => + Seq("a", "b", "c").toDS().as("oldUserColumn").toDF() + .write.orc(new File(src, "1").getCanonicalPath) + val userSchema = new StructType().add("userColumn", StringType) + val schema = createFileStreamSourceAndGetSchema( + format = Some("orc"), path = Some(src.getCanonicalPath), schema = Some(userSchema)) + assert(schema === userSchema) + } + } + // 
=============== Parquet file stream schema tests ================ test("FileStreamSource schema: parquet, existing files, no schema") { @@ -508,6 +566,59 @@ class FileStreamSourceSuite extends FileStreamSourceTest { } } + // =============== ORC file stream tests ================ + + test("read from orc files") { + withTempDirs { case (src, tmp) => + val fileStream = createFileStream("orc", src.getCanonicalPath, Some(valueSchema)) + val filtered = fileStream.filter($"value" contains "keep") + + testStream(filtered)( + AddOrcFileData(Seq("drop1", "keep2", "keep3"), src, tmp), + CheckAnswer("keep2", "keep3"), + StopStream, + AddOrcFileData(Seq("drop4", "keep5", "keep6"), src, tmp), + StartStream(), + CheckAnswer("keep2", "keep3", "keep5", "keep6"), + AddOrcFileData(Seq("drop7", "keep8", "keep9"), src, tmp), + CheckAnswer("keep2", "keep3", "keep5", "keep6", "keep8", "keep9") + ) + } + } + + test("read from orc files with changing schema") { + withTempDirs { case (src, tmp) => + withSQLConf(SQLConf.STREAMING_SCHEMA_INFERENCE.key -> "true") { + + // Add a file so that we can infer its schema + AddOrcFileData.writeToFile(Seq("value0").toDF("k"), src, tmp) + + val fileStream = createFileStream("orc", src.getCanonicalPath) + + // FileStreamSource should infer the column "k" + assert(fileStream.schema === StructType(Seq(StructField("k", StringType)))) + + // After creating DF and before starting stream, add data with different schema + // Should not affect the inferred schema any more + AddOrcFileData.writeToFile(Seq(("value1", 0)).toDF("k", "v"), src, tmp) + + testStream(fileStream)( + // Should not pick up column v in the file added before start + AddOrcFileData(Seq("value2").toDF("k"), src, tmp), + CheckAnswer("value0", "value1", "value2"), + + // Should read data in column k, and ignore v + AddOrcFileData(Seq(("value3", 1)).toDF("k", "v"), src, tmp), + CheckAnswer("value0", "value1", "value2", "value3"), + + // Should ignore rows that do not have the necessary k column + AddOrcFileData(Seq("value5").toDF("v"), src, tmp), + CheckAnswer("value0", "value1", "value2", "value3", null) + ) + } + } + } + // =============== Parquet file stream tests ================ test("read from parquet files") { From 7570eab6bee57172ee3746207261307690a57b72 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 20 Dec 2017 11:31:11 -0600 Subject: [PATCH 153/356] [SPARK-22788][STREAMING] Use correct hadoop config for fs append support. Still look at the old one in case any Spark user is setting it explicitly, though. Author: Marcelo Vanzin Closes #19983 from vanzin/SPARK-22788. 
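
For reference, a minimal sketch of the resulting check (illustrative only, not part of the patch; the real logic is the small condition in `HdfsUtils` below): either key enables append, with the standard Hadoop name `dfs.support.append` consulted first and the old Spark-specific `hdfs.append.support` still honored for users who set it explicitly.

```scala
object AppendSupportCheck {
  import org.apache.hadoop.conf.Configuration
  import org.apache.hadoop.fs.{FileSystem, RawLocalFileSystem}

  // Illustrative helper mirroring the condition this change installs.
  def appendSupported(conf: Configuration, fs: FileSystem): Boolean = {
    conf.getBoolean("dfs.support.append", true) ||      // standard Hadoop key, checked first
      conf.getBoolean("hdfs.append.support", false) ||  // legacy Spark-specific key, still honored
      fs.isInstanceOf[RawLocalFileSystem]                // local FS path keeps its existing behavior
  }
}
```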
--- .../scala/org/apache/spark/streaming/util/HdfsUtils.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala index 6a3b3200dccdb..a6997359d64d2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/HdfsUtils.scala @@ -29,7 +29,9 @@ private[streaming] object HdfsUtils { // If the file exists and we have append support, append instead of creating a new file val stream: FSDataOutputStream = { if (dfs.isFile(dfsPath)) { - if (conf.getBoolean("hdfs.append.support", false) || dfs.isInstanceOf[RawLocalFileSystem]) { + if (conf.getBoolean("dfs.support.append", true) || + conf.getBoolean("hdfs.append.support", false) || + dfs.isInstanceOf[RawLocalFileSystem]) { dfs.append(dfsPath) } else { throw new IllegalStateException("File exists and there is no append support!") From 7798c9e6ef55dbadfc9eb896fa3f366c76dc187b Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Wed, 20 Dec 2017 10:43:10 -0800 Subject: [PATCH 154/356] [SPARK-22824] Restore old offset for binary compatibility ## What changes were proposed in this pull request? Some users depend on source compatibility with the org.apache.spark.sql.execution.streaming.Offset class. Although this is not a stable interface, we can keep it in place for now to simplify upgrades to 2.3. Author: Jose Torres Closes #20012 from joseph-torres/binary-compat. --- .../spark/sql/kafka010/KafkaSource.scala | 1 - .../sql/kafka010/KafkaSourceOffset.scala | 3 +- .../spark/sql/kafka010/KafkaSourceSuite.scala | 1 - .../spark/sql/sources/v2/reader/Offset.java | 6 +- .../streaming/FileStreamSource.scala | 1 - .../streaming/FileStreamSourceOffset.scala | 2 - .../sql/execution/streaming/LongOffset.scala | 2 - .../streaming/MicroBatchExecution.scala | 1 - .../spark/sql/execution/streaming/Offset.java | 61 +++++++++++++++++++ .../sql/execution/streaming/OffsetSeq.scala | 1 - .../execution/streaming/OffsetSeqLog.scala | 1 - .../streaming/RateSourceProvider.scala | 3 +- .../streaming/RateStreamOffset.scala | 4 +- .../{Offset.scala => SerializedOffset.scala} | 3 - .../sql/execution/streaming/Source.scala | 1 - .../execution/streaming/StreamExecution.scala | 1 - .../execution/streaming/StreamProgress.scala | 2 - .../ContinuousRateStreamSource.scala | 3 +- .../sql/execution/streaming/memory.scala | 1 - .../sql/execution/streaming/socket.scala | 1 - .../{ => sources}/RateStreamSourceV2.scala | 7 ++- .../streaming/{ => sources}/memoryV2.scala | 3 +- .../streaming/MemorySinkV2Suite.scala | 1 + .../streaming/RateSourceV2Suite.scala | 1 + .../sql/streaming/FileStreamSourceSuite.scala | 1 - .../spark/sql/streaming/OffsetSuite.scala | 3 +- .../spark/sql/streaming/StreamSuite.scala | 1 - .../spark/sql/streaming/StreamTest.scala | 1 - .../streaming/StreamingAggregationSuite.scala | 1 - .../StreamingQueryListenerSuite.scala | 1 - .../sql/streaming/StreamingQuerySuite.scala | 2 - .../test/DataStreamReaderWriterSuite.scala | 1 - .../sql/streaming/util/BlockingSource.scala | 3 +- 33 files changed, 82 insertions(+), 43 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.java rename sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/{Offset.scala => SerializedOffset.scala} (95%) rename sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/{ => 
sources}/RateStreamSourceV2.scala (96%) rename sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/{ => sources}/memoryV2.scala (98%) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 87f31fcc20ae6..e9cff04ba5f2e 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -32,7 +32,6 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.kafka010.KafkaSource._ -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala index 6e24423df4abc..b5da415b3097e 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala @@ -19,8 +19,7 @@ package org.apache.spark.sql.kafka010 import org.apache.kafka.common.TopicPartition -import org.apache.spark.sql.execution.streaming.SerializedOffset -import org.apache.spark.sql.sources.v2.reader.Offset +import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} /** * An [[Offset]] for the [[KafkaSource]]. This one tracks all partitions of subscribed topics and diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 9cac0e5ae7117..2034b9be07f24 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -38,7 +38,6 @@ import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.kafka010.KafkaSourceProvider._ -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java index 1ebd35356f1a3..ce1c489742054 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java @@ -23,7 +23,7 @@ * restart checkpoints. Sources should provide an Offset implementation which they can use to * reconstruct the stream position where the offset was taken. */ -public abstract class Offset { +public abstract class Offset extends org.apache.spark.sql.execution.streaming.Offset { /** * A JSON-serialized representation of an Offset that is * used for saving offsets to the offset log. 
@@ -41,8 +41,8 @@ public abstract class Offset { */ @Override public boolean equals(Object obj) { - if (obj instanceof Offset) { - return this.json().equals(((Offset) obj).json()); + if (obj instanceof org.apache.spark.sql.execution.streaming.Offset) { + return this.json().equals(((org.apache.spark.sql.execution.streaming.Offset) obj).json()); } else { return false; } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala index a33b785126765..0debd7db84757 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSource.scala @@ -27,7 +27,6 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import org.apache.spark.sql.execution.datasources.{DataSource, InMemoryFileIndex, LogicalRelation} -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceOffset.scala index 431e5b99e3e98..a2b49d944a688 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceOffset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/FileStreamSourceOffset.scala @@ -22,8 +22,6 @@ import scala.util.control.Exception._ import org.json4s.NoTypeHints import org.json4s.jackson.Serialization -import org.apache.spark.sql.sources.v2.reader.Offset - /** * Offset for the [[FileStreamSource]]. * diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala index 7ea31462ca7b0..5f0b195fcfcb8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/LongOffset.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.streaming -import org.apache.spark.sql.sources.v2.reader.Offset - /** * A simple offset for sources that produce a single linear stream of data. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index a67dda99dc01b..4a3de8bae4bc9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -24,7 +24,6 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.util.{Clock, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.java b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.java new file mode 100644 index 0000000000000..80aa5505db991 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming; + +/** + * This is an internal, deprecated interface. New source implementations should use the + * org.apache.spark.sql.sources.v2.reader.Offset class, which is the one that will be supported + * in the long term. + * + * This class will be removed in a future release. + */ +public abstract class Offset { + /** + * A JSON-serialized representation of an Offset that is + * used for saving offsets to the offset log. + * Note: We assume that equivalent/equal offsets serialize to + * identical JSON strings. + * + * @return JSON string encoding + */ + public abstract String json(); + + /** + * Equality based on JSON string representation. We leverage the + * JSON representation for normalization between the Offset's + * in memory and on disk representations. 
+ */ + @Override + public boolean equals(Object obj) { + if (obj instanceof Offset) { + return this.json().equals(((Offset) obj).json()); + } else { + return false; + } + } + + @Override + public int hashCode() { + return this.json().hashCode(); + } + + @Override + public String toString() { + return this.json(); + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala index dcc5935890c8d..4e0a468b962a2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala @@ -23,7 +23,6 @@ import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.RuntimeConfig import org.apache.spark.sql.internal.SQLConf.{SHUFFLE_PARTITIONS, STATE_STORE_PROVIDER_CLASS} -import org.apache.spark.sql.sources.v2.reader.Offset /** * An ordered collection of offsets, used to track the progress of processing data from one or more diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala index bfdbc65296165..e3f4abcf9f1dc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLog.scala @@ -24,7 +24,6 @@ import java.nio.charset.StandardCharsets._ import scala.io.{Source => IOSource} import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.sources.v2.reader.Offset /** * This class is used to log offsets to persistent files in HDFS. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala index 50671a46599e6..41761324cf6ac 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala @@ -30,9 +30,10 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} import org.apache.spark.sql.execution.streaming.continuous.ContinuousRateStreamReader +import org.apache.spark.sql.execution.streaming.sources.RateStreamV2Reader import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.reader.{ContinuousReader, MicroBatchReader, Offset} +import org.apache.spark.sql.sources.v2.reader.{ContinuousReader, MicroBatchReader} import org.apache.spark.sql.types._ import org.apache.spark.util.{ManualClock, SystemClock} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala index 13679dfbe446b..726d8574af52b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala @@ -20,10 +20,10 @@ package org.apache.spark.sql.execution.streaming import org.json4s.DefaultFormats import org.json4s.jackson.Serialization -import 
org.apache.spark.sql.sources.v2.reader.Offset +import org.apache.spark.sql.sources.v2 case class RateStreamOffset(partitionToValueAndRunTimeMs: Map[Int, (Long, Long)]) - extends Offset { + extends v2.reader.Offset { implicit val defaultFormats: DefaultFormats = DefaultFormats override val json = Serialization.write(partitionToValueAndRunTimeMs) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/SerializedOffset.scala similarity index 95% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/SerializedOffset.scala index 73f0c6221c5c1..129cfed860eb6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Offset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/SerializedOffset.scala @@ -17,9 +17,6 @@ package org.apache.spark.sql.execution.streaming -import org.apache.spark.sql.sources.v2.reader.Offset - - /** * Used when loading a JSON serialized offset from external storage. * We are currently not responsible for converting JSON serialized diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala index dbb408ffc98d8..311942f6dbd84 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.execution.streaming import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 7946889e85e37..129995dcf3607 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -36,7 +36,6 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.command.StreamingExplainCommand import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming._ import org.apache.spark.util.{Clock, UninterruptibleThread, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala index 770db401c9fd7..a3f3662e6f4c9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.execution.streaming import scala.collection.{immutable, GenTraversableOnce} -import org.apache.spark.sql.sources.v2.reader.Offset - /** * A helper class that looks like a Map[Source, Offset]. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index 77fc26730e52c..4c3a1ee201ac1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -25,7 +25,8 @@ import org.json4s.jackson.Serialization import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.{RateSourceProvider, RateStreamOffset} +import org.apache.spark.sql.execution.streaming.sources.RateStreamSourceV2 import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2, DataSourceV2Options} import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala index db0717510a2cb..3041d4d703cb4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memory.scala @@ -32,7 +32,6 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, Statistics} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes._ import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala index 440cae016a173..0b22cbc46e6bf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/socket.scala @@ -31,7 +31,6 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType} import org.apache.spark.unsafe.types.UTF8String diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala similarity index 96% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamSourceV2.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala index 102551c238bfb..45dc7d75cbc8d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.execution.streaming +package org.apache.spark.sql.execution.streaming.sources import java.util.Optional @@ -27,6 +27,7 @@ import org.json4s.jackson.Serialization import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.streaming.RateStreamOffset import org.apache.spark.sql.sources.v2.DataSourceV2Options import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} @@ -59,7 +60,9 @@ class RateStreamV2Reader(options: DataSourceV2Options) private var start: RateStreamOffset = _ private var end: RateStreamOffset = _ - override def setOffsetRange(start: Optional[Offset], end: Optional[Offset]): Unit = { + override def setOffsetRange( + start: Optional[Offset], + end: Optional[Offset]): Unit = { this.start = start.orElse( RateStreamSourceV2.createInitialOffset(numPartitions, creationTimeMs)) .asInstanceOf[RateStreamOffset] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memoryV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala similarity index 98% rename from sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memoryV2.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala index 437040cc12472..94c5dd63089b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/memoryV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.execution.streaming +package org.apache.spark.sql.execution.streaming.sources import javax.annotation.concurrent.GuardedBy @@ -26,6 +26,7 @@ import scala.util.control.NonFatal import org.apache.spark.internal.Logging import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Complete, Update} +import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources.v2.{ContinuousWriteSupport, DataSourceV2, DataSourceV2Options, MicroBatchWriteSupport} import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.streaming.OutputMode diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala index be4b490754986..00d4f0b8503d8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/MemorySinkV2Suite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.streaming import org.scalatest.BeforeAndAfter import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.streaming.sources._ import org.apache.spark.sql.streaming.{OutputMode, StreamTest} class MemorySinkV2Suite extends StreamTest with BeforeAndAfter { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index ef801ceb1310c..6514c5f0fdfeb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -24,6 +24,7 @@ import scala.collection.JavaConverters._ 
import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.continuous._ +import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamSourceV2, RateStreamV2Reader} import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2Options, MicroBatchReadSupport} import org.apache.spark.sql.streaming.StreamTest diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala index f4fa7fa7954d6..39bb572740617 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala @@ -32,7 +32,6 @@ import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.FileStreamSource.{FileEntry, SeenFilesMap} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.ExistsThrowsExceptionFileSystem._ import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.SharedSQLContext diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/OffsetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/OffsetSuite.scala index 429748261f1ea..f208f9bd9b6e3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/OffsetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/OffsetSuite.scala @@ -18,8 +18,7 @@ package org.apache.spark.sql.streaming import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.execution.streaming.{LongOffset, SerializedOffset} -import org.apache.spark.sql.sources.v2.reader.Offset +import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, SerializedOffset} trait OffsetSuite extends SparkFunSuite { /** Creates test to check all the comparisons of offsets given a `one` that is less than `two`. 
*/ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index fa4b2dd6a6c9b..755490308b5b9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -39,7 +39,6 @@ import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreCon import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.StreamSourceProvider -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.types.{IntegerType, StructField, StructType} import org.apache.spark.util.Utils diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index fb88c5d327043..71a474ef63e84 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -39,7 +39,6 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.state.StateStore -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.StreamingQueryListener._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.{Clock, SystemClock, Utils} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala index 38aa5171314f2..97e065193fd05 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingAggregationSuite.scala @@ -34,7 +34,6 @@ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.OutputMode._ import org.apache.spark.sql.streaming.util.{MockSourceProvider, StreamManualClock} import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala index fc9ac2a56c4e5..9ff02dee288fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQueryListenerSuite.scala @@ -33,7 +33,6 @@ import org.apache.spark.scheduler._ import org.apache.spark.sql.{Encoder, SparkSession} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.StreamingQueryListener._ import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.util.JsonProtocol diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala 
index ad4d3abd01aa5..2fa4595dab376 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -33,12 +33,10 @@ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.util.{BlockingSource, MockSourceProvider, StreamManualClock} import org.apache.spark.sql.types.StructType import org.apache.spark.util.ManualClock - class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging with MockitoSugar { import AwaitTerminationTester._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala index 952908f21ca60..aa163d2211c38 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/test/DataStreamReaderWriterSuite.scala @@ -32,7 +32,6 @@ import org.apache.spark.sql._ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider} -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.{ProcessingTime => DeprecatedProcessingTime, _} import org.apache.spark.sql.streaming.Trigger._ import org.apache.spark.sql.types._ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala index 9a35f097e6e40..19ab2ff13e14e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/util/BlockingSource.scala @@ -20,9 +20,8 @@ package org.apache.spark.sql.streaming.util import java.util.concurrent.CountDownLatch import org.apache.spark.sql.{SQLContext, _} -import org.apache.spark.sql.execution.streaming.{LongOffset, Sink, Source} +import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source} import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider} -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{IntegerType, StructField, StructType} From d762d110d410b8d67d74eb4d2950cc556ac74123 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Wed, 20 Dec 2017 12:50:03 -0600 Subject: [PATCH 155/356] [SPARK-22832][ML] BisectingKMeans unpersist unused datasets ## What changes were proposed in this pull request? unpersist unused datasets ## How was this patch tested? existing tests and local check in Spark-Shell Author: Zheng RuiFeng Closes #20017 from zhengruifeng/bkm_unpersist. 
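For reference, the cleanup pattern this patch applies can be sketched independently of BisectingKMeans. The snippet below is an illustrative standalone example, not code from this patch (the object, RDDs and loop are hypothetical): persist each iteration's result, unpersist the previous iteration non-blockingly once the new one is materialized, and release everything after the loop.

```
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

object UnpersistPatternSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("unpersist-sketch").setMaster("local[2]"))

    // Hypothetical iterative computation: each round derives a new cached RDD
    // from the previous one, so the previous round's copy becomes unused.
    var previous: RDD[Int] = null
    var current: RDD[Int] = sc.parallelize(1 to 1000).persist(StorageLevel.MEMORY_AND_DISK)

    for (_ <- 1 to 5) {
      val next = current.map(_ + 1).persist(StorageLevel.MEMORY_AND_DISK)
      next.count()  // materialize the new RDD before dropping the old copy
      if (previous != null) {
        previous.unpersist(false)  // non-blocking unpersist, as in the patch
      }
      previous = current
      current = next
    }

    // Final cleanup so no intermediate RDD lingers in the cache after the loop.
    if (previous != null) previous.unpersist(false)
    current.unpersist(false)

    sc.stop()
  }
}
```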
--- .../spark/mllib/clustering/BisectingKMeans.scala | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index ae98e24a75681..9b9c70cfe5109 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -197,7 +197,9 @@ class BisectingKMeans private ( newClusters = summarize(d, newAssignments) newClusterCenters = newClusters.mapValues(_.center).map(identity) } - if (preIndices != null) preIndices.unpersist() + if (preIndices != null) { + preIndices.unpersist(false) + } preIndices = indices indices = updateAssignments(assignments, divisibleIndices, newClusterCenters).keys .persist(StorageLevel.MEMORY_AND_DISK) @@ -212,7 +214,13 @@ class BisectingKMeans private ( } level += 1 } - if(indices != null) indices.unpersist() + if (preIndices != null) { + preIndices.unpersist(false) + } + if (indices != null) { + indices.unpersist(false) + } + norms.unpersist(false) val clusters = activeClusters ++ inactiveClusters val root = buildTree(clusters) new BisectingKMeansModel(root) From c89b43118347677f122db190c9033394c15cee30 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 20 Dec 2017 11:19:57 -0800 Subject: [PATCH 156/356] [SPARK-22849] ivy.retrieve pattern should also consider `classifier` ## What changes were proposed in this pull request? In the previous PR https://github.com/apache/spark/pull/5755#discussion_r157848354, we dropped `(-[classifier])` from the retrieval pattern. We should add it back; otherwise, > If this pattern for instance doesn't has the [type] or [classifier] token, Ivy will download the source/javadoc artifacts to the same file as the regular jar. ## How was this patch tested? The existing tests Author: gatorsmile Closes #20037 from gatorsmile/addClassifier. --- core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index ab834bb682041..cbe1f2c3e08a1 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -1271,7 +1271,7 @@ private[spark] object SparkSubmitUtils { // retrieve all resolved dependencies ivy.retrieve(rr.getModuleDescriptor.getModuleRevisionId, packagesDirectory.getAbsolutePath + File.separator + - "[organization]_[artifact]-[revision].[ext]", + "[organization]_[artifact]-[revision](-[classifier]).[ext]", retrieveOptions.setConfs(Array(ivyConfName))) resolveDependencyPaths(rr.getArtifacts.toArray, packagesDirectory) } finally { From 792915c8449b606cfdd50401fb349194a2558c36 Mon Sep 17 00:00:00 2001 From: chetkhatri Date: Wed, 20 Dec 2017 14:47:49 -0600 Subject: [PATCH 157/356] [SPARK-22830] Scala Coding style has been improved in Spark Examples ## What changes were proposed in this pull request? * Under Spark Scala Examples: Some of the syntax were written like Java way, It has been re-written as per scala style guide. * Most of all changes are followed to println() statement. ## How was this patch tested? Since, All changes proposed are re-writing println statements in scala way, manual run used to test println. 
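For illustration, the kind of rewrite applied throughout the examples looks like the following standalone sketch (the object is hypothetical; the message text mirrors the HdfsTest change in the diff below):

```
object PrintlnStyleSketch {
  def main(args: Array[String]): Unit = {
    val iter = 3
    val elapsedMs = 42L

    // Java-style string concatenation, the form being replaced:
    println("Iteration " + iter + " took " + elapsedMs + " ms")

    // Scala string interpolation, the form preferred by the style guide:
    println(s"Iteration $iter took $elapsedMs ms")
  }
}
```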
Author: chetkhatri Closes #20016 from chetkhatri/scala-style-spark-examples. --- .../apache/spark/examples/BroadcastTest.scala | 2 +- .../spark/examples/DFSReadWriteTest.scala | 24 ++++++------ .../org/apache/spark/examples/HdfsTest.scala | 2 +- .../org/apache/spark/examples/LocalALS.scala | 3 +- .../apache/spark/examples/LocalFileLR.scala | 6 +-- .../apache/spark/examples/LocalKMeans.scala | 4 +- .../org/apache/spark/examples/LocalLR.scala | 6 +-- .../org/apache/spark/examples/LocalPi.scala | 2 +- .../examples/SimpleSkewedGroupByTest.scala | 2 +- .../org/apache/spark/examples/SparkALS.scala | 4 +- .../apache/spark/examples/SparkHdfsLR.scala | 6 +-- .../apache/spark/examples/SparkKMeans.scala | 2 +- .../org/apache/spark/examples/SparkLR.scala | 6 +-- .../apache/spark/examples/SparkPageRank.scala | 2 +- .../org/apache/spark/examples/SparkPi.scala | 2 +- .../org/apache/spark/examples/SparkTC.scala | 2 +- .../spark/examples/graphx/Analytics.scala | 37 ++++++++++--------- 17 files changed, 54 insertions(+), 58 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala index 25718f904cc49..3311de12dbd97 100644 --- a/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/BroadcastTest.scala @@ -42,7 +42,7 @@ object BroadcastTest { val arr1 = (0 until num).toArray for (i <- 0 until 3) { - println("Iteration " + i) + println(s"Iteration $i") println("===========") val startTime = System.nanoTime val barr1 = sc.broadcast(arr1) diff --git a/examples/src/main/scala/org/apache/spark/examples/DFSReadWriteTest.scala b/examples/src/main/scala/org/apache/spark/examples/DFSReadWriteTest.scala index 3bff7ce736d08..1a779716ec4c0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/DFSReadWriteTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/DFSReadWriteTest.scala @@ -49,12 +49,10 @@ object DFSReadWriteTest { } private def printUsage(): Unit = { - val usage: String = "DFS Read-Write Test\n" + - "\n" + - "Usage: localFile dfsDir\n" + - "\n" + - "localFile - (string) local file to use in test\n" + - "dfsDir - (string) DFS directory for read/write tests\n" + val usage = """DFS Read-Write Test + |Usage: localFile dfsDir + |localFile - (string) local file to use in test + |dfsDir - (string) DFS directory for read/write tests""".stripMargin println(usage) } @@ -69,13 +67,13 @@ object DFSReadWriteTest { localFilePath = new File(args(i)) if (!localFilePath.exists) { - System.err.println("Given path (" + args(i) + ") does not exist.\n") + System.err.println(s"Given path (${args(i)}) does not exist") printUsage() System.exit(1) } if (!localFilePath.isFile) { - System.err.println("Given path (" + args(i) + ") is not a file.\n") + System.err.println(s"Given path (${args(i)}) is not a file") printUsage() System.exit(1) } @@ -108,7 +106,7 @@ object DFSReadWriteTest { .getOrCreate() println("Writing local file to DFS") - val dfsFilename = dfsDirPath + "/dfs_read_write_test" + val dfsFilename = s"$dfsDirPath/dfs_read_write_test" val fileRDD = spark.sparkContext.parallelize(fileContents) fileRDD.saveAsTextFile(dfsFilename) @@ -127,11 +125,11 @@ object DFSReadWriteTest { spark.stop() if (localWordCount == dfsWordCount) { - println(s"Success! Local Word Count ($localWordCount) " + - s"and DFS Word Count ($dfsWordCount) agree.") + println(s"Success! 
Local Word Count $localWordCount and " + + s"DFS Word Count $dfsWordCount agree.") } else { - println(s"Failure! Local Word Count ($localWordCount) " + - s"and DFS Word Count ($dfsWordCount) disagree.") + println(s"Failure! Local Word Count $localWordCount " + + s"and DFS Word Count $dfsWordCount disagree.") } } diff --git a/examples/src/main/scala/org/apache/spark/examples/HdfsTest.scala b/examples/src/main/scala/org/apache/spark/examples/HdfsTest.scala index aa8de69839e28..e1f985ece8c06 100644 --- a/examples/src/main/scala/org/apache/spark/examples/HdfsTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/HdfsTest.scala @@ -39,7 +39,7 @@ object HdfsTest { val start = System.currentTimeMillis() for (x <- mapped) { x + 2 } val end = System.currentTimeMillis() - println("Iteration " + iter + " took " + (end-start) + " ms") + println(s"Iteration $iter took ${end-start} ms") } spark.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalALS.scala b/examples/src/main/scala/org/apache/spark/examples/LocalALS.scala index 97aefac025e55..3f9cea35d6503 100644 --- a/examples/src/main/scala/org/apache/spark/examples/LocalALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/LocalALS.scala @@ -129,8 +129,7 @@ object LocalALS { println(s"Iteration $iter:") ms = (0 until M).map(i => updateMovie(i, ms(i), us, R)).toArray us = (0 until U).map(j => updateUser(j, us(j), ms, R)).toArray - println("RMSE = " + rmse(R, ms, us)) - println() + println(s"RMSE = ${rmse(R, ms, us)}") } } diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala b/examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala index 8dbb7ee4e5307..5512e33e41ac3 100644 --- a/examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala +++ b/examples/src/main/scala/org/apache/spark/examples/LocalFileLR.scala @@ -58,10 +58,10 @@ object LocalFileLR { // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} - println("Initial w: " + w) + println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { - println("On iteration " + i) + println(s"On iteration $i") val gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y @@ -71,7 +71,7 @@ object LocalFileLR { } fileSrc.close() - println("Final w: " + w) + println(s"Final w: $w") } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalKMeans.scala b/examples/src/main/scala/org/apache/spark/examples/LocalKMeans.scala index 963c9a56d6cac..f5162a59522f0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/LocalKMeans.scala +++ b/examples/src/main/scala/org/apache/spark/examples/LocalKMeans.scala @@ -88,7 +88,7 @@ object LocalKMeans { kPoints.put(i, iter.next()) } - println("Initial centers: " + kPoints) + println(s"Initial centers: $kPoints") while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) @@ -114,7 +114,7 @@ object LocalKMeans { } } - println("Final centers: " + kPoints) + println(s"Final centers: $kPoints") } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala b/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala index eb5221f085937..bde8ccd305960 100644 --- a/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala +++ b/examples/src/main/scala/org/apache/spark/examples/LocalLR.scala @@ -61,10 +61,10 @@ object LocalLR { val data = generateData // 
Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} - println("Initial w: " + w) + println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { - println("On iteration " + i) + println(s"On iteration $i") val gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y @@ -73,7 +73,7 @@ object LocalLR { w -= gradient } - println("Final w: " + w) + println(s"Final w: $w") } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/LocalPi.scala b/examples/src/main/scala/org/apache/spark/examples/LocalPi.scala index 121b768e4198e..a93c15c85cfc1 100644 --- a/examples/src/main/scala/org/apache/spark/examples/LocalPi.scala +++ b/examples/src/main/scala/org/apache/spark/examples/LocalPi.scala @@ -28,7 +28,7 @@ object LocalPi { val y = random * 2 - 1 if (x*x + y*y <= 1) count += 1 } - println("Pi is roughly " + 4 * count / 100000.0) + println(s"Pi is roughly ${4 * count / 100000.0}") } } // scalastyle:on println diff --git a/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala b/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala index 8e1a574c92221..e64dcbd182d94 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala @@ -59,7 +59,7 @@ object SimpleSkewedGroupByTest { // Enforce that everything has been calculated and in cache pairs1.count - println("RESULT: " + pairs1.groupByKey(numReducers).count) + println(s"RESULT: ${pairs1.groupByKey(numReducers).count}") // Print how many keys each reducer got (for debugging) // println("RESULT: " + pairs1.groupByKey(numReducers) // .map{case (k,v) => (k, v.size)} diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala b/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala index a99ddd9fd37db..d3e7b7a967de7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkALS.scala @@ -135,10 +135,8 @@ object SparkALS { .map(i => update(i, usb.value(i), msb.value, Rc.value.transpose())) .collect() usb = sc.broadcast(us) // Re-broadcast us because it was updated - println("RMSE = " + rmse(R, ms, us)) - println() + println(s"RMSE = ${rmse(R, ms, us)}") } - spark.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala b/examples/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala index 9d675bbc18f38..23eaa879114a9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala @@ -79,17 +79,17 @@ object SparkHdfsLR { // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} - println("Initial w: " + w) + println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { - println("On iteration " + i) + println(s"On iteration $i") val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } - println("Final w: " + w) + println(s"Final w: $w") spark.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala b/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala index fec3160e9f37b..b005cb6971c16 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala +++ 
b/examples/src/main/scala/org/apache/spark/examples/SparkKMeans.scala @@ -95,7 +95,7 @@ object SparkKMeans { for (newP <- newPoints) { kPoints(newP._1) = newP._2 } - println("Finished iteration (delta = " + tempDist + ")") + println(s"Finished iteration (delta = $tempDist)") } println("Final centers:") diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala b/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala index c18e3d31f149e..4b1497345af82 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkLR.scala @@ -73,17 +73,17 @@ object SparkLR { // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} - println("Initial w: " + w) + println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { - println("On iteration " + i) + println(s"On iteration $i") val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } - println("Final w: " + w) + println(s"Final w: $w") spark.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala index 5d8831265e4ad..9299bad5d3290 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkPageRank.scala @@ -77,7 +77,7 @@ object SparkPageRank { } val output = ranks.collect() - output.foreach(tup => println(tup._1 + " has rank: " + tup._2 + ".")) + output.foreach(tup => println(s"${tup._1} has rank: ${tup._2} .")) spark.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala b/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala index a5cacf17a5cca..828d98b5001d7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala @@ -36,7 +36,7 @@ object SparkPi { val y = random * 2 - 1 if (x*x + y*y <= 1) 1 else 0 }.reduce(_ + _) - println("Pi is roughly " + 4.0 * count / (n - 1)) + println(s"Pi is roughly ${4.0 * count / (n - 1)}") spark.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/SparkTC.scala b/examples/src/main/scala/org/apache/spark/examples/SparkTC.scala index 558295ab928af..f5d42141f5dd2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SparkTC.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SparkTC.scala @@ -68,7 +68,7 @@ object SparkTC { nextCount = tc.count() } while (nextCount != oldCount) - println("TC has " + tc.count() + " edges.") + println(s"TC has ${tc.count()} edges.") spark.stop() } } diff --git a/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala b/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala index 619e585b6ca17..92936bd30dbc0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala @@ -27,6 +27,7 @@ import org.apache.spark.graphx.lib._ import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel + /** * Driver program for running graph algorithms. 
*/ @@ -34,12 +35,12 @@ object Analytics extends Logging { def main(args: Array[String]): Unit = { if (args.length < 2) { - System.err.println( - "Usage: Analytics --numEPart= [other options]") - System.err.println("Supported 'taskType' as follows:") - System.err.println(" pagerank Compute PageRank") - System.err.println(" cc Compute the connected components of vertices") - System.err.println(" triangles Count the number of triangles") + val usage = """Usage: Analytics --numEPart= + |[other options] Supported 'taskType' as follows: + |pagerank Compute PageRank + |cc Compute the connected components of vertices + |triangles Count the number of triangles""".stripMargin + System.err.println(usage) System.exit(1) } @@ -48,7 +49,7 @@ object Analytics extends Logging { val optionsList = args.drop(2).map { arg => arg.dropWhile(_ == '-').split('=') match { case Array(opt, v) => (opt -> v) - case _ => throw new IllegalArgumentException("Invalid argument: " + arg) + case _ => throw new IllegalArgumentException(s"Invalid argument: $arg") } } val options = mutable.Map(optionsList: _*) @@ -74,14 +75,14 @@ object Analytics extends Logging { val numIterOpt = options.remove("numIter").map(_.toInt) options.foreach { - case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt) + case (opt, _) => throw new IllegalArgumentException(s"Invalid option: $opt") } println("======================================") println("| PageRank |") println("======================================") - val sc = new SparkContext(conf.setAppName("PageRank(" + fname + ")")) + val sc = new SparkContext(conf.setAppName(s"PageRank($fname)")) val unpartitionedGraph = GraphLoader.edgeListFile(sc, fname, numEdgePartitions = numEPart, @@ -89,18 +90,18 @@ object Analytics extends Logging { vertexStorageLevel = vertexStorageLevel).cache() val graph = partitionStrategy.foldLeft(unpartitionedGraph)(_.partitionBy(_)) - println("GRAPHX: Number of vertices " + graph.vertices.count) - println("GRAPHX: Number of edges " + graph.edges.count) + println(s"GRAPHX: Number of vertices ${graph.vertices.count}") + println(s"GRAPHX: Number of edges ${graph.edges.count}") val pr = (numIterOpt match { case Some(numIter) => PageRank.run(graph, numIter) case None => PageRank.runUntilConvergence(graph, tol) }).vertices.cache() - println("GRAPHX: Total rank: " + pr.map(_._2).reduce(_ + _)) + println(s"GRAPHX: Total rank: ${pr.map(_._2).reduce(_ + _)}") if (!outFname.isEmpty) { - logWarning("Saving pageranks of pages to " + outFname) + logWarning(s"Saving pageranks of pages to $outFname") pr.map { case (id, r) => id + "\t" + r }.saveAsTextFile(outFname) } @@ -108,14 +109,14 @@ object Analytics extends Logging { case "cc" => options.foreach { - case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt) + case (opt, _) => throw new IllegalArgumentException(s"Invalid option: $opt") } println("======================================") println("| Connected Components |") println("======================================") - val sc = new SparkContext(conf.setAppName("ConnectedComponents(" + fname + ")")) + val sc = new SparkContext(conf.setAppName(s"ConnectedComponents($fname)")) val unpartitionedGraph = GraphLoader.edgeListFile(sc, fname, numEdgePartitions = numEPart, edgeStorageLevel = edgeStorageLevel, @@ -123,19 +124,19 @@ object Analytics extends Logging { val graph = partitionStrategy.foldLeft(unpartitionedGraph)(_.partitionBy(_)) val cc = ConnectedComponents.run(graph) - println("Components: " + cc.vertices.map { case (vid, data) 
=> data }.distinct()) + println(s"Components: ${cc.vertices.map { case (vid, data) => data }.distinct()}") sc.stop() case "triangles" => options.foreach { - case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt) + case (opt, _) => throw new IllegalArgumentException(s"Invalid option: $opt") } println("======================================") println("| Triangle Count |") println("======================================") - val sc = new SparkContext(conf.setAppName("TriangleCount(" + fname + ")")) + val sc = new SparkContext(conf.setAppName(s"TriangleCount($fname)")) val graph = GraphLoader.edgeListFile(sc, fname, canonicalOrientation = true, numEdgePartitions = numEPart, From b176014857b056e45cbc60ac77f1a95bc31cb0fa Mon Sep 17 00:00:00 2001 From: wuyi Date: Wed, 20 Dec 2017 14:27:56 -0800 Subject: [PATCH 158/356] [SPARK-22847][CORE] Remove redundant code in AppStatusListener while assigning schedulingPool for stage ## What changes were proposed in this pull request? In AppStatusListener's onStageSubmitted(event: SparkListenerStageSubmitted) method, there are duplicate code: ``` // schedulingPool was assigned twice with the same code stage.schedulingPool = Option(event.properties).flatMap { p => Option(p.getProperty("spark.scheduler.pool")) }.getOrElse(SparkUI.DEFAULT_POOL_NAME) ... ... ... stage.schedulingPool = Option(event.properties).flatMap { p => Option(p.getProperty("spark.scheduler.pool")) }.getOrElse(SparkUI.DEFAULT_POOL_NAME) ``` But, it does not make any sense to do this and there are no comment to explain for this. ## How was this patch tested? N/A Author: wuyi Closes #20033 from Ngone51/dev-spark-22847. --- .../scala/org/apache/spark/status/AppStatusListener.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 1fb7b76d43d04..87eb84d94c005 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -329,10 +329,6 @@ private[spark] class AppStatusListener( .toSeq stage.jobIds = stage.jobs.map(_.jobId).toSet - stage.schedulingPool = Option(event.properties).flatMap { p => - Option(p.getProperty("spark.scheduler.pool")) - }.getOrElse(SparkUI.DEFAULT_POOL_NAME) - stage.description = Option(event.properties).flatMap { p => Option(p.getProperty(SparkContext.SPARK_JOB_DESCRIPTION)) } From 0114c89d049724b95f7823b957bf33790216316b Mon Sep 17 00:00:00 2001 From: foxish Date: Wed, 20 Dec 2017 16:14:36 -0800 Subject: [PATCH 159/356] [SPARK-22845][SCHEDULER] Modify spark.kubernetes.allocation.batch.delay to take time instead of int ## What changes were proposed in this pull request? Fixing configuration that was taking an int which should take time. Discussion in https://github.com/apache/spark/pull/19946#discussion_r156682354 Made the granularity milliseconds as opposed to seconds since there's a use-case for sub-second reactions to scale-up rapidly especially with dynamic allocation. ## How was this patch tested? TODO: manual run of integration tests against this PR. PTAL cc/ mccheah liyinan926 kimoonkim vanzin mridulm jiangxb1987 ueshin Author: foxish Closes #20032 from foxish/fix-time-conf. 
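As a concrete illustration of the user-facing effect, the sketch below shows how the setting could now be supplied as a time string. The object name and the "500ms" value are illustrative assumptions; only the config key and the "1s" default come from this patch.

```
import org.apache.spark.SparkConf

object AllocationDelaySketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical driver-side configuration; only the config key comes from this patch.
    val conf = new SparkConf()
      .setAppName("k8s-allocation-delay-sketch")
      // The value is now parsed as a time string, so sub-second delays such as
      // "500ms" are expressible; the default stays at "1s".
      .set("spark.kubernetes.allocation.batch.delay", "500ms")

    println(conf.get("spark.kubernetes.allocation.batch.delay"))
  }
}
```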
--- .../main/scala/org/apache/spark/deploy/k8s/Config.scala | 8 ++++---- .../cluster/k8s/KubernetesClusterSchedulerBackend.scala | 2 +- .../k8s/KubernetesClusterSchedulerBackendSuite.scala | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index 04aadb4b06af4..45f527959cbe1 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -102,10 +102,10 @@ private[spark] object Config extends Logging { val KUBERNETES_ALLOCATION_BATCH_DELAY = ConfigBuilder("spark.kubernetes.allocation.batch.delay") - .doc("Number of seconds to wait between each round of executor allocation.") - .longConf - .checkValue(value => value > 0, "Allocation batch delay should be a positive integer") - .createWithDefault(1) + .doc("Time to wait between each round of executor allocation.") + .timeConf(TimeUnit.MILLISECONDS) + .checkValue(value => value > 0, "Allocation batch delay must be a positive time value.") + .createWithDefaultString("1s") val KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS = ConfigBuilder("spark.kubernetes.executor.lostCheck.maxAttempts") diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala index e79c987852db2..9de4b16c30d3c 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala @@ -217,7 +217,7 @@ private[spark] class KubernetesClusterSchedulerBackend( .watch(new ExecutorPodsWatcher())) allocatorExecutor.scheduleWithFixedDelay( - allocatorRunnable, 0L, podAllocationInterval, TimeUnit.SECONDS) + allocatorRunnable, 0L, podAllocationInterval, TimeUnit.MILLISECONDS) if (!Utils.isDynamicAllocationEnabled(conf)) { doRequestTotalExecutors(initialExecutors) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala index 13c09033a50ee..b2f26f205a329 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala @@ -46,7 +46,7 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn private val NAMESPACE = "test-namespace" private val SPARK_DRIVER_HOST = "localhost" private val SPARK_DRIVER_PORT = 7077 - private val POD_ALLOCATION_INTERVAL = 60L + private val POD_ALLOCATION_INTERVAL = "1m" private val DRIVER_URL = RpcEndpointAddress( SPARK_DRIVER_HOST, SPARK_DRIVER_PORT, CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString private val FIRST_EXECUTOR_POD = new PodBuilder() @@ -144,7 +144,7 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with 
BeforeAn .set(KUBERNETES_NAMESPACE, NAMESPACE) .set("spark.driver.host", SPARK_DRIVER_HOST) .set("spark.driver.port", SPARK_DRIVER_PORT.toString) - .set(KUBERNETES_ALLOCATION_BATCH_DELAY, POD_ALLOCATION_INTERVAL) + .set(KUBERNETES_ALLOCATION_BATCH_DELAY.key, POD_ALLOCATION_INTERVAL) executorPodsWatcherArgument = ArgumentCaptor.forClass(classOf[Watcher[Pod]]) allocatorRunnable = ArgumentCaptor.forClass(classOf[Runnable]) requestExecutorRunnable = ArgumentCaptor.forClass(classOf[Runnable]) @@ -162,8 +162,8 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn when(allocatorExecutor.scheduleWithFixedDelay( allocatorRunnable.capture(), mockitoEq(0L), - mockitoEq(POD_ALLOCATION_INTERVAL), - mockitoEq(TimeUnit.SECONDS))).thenReturn(null) + mockitoEq(TimeUnit.MINUTES.toMillis(1)), + mockitoEq(TimeUnit.MILLISECONDS))).thenReturn(null) // Creating Futures in Scala backed by a Java executor service resolves to running // ExecutorService#execute (as opposed to submit) doNothing().when(requestExecutorsService).execute(requestExecutorRunnable.capture()) From fb0562f34605cd27fd39d09e6664a46e55eac327 Mon Sep 17 00:00:00 2001 From: Yanbo Liang Date: Wed, 20 Dec 2017 17:51:42 -0800 Subject: [PATCH 160/356] [SPARK-22810][ML][PYSPARK] Expose Python API for LinearRegression with huber loss. ## What changes were proposed in this pull request? Expose Python API for _LinearRegression_ with _huber_ loss. ## How was this patch tested? Unit test. Author: Yanbo Liang Closes #19994 from yanboliang/spark-22810. --- .../ml/param/_shared_params_code_gen.py | 3 +- python/pyspark/ml/param/shared.py | 23 +++++++ python/pyspark/ml/regression.py | 64 +++++++++++++++---- python/pyspark/ml/tests.py | 21 ++++++ 4 files changed, 96 insertions(+), 15 deletions(-) diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 130d1a0bae7f0..d55d209d09398 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -154,7 +154,8 @@ def get$Name(self): ("aggregationDepth", "suggested depth for treeAggregate (>= 2).", "2", "TypeConverters.toInt"), ("parallelism", "the number of threads to use when running parallel algorithms (>= 1).", - "1", "TypeConverters.toInt")] + "1", "TypeConverters.toInt"), + ("loss", "the loss function to be optimized.", None, "TypeConverters.toString")] code = [] for name, doc, defaultValueStr, typeConverter in shared: diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 4041d9c43b236..e5c5ddfba6c1f 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -632,6 +632,29 @@ def getParallelism(self): return self.getOrDefault(self.parallelism) +class HasLoss(Params): + """ + Mixin for param loss: the loss function to be optimized. + """ + + loss = Param(Params._dummy(), "loss", "the loss function to be optimized.", typeConverter=TypeConverters.toString) + + def __init__(self): + super(HasLoss, self).__init__() + + def setLoss(self, value): + """ + Sets the value of :py:attr:`loss`. + """ + return self._set(loss=value) + + def getLoss(self): + """ + Gets the value of loss or its default value. + """ + return self.getOrDefault(self.loss) + + class DecisionTreeParams(Params): """ Mixin for Decision Tree parameters. 
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 9d5b768091cf4..f0812bd1d4a39 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -39,23 +39,26 @@ @inherit_doc class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, HasRegParam, HasTol, HasElasticNetParam, HasFitIntercept, - HasStandardization, HasSolver, HasWeightCol, HasAggregationDepth, + HasStandardization, HasSolver, HasWeightCol, HasAggregationDepth, HasLoss, JavaMLWritable, JavaMLReadable): """ Linear regression. - The learning objective is to minimize the squared error, with regularization. - The specific squared error loss function used is: L = 1/2n ||A coefficients - y||^2^ + The learning objective is to minimize the specified loss function, with regularization. + This supports two kinds of loss: - This supports multiple types of regularization: - - * none (a.k.a. ordinary least squares) + * squaredError (a.k.a squared loss) + * huber (a hybrid of squared error for relatively small errors and absolute error for \ + relatively large ones, and we estimate the scale parameter from training data) - * L2 (ridge regression) + This supports multiple types of regularization: - * L1 (Lasso) + * none (a.k.a. ordinary least squares) + * L2 (ridge regression) + * L1 (Lasso) + * L2 + L1 (elastic net) - * L2 + L1 (elastic net) + Note: Fitting with huber loss only supports none and L2 regularization. >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([ @@ -98,19 +101,28 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction solver = Param(Params._dummy(), "solver", "The solver algorithm for optimization. Supported " + "options: auto, normal, l-bfgs.", typeConverter=TypeConverters.toString) + loss = Param(Params._dummy(), "loss", "The loss function to be optimized. Supported " + + "options: squaredError, huber.", typeConverter=TypeConverters.toString) + + epsilon = Param(Params._dummy(), "epsilon", "The shape parameter to control the amount of " + + "robustness. Must be > 1.0. 
Only valid when loss is huber", + typeConverter=TypeConverters.toFloat) + @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - standardization=True, solver="auto", weightCol=None, aggregationDepth=2): + standardization=True, solver="auto", weightCol=None, aggregationDepth=2, + loss="squaredError", epsilon=1.35): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - standardization=True, solver="auto", weightCol=None, aggregationDepth=2) + standardization=True, solver="auto", weightCol=None, aggregationDepth=2, \ + loss="squaredError", epsilon=1.35) """ super(LinearRegression, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.regression.LinearRegression", self.uid) - self._setDefault(maxIter=100, regParam=0.0, tol=1e-6) + self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, loss="squaredError", epsilon=1.35) kwargs = self._input_kwargs self.setParams(**kwargs) @@ -118,11 +130,13 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred @since("1.4.0") def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - standardization=True, solver="auto", weightCol=None, aggregationDepth=2): + standardization=True, solver="auto", weightCol=None, aggregationDepth=2, + loss="squaredError", epsilon=1.35): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - standardization=True, solver="auto", weightCol=None, aggregationDepth=2) + standardization=True, solver="auto", weightCol=None, aggregationDepth=2, \ + loss="squaredError", epsilon=1.35) Sets params for linear regression. """ kwargs = self._input_kwargs @@ -131,6 +145,20 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre def _create_model(self, java_model): return LinearRegressionModel(java_model) + @since("2.3.0") + def setEpsilon(self, value): + """ + Sets the value of :py:attr:`epsilon`. + """ + return self._set(epsilon=value) + + @since("2.3.0") + def getEpsilon(self): + """ + Gets the value of epsilon or its default value. + """ + return self.getOrDefault(self.epsilon) + class LinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable): """ @@ -155,6 +183,14 @@ def intercept(self): """ return self._call_java("intercept") + @property + @since("2.3.0") + def scale(self): + """ + The value by which \|y - X'w\| is scaled down when loss is "huber", otherwise 1.0. 
+ """ + return self._call_java("scale") + @property @since("2.0.0") def summary(self): diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index be1521154f042..afcb0881c4dcb 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1726,6 +1726,27 @@ def test_offset(self): self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4)) +class LinearRegressionTest(SparkSessionTestCase): + + def test_linear_regression_with_huber_loss(self): + + data_path = "data/mllib/sample_linear_regression_data.txt" + df = self.spark.read.format("libsvm").load(data_path) + + lir = LinearRegression(loss="huber", epsilon=2.0) + model = lir.fit(df) + + expectedCoefficients = [0.136, 0.7648, -0.7761, 2.4236, 0.537, + 1.2612, -0.333, -0.5694, -0.6311, 0.6053] + expectedIntercept = 0.1607 + expectedScale = 9.758 + + self.assertTrue( + np.allclose(model.coefficients.toArray(), expectedCoefficients, atol=1E-3)) + self.assertTrue(np.isclose(model.intercept, expectedIntercept, atol=1E-3)) + self.assertTrue(np.isclose(model.scale, expectedScale, atol=1E-3)) + + class LogisticRegressionTest(SparkSessionTestCase): def test_binomial_logistic_regression_with_bound(self): From 9c289a5cb46e00cd60db4794357f070dfdf80691 Mon Sep 17 00:00:00 2001 From: Xingbo Jiang Date: Thu, 21 Dec 2017 10:02:30 +0800 Subject: [PATCH 161/356] [SPARK-22387][SQL] Propagate session configs to data source read/write options ## What changes were proposed in this pull request? Introduce a new interface `SessionConfigSupport` for `DataSourceV2`, it can help to propagate session configs with the specified key-prefix to all data source operations in this session. ## How was this patch tested? Add new test suite `DataSourceV2UtilsSuite`. Author: Xingbo Jiang Closes #19861 from jiangxb1987/datasource-configs. --- .../sql/sources/v2/SessionConfigSupport.java | 39 +++++++++++++ .../apache/spark/sql/DataFrameReader.scala | 11 +++- .../apache/spark/sql/DataFrameWriter.scala | 15 +++-- .../datasources/v2/DataSourceV2Utils.scala | 58 +++++++++++++++++++ .../sources/v2/DataSourceV2UtilsSuite.scala | 49 ++++++++++++++++ 5 files changed, 164 insertions(+), 8 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2UtilsSuite.scala diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java new file mode 100644 index 0000000000000..0b5b6ac675f2c --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2; + +import org.apache.spark.annotation.InterfaceStability; + +import java.util.List; +import java.util.Map; + +/** + * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to + * propagate session configs with the specified key-prefix to all data source operations in this + * session. + */ +@InterfaceStability.Evolving +public interface SessionConfigSupport { + + /** + * Key prefix of the session configs to propagate. Spark will extract all session configs that + * starts with `spark.datasource.$keyPrefix`, turn `spark.datasource.$keyPrefix.xxx -> yyy` + * into `xxx -> yyy`, and propagate them to all data source operations in this session. + */ + String keyPrefix(); +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 39fec8f983b65..c43ee91294a27 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -33,7 +33,8 @@ import org.apache.spark.sql.execution.datasources.csv._ import org.apache.spark.sql.execution.datasources.jdbc._ import org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options, ReadSupport, ReadSupportWithSchema} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils +import org.apache.spark.sql.sources.v2._ import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.unsafe.types.UTF8String @@ -184,9 +185,13 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { val cls = DataSource.lookupDataSource(source, sparkSession.sessionState.conf) if (classOf[DataSourceV2].isAssignableFrom(cls)) { - val options = new DataSourceV2Options(extraOptions.asJava) + val ds = cls.newInstance() + val options = new DataSourceV2Options((extraOptions ++ + DataSourceV2Utils.extractSessionConfigs( + ds = ds.asInstanceOf[DataSourceV2], + conf = sparkSession.sessionState.conf)).asJava) - val reader = (cls.newInstance(), userSpecifiedSchema) match { + val reader = (ds, userSpecifiedSchema) match { case (ds: ReadSupportWithSchema, Some(schema)) => ds.createReader(schema, options) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 59a01e61124f7..7ccda0ad36d13 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -30,9 +30,10 @@ import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.{CreateTable, DataSource, LogicalRelation} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils import org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2 import org.apache.spark.sql.sources.BaseRelation -import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options, WriteSupport} +import org.apache.spark.sql.sources.v2._ import org.apache.spark.sql.types.StructType /** @@ -236,14 
+237,18 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { val cls = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) if (classOf[DataSourceV2].isAssignableFrom(cls)) { - cls.newInstance() match { - case ds: WriteSupport => - val options = new DataSourceV2Options(extraOptions.asJava) + val ds = cls.newInstance() + ds match { + case ws: WriteSupport => + val options = new DataSourceV2Options((extraOptions ++ + DataSourceV2Utils.extractSessionConfigs( + ds = ds.asInstanceOf[DataSourceV2], + conf = df.sparkSession.sessionState.conf)).asJava) // Using a timestamp and a random UUID to distinguish different writing jobs. This is good // enough as there won't be tons of writing jobs created at the same second. val jobId = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US) .format(new Date()) + "-" + UUID.randomUUID() - val writer = ds.createWriter(jobId, df.logicalPlan.schema, mode, options) + val writer = ws.createWriter(jobId, df.logicalPlan.schema, mode, options) if (writer.isPresent) { runCommand(df.sparkSession, "save") { WriteToDataSourceV2(writer.get(), df.logicalPlan) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala new file mode 100644 index 0000000000000..5267f5f1580c3 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Utils.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.v2 + +import java.util.regex.Pattern + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.v2.{DataSourceV2, SessionConfigSupport} + +private[sql] object DataSourceV2Utils extends Logging { + + /** + * Helper method that extracts and transforms session configs into k/v pairs, the k/v pairs will + * be used to create data source options. + * Only extract when `ds` implements [[SessionConfigSupport]], in this case we may fetch the + * specified key-prefix from `ds`, and extract session configs with config keys that start with + * `spark.datasource.$keyPrefix`. A session config `spark.datasource.$keyPrefix.xxx -> yyy` will + * be transformed into `xxx -> yyy`. + * + * @param ds a [[DataSourceV2]] object + * @param conf the session conf + * @return an immutable map that contains all the extracted and transformed k/v pairs. 
+ */ + def extractSessionConfigs(ds: DataSourceV2, conf: SQLConf): Map[String, String] = ds match { + case cs: SessionConfigSupport => + val keyPrefix = cs.keyPrefix() + require(keyPrefix != null, "The data source config key prefix can't be null.") + + val pattern = Pattern.compile(s"^spark\\.datasource\\.$keyPrefix\\.(.+)") + + conf.getAllConfs.flatMap { case (key, value) => + val m = pattern.matcher(key) + if (m.matches() && m.groupCount() > 0) { + Seq((m.group(1), value)) + } else { + Seq.empty + } + } + + case _ => Map.empty + } +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2UtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2UtilsSuite.scala new file mode 100644 index 0000000000000..4911e3225552d --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/v2/DataSourceV2UtilsSuite.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.sources.v2 + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils +import org.apache.spark.sql.internal.SQLConf + +class DataSourceV2UtilsSuite extends SparkFunSuite { + + private val keyPrefix = new DataSourceV2WithSessionConfig().keyPrefix + + test("method withSessionConfig() should propagate session configs correctly") { + // Only match configs with keys start with "spark.datasource.${keyPrefix}". + val conf = new SQLConf + conf.setConfString(s"spark.datasource.$keyPrefix.foo.bar", "false") + conf.setConfString(s"spark.datasource.$keyPrefix.whateverConfigName", "123") + conf.setConfString(s"spark.sql.$keyPrefix.config.name", "false") + conf.setConfString("spark.datasource.another.config.name", "123") + conf.setConfString(s"spark.datasource.$keyPrefix.", "123") + val cs = classOf[DataSourceV2WithSessionConfig].newInstance() + val confs = DataSourceV2Utils.extractSessionConfigs(cs.asInstanceOf[DataSourceV2], conf) + assert(confs.size == 2) + assert(confs.keySet.filter(_.startsWith("spark.datasource")).size == 0) + assert(confs.keySet.filter(_.startsWith("not.exist.prefix")).size == 0) + assert(confs.keySet.contains("foo.bar")) + assert(confs.keySet.contains("whateverConfigName")) + } +} + +class DataSourceV2WithSessionConfig extends SimpleDataSourceV2 with SessionConfigSupport { + + override def keyPrefix: String = "userDefinedDataSource" +} From d3ae3e1e894f88a8500752d9633fe9ad00da5f20 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 20 Dec 2017 19:53:35 -0800 Subject: [PATCH 162/356] [SPARK-19634][SQL][ML][FOLLOW-UP] Improve interface of dataframe vectorized summarizer ## What changes were proposed in this pull request? Make several improvements in dataframe vectorized summarizer. 1. 
Make the summarizer return `Vector` type for all metrics (except "count"). It will return "WrappedArray" type before which won't be very convenient. 2. Make `MetricsAggregate` inherit `ImplicitCastInputTypes` trait. So it can check and implicitly cast input values. 3. Add "weight" parameter for all single metric method. 4. Update doc and improve the example code in doc. 5. Simplified test cases. ## How was this patch tested? Test added and simplified. Author: WeichenXu Closes #19156 from WeichenXu123/improve_vec_summarizer. --- .../org/apache/spark/ml/stat/Summarizer.scala | 128 ++++--- .../spark/ml/stat/JavaSummarizerSuite.java | 64 ++++ .../spark/ml/stat/SummarizerSuite.scala | 362 ++++++++++-------- 3 files changed, 341 insertions(+), 213 deletions(-) create mode 100644 mllib/src/test/java/org/apache/spark/ml/stat/JavaSummarizerSuite.java diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala index cae41edb7aca8..9bed74a9f2c05 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/Summarizer.scala @@ -24,7 +24,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Expression, UnsafeArrayData} +import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, UnsafeArrayData} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, TypedImperativeAggregate} import org.apache.spark.sql.functions.lit import org.apache.spark.sql.types._ @@ -41,7 +41,7 @@ sealed abstract class SummaryBuilder { /** * Returns an aggregate object that contains the summary of the column with the requested metrics. * @param featuresCol a column that contains features Vector object. - * @param weightCol a column that contains weight value. + * @param weightCol a column that contains weight value. Default weight is 1.0. * @return an aggregate column that contains the statistics. The exact content of this * structure is determined during the creation of the builder. */ @@ -50,6 +50,7 @@ sealed abstract class SummaryBuilder { @Since("2.3.0") def summary(featuresCol: Column): Column = summary(featuresCol, lit(1.0)) + } /** @@ -60,15 +61,18 @@ sealed abstract class SummaryBuilder { * This class lets users pick the statistics they would like to extract for a given column. Here is * an example in Scala: * {{{ - * val dataframe = ... // Some dataframe containing a feature column - * val allStats = dataframe.select(Summarizer.metrics("min", "max").summary($"features")) - * val Row(Row(min_, max_)) = allStats.first() + * import org.apache.spark.ml.linalg._ + * import org.apache.spark.sql.Row + * val dataframe = ... 
// Some dataframe containing a feature column and a weight column + * val multiStatsDF = dataframe.select( + * Summarizer.metrics("min", "max", "count").summary($"features", $"weight") + * val Row(Row(minVec, maxVec, count)) = multiStatsDF.first() * }}} * * If one wants to get a single metric, shortcuts are also available: * {{{ * val meanDF = dataframe.select(Summarizer.mean($"features")) - * val Row(mean_) = meanDF.first() + * val Row(meanVec) = meanDF.first() * }}} * * Note: Currently, the performance of this interface is about 2x~3x slower then using the RDD @@ -94,8 +98,7 @@ object Summarizer extends Logging { * - min: the minimum for each coefficient. * - normL2: the Euclidian norm for each coefficient. * - normL1: the L1 norm of each coefficient (sum of the absolute values). - * @param firstMetric the metric being provided - * @param metrics additional metrics that can be provided. + * @param metrics metrics that can be provided. * @return a builder. * @throws IllegalArgumentException if one of the metric names is not understood. * @@ -103,37 +106,79 @@ object Summarizer extends Logging { * interface. */ @Since("2.3.0") - def metrics(firstMetric: String, metrics: String*): SummaryBuilder = { - val (typedMetrics, computeMetrics) = getRelevantMetrics(Seq(firstMetric) ++ metrics) + @scala.annotation.varargs + def metrics(metrics: String*): SummaryBuilder = { + require(metrics.size >= 1, "Should include at least one metric") + val (typedMetrics, computeMetrics) = getRelevantMetrics(metrics) new SummaryBuilderImpl(typedMetrics, computeMetrics) } @Since("2.3.0") - def mean(col: Column): Column = getSingleMetric(col, "mean") + def mean(col: Column, weightCol: Column): Column = { + getSingleMetric(col, weightCol, "mean") + } + + @Since("2.3.0") + def mean(col: Column): Column = mean(col, lit(1.0)) + + @Since("2.3.0") + def variance(col: Column, weightCol: Column): Column = { + getSingleMetric(col, weightCol, "variance") + } + + @Since("2.3.0") + def variance(col: Column): Column = variance(col, lit(1.0)) + + @Since("2.3.0") + def count(col: Column, weightCol: Column): Column = { + getSingleMetric(col, weightCol, "count") + } + + @Since("2.3.0") + def count(col: Column): Column = count(col, lit(1.0)) @Since("2.3.0") - def variance(col: Column): Column = getSingleMetric(col, "variance") + def numNonZeros(col: Column, weightCol: Column): Column = { + getSingleMetric(col, weightCol, "numNonZeros") + } + + @Since("2.3.0") + def numNonZeros(col: Column): Column = numNonZeros(col, lit(1.0)) + + @Since("2.3.0") + def max(col: Column, weightCol: Column): Column = { + getSingleMetric(col, weightCol, "max") + } + + @Since("2.3.0") + def max(col: Column): Column = max(col, lit(1.0)) @Since("2.3.0") - def count(col: Column): Column = getSingleMetric(col, "count") + def min(col: Column, weightCol: Column): Column = { + getSingleMetric(col, weightCol, "min") + } @Since("2.3.0") - def numNonZeros(col: Column): Column = getSingleMetric(col, "numNonZeros") + def min(col: Column): Column = min(col, lit(1.0)) @Since("2.3.0") - def max(col: Column): Column = getSingleMetric(col, "max") + def normL1(col: Column, weightCol: Column): Column = { + getSingleMetric(col, weightCol, "normL1") + } @Since("2.3.0") - def min(col: Column): Column = getSingleMetric(col, "min") + def normL1(col: Column): Column = normL1(col, lit(1.0)) @Since("2.3.0") - def normL1(col: Column): Column = getSingleMetric(col, "normL1") + def normL2(col: Column, weightCol: Column): Column = { + getSingleMetric(col, weightCol, "normL2") + } 
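A minimal usage sketch of the weighted shortcuts added above, assuming a SparkSession named `spark` with its implicits imported; the dataset, column names, and values are illustrative only:
```
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.Summarizer._
import org.apache.spark.sql.Row
import spark.implicits._

// Illustrative data: (features, weight) pairs.
val df = Seq(
  (Vectors.dense(1.0, 2.0), 1.0),
  (Vectors.dense(3.0, 4.0), 2.0)
).toDF("features", "weight")

// Weighted mean via the new two-argument shortcut; the result is a Vector rather than a WrappedArray.
val Row(weightedMean: Vector) = df.select(mean($"features", $"weight")).first()

// Several metrics at once, returned as a single struct-typed column.
val Row(Row(minVec: Vector, maxVec: Vector, cnt: Long)) =
  df.select(metrics("min", "max", "count").summary($"features", $"weight")).first()
```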
@Since("2.3.0") - def normL2(col: Column): Column = getSingleMetric(col, "normL2") + def normL2(col: Column): Column = normL2(col, lit(1.0)) - private def getSingleMetric(col: Column, metric: String): Column = { - val c1 = metrics(metric).summary(col) + private def getSingleMetric(col: Column, weightCol: Column, metric: String): Column = { + val c1 = metrics(metric).summary(col, weightCol) c1.getField(metric).as(s"$metric($col)") } } @@ -187,8 +232,7 @@ private[ml] object SummaryBuilderImpl extends Logging { StructType(fields) } - private val arrayDType = ArrayType(DoubleType, containsNull = false) - private val arrayLType = ArrayType(LongType, containsNull = false) + private val vectorUDT = new VectorUDT /** * All the metrics that can be currently computed by Spark for vectors. @@ -197,14 +241,14 @@ private[ml] object SummaryBuilderImpl extends Logging { * metrics that need to de computed internally to get the final result. */ private val allMetrics: Seq[(String, Metric, DataType, Seq[ComputeMetric])] = Seq( - ("mean", Mean, arrayDType, Seq(ComputeMean, ComputeWeightSum)), - ("variance", Variance, arrayDType, Seq(ComputeWeightSum, ComputeMean, ComputeM2n)), + ("mean", Mean, vectorUDT, Seq(ComputeMean, ComputeWeightSum)), + ("variance", Variance, vectorUDT, Seq(ComputeWeightSum, ComputeMean, ComputeM2n)), ("count", Count, LongType, Seq()), - ("numNonZeros", NumNonZeros, arrayLType, Seq(ComputeNNZ)), - ("max", Max, arrayDType, Seq(ComputeMax, ComputeNNZ)), - ("min", Min, arrayDType, Seq(ComputeMin, ComputeNNZ)), - ("normL2", NormL2, arrayDType, Seq(ComputeM2)), - ("normL1", NormL1, arrayDType, Seq(ComputeL1)) + ("numNonZeros", NumNonZeros, vectorUDT, Seq(ComputeNNZ)), + ("max", Max, vectorUDT, Seq(ComputeMax, ComputeNNZ)), + ("min", Min, vectorUDT, Seq(ComputeMin, ComputeNNZ)), + ("normL2", NormL2, vectorUDT, Seq(ComputeM2)), + ("normL1", NormL1, vectorUDT, Seq(ComputeL1)) ) /** @@ -527,27 +571,28 @@ private[ml] object SummaryBuilderImpl extends Logging { weightExpr: Expression, mutableAggBufferOffset: Int, inputAggBufferOffset: Int) - extends TypedImperativeAggregate[SummarizerBuffer] { + extends TypedImperativeAggregate[SummarizerBuffer] with ImplicitCastInputTypes { - override def eval(state: SummarizerBuffer): InternalRow = { + override def eval(state: SummarizerBuffer): Any = { val metrics = requestedMetrics.map { - case Mean => UnsafeArrayData.fromPrimitiveArray(state.mean.toArray) - case Variance => UnsafeArrayData.fromPrimitiveArray(state.variance.toArray) + case Mean => vectorUDT.serialize(state.mean) + case Variance => vectorUDT.serialize(state.variance) case Count => state.count - case NumNonZeros => UnsafeArrayData.fromPrimitiveArray( - state.numNonzeros.toArray.map(_.toLong)) - case Max => UnsafeArrayData.fromPrimitiveArray(state.max.toArray) - case Min => UnsafeArrayData.fromPrimitiveArray(state.min.toArray) - case NormL2 => UnsafeArrayData.fromPrimitiveArray(state.normL2.toArray) - case NormL1 => UnsafeArrayData.fromPrimitiveArray(state.normL1.toArray) + case NumNonZeros => vectorUDT.serialize(state.numNonzeros) + case Max => vectorUDT.serialize(state.max) + case Min => vectorUDT.serialize(state.min) + case NormL2 => vectorUDT.serialize(state.normL2) + case NormL1 => vectorUDT.serialize(state.normL1) } InternalRow.apply(metrics: _*) } + override def inputTypes: Seq[DataType] = vectorUDT :: DoubleType :: Nil + override def children: Seq[Expression] = featuresExpr :: weightExpr :: Nil override def update(state: SummarizerBuffer, row: InternalRow): SummarizerBuffer = { - val 
features = udt.deserialize(featuresExpr.eval(row)) + val features = vectorUDT.deserialize(featuresExpr.eval(row)) val weight = weightExpr.eval(row).asInstanceOf[Double] state.add(features, weight) state @@ -591,7 +636,4 @@ private[ml] object SummaryBuilderImpl extends Logging { override def prettyName: String = "aggregate_metrics" } - - private[this] val udt = new VectorUDT - } diff --git a/mllib/src/test/java/org/apache/spark/ml/stat/JavaSummarizerSuite.java b/mllib/src/test/java/org/apache/spark/ml/stat/JavaSummarizerSuite.java new file mode 100644 index 0000000000000..38ab39aa0f492 --- /dev/null +++ b/mllib/src/test/java/org/apache/spark/ml/stat/JavaSummarizerSuite.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.stat; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertArrayEquals; + +import org.apache.spark.SharedSparkSession; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.Dataset; +import static org.apache.spark.sql.functions.col; +import org.apache.spark.ml.feature.LabeledPoint; +import org.apache.spark.ml.linalg.Vector; +import org.apache.spark.ml.linalg.Vectors; + +public class JavaSummarizerSuite extends SharedSparkSession { + + private transient Dataset dataset; + + @Override + public void setUp() throws IOException { + super.setUp(); + List points = new ArrayList(); + points.add(new LabeledPoint(0.0, Vectors.dense(1.0, 2.0))); + points.add(new LabeledPoint(0.0, Vectors.dense(3.0, 4.0))); + + dataset = spark.createDataFrame(jsc.parallelize(points, 2), LabeledPoint.class); + } + + @Test + public void testSummarizer() { + dataset.select(col("features")); + Row result = dataset + .select(Summarizer.metrics("mean", "max", "count").summary(col("features"))) + .first().getStruct(0); + Vector meanVec = result.getAs("mean"); + Vector maxVec = result.getAs("max"); + long count = result.getAs("count"); + + assertEquals(2L, count); + assertArrayEquals(new double[]{2.0, 3.0}, meanVec.toArray(), 0.0); + assertArrayEquals(new double[]{3.0, 4.0}, maxVec.toArray(), 0.0); + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala index 1ea851ef2d676..5e4f402989697 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/SummarizerSuite.scala @@ -17,16 +17,13 @@ package org.apache.spark.ml.stat -import org.scalatest.exceptions.TestFailedException - import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{Vector, Vectors} import 
org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, Statistics} import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.Row class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { @@ -35,237 +32,262 @@ class SummarizerSuite extends SparkFunSuite with MLlibTestSparkContext { import SummaryBuilderImpl._ private case class ExpectedMetrics( - mean: Seq[Double], - variance: Seq[Double], + mean: Vector, + variance: Vector, count: Long, - numNonZeros: Seq[Long], - max: Seq[Double], - min: Seq[Double], - normL2: Seq[Double], - normL1: Seq[Double]) + numNonZeros: Vector, + max: Vector, + min: Vector, + normL2: Vector, + normL1: Vector) /** - * The input is expected to be either a sparse vector, a dense vector or an array of doubles - * (which will be converted to a dense vector) - * The expected is the list of all the known metrics. + * The input is expected to be either a sparse vector, a dense vector. * - * The tests take an list of input vectors and a list of all the summary values that - * are expected for this input. They currently test against some fixed subset of the - * metrics, but should be made fuzzy in the future. + * The tests take an list of input vectors, and compare results with + * `mllib.stat.MultivariateOnlineSummarizer`. They currently test against some fixed subset + * of the metrics, but should be made fuzzy in the future. */ - private def testExample(name: String, input: Seq[Any], exp: ExpectedMetrics): Unit = { + private def testExample(name: String, inputVec: Seq[(Vector, Double)], + exp: ExpectedMetrics, expWithoutWeight: ExpectedMetrics): Unit = { - def inputVec: Seq[Vector] = input.map { - case x: Array[Double @unchecked] => Vectors.dense(x) - case x: Seq[Double @unchecked] => Vectors.dense(x.toArray) - case x: Vector => x - case x => throw new Exception(x.toString) + val summarizer = { + val _summarizer = new MultivariateOnlineSummarizer + inputVec.foreach(v => _summarizer.add(OldVectors.fromML(v._1), v._2)) + _summarizer } - val summarizer = { + val summarizerWithoutWeight = { val _summarizer = new MultivariateOnlineSummarizer - inputVec.foreach(v => _summarizer.add(OldVectors.fromML(v))) + inputVec.foreach(v => _summarizer.add(OldVectors.fromML(v._1))) _summarizer } // Because the Spark context is reset between tests, we cannot hold a reference onto it. 
def wrappedInit() = { - val df = inputVec.map(Tuple1.apply).toDF("features") - val col = df.col("features") - (df, col) + val df = inputVec.toDF("features", "weight") + val featuresCol = df.col("features") + val weightCol = df.col("weight") + (df, featuresCol, weightCol) } registerTest(s"$name - mean only") { - val (df, c) = wrappedInit() - compare(df.select(metrics("mean").summary(c), mean(c)), Seq(Row(exp.mean), summarizer.mean)) + val (df, c, w) = wrappedInit() + compareRow(df.select(metrics("mean").summary(c, w), mean(c, w)).first(), + Row(Row(summarizer.mean), exp.mean)) } - registerTest(s"$name - mean only (direct)") { - val (df, c) = wrappedInit() - compare(df.select(mean(c)), Seq(exp.mean)) + registerTest(s"$name - mean only w/o weight") { + val (df, c, _) = wrappedInit() + compareRow(df.select(metrics("mean").summary(c), mean(c)).first(), + Row(Row(summarizerWithoutWeight.mean), expWithoutWeight.mean)) } registerTest(s"$name - variance only") { - val (df, c) = wrappedInit() - compare(df.select(metrics("variance").summary(c), variance(c)), - Seq(Row(exp.variance), summarizer.variance)) + val (df, c, w) = wrappedInit() + compareRow(df.select(metrics("variance").summary(c, w), variance(c, w)).first(), + Row(Row(summarizer.variance), exp.variance)) } - registerTest(s"$name - variance only (direct)") { - val (df, c) = wrappedInit() - compare(df.select(variance(c)), Seq(summarizer.variance)) + registerTest(s"$name - variance only w/o weight") { + val (df, c, _) = wrappedInit() + compareRow(df.select(metrics("variance").summary(c), variance(c)).first(), + Row(Row(summarizerWithoutWeight.variance), expWithoutWeight.variance)) } registerTest(s"$name - count only") { - val (df, c) = wrappedInit() - compare(df.select(metrics("count").summary(c), count(c)), - Seq(Row(exp.count), exp.count)) + val (df, c, w) = wrappedInit() + compareRow(df.select(metrics("count").summary(c, w), count(c, w)).first(), + Row(Row(summarizer.count), exp.count)) } - registerTest(s"$name - count only (direct)") { - val (df, c) = wrappedInit() - compare(df.select(count(c)), - Seq(exp.count)) + registerTest(s"$name - count only w/o weight") { + val (df, c, _) = wrappedInit() + compareRow(df.select(metrics("count").summary(c), count(c)).first(), + Row(Row(summarizerWithoutWeight.count), expWithoutWeight.count)) } registerTest(s"$name - numNonZeros only") { - val (df, c) = wrappedInit() - compare(df.select(metrics("numNonZeros").summary(c), numNonZeros(c)), - Seq(Row(exp.numNonZeros), exp.numNonZeros)) + val (df, c, w) = wrappedInit() + compareRow(df.select(metrics("numNonZeros").summary(c, w), numNonZeros(c, w)).first(), + Row(Row(summarizer.numNonzeros), exp.numNonZeros)) } - registerTest(s"$name - numNonZeros only (direct)") { - val (df, c) = wrappedInit() - compare(df.select(numNonZeros(c)), - Seq(exp.numNonZeros)) + registerTest(s"$name - numNonZeros only w/o weight") { + val (df, c, _) = wrappedInit() + compareRow(df.select(metrics("numNonZeros").summary(c), numNonZeros(c)).first(), + Row(Row(summarizerWithoutWeight.numNonzeros), expWithoutWeight.numNonZeros)) } registerTest(s"$name - min only") { - val (df, c) = wrappedInit() - compare(df.select(metrics("min").summary(c), min(c)), - Seq(Row(exp.min), exp.min)) + val (df, c, w) = wrappedInit() + compareRow(df.select(metrics("min").summary(c, w), min(c, w)).first(), + Row(Row(summarizer.min), exp.min)) + } + + registerTest(s"$name - min only w/o weight") { + val (df, c, _) = wrappedInit() + compareRow(df.select(metrics("min").summary(c), min(c)).first(), + 
Row(Row(summarizerWithoutWeight.min), expWithoutWeight.min)) } registerTest(s"$name - max only") { - val (df, c) = wrappedInit() - compare(df.select(metrics("max").summary(c), max(c)), - Seq(Row(exp.max), exp.max)) + val (df, c, w) = wrappedInit() + compareRow(df.select(metrics("max").summary(c, w), max(c, w)).first(), + Row(Row(summarizer.max), exp.max)) } - registerTest(s"$name - normL1 only") { - val (df, c) = wrappedInit() - compare(df.select(metrics("normL1").summary(c), normL1(c)), - Seq(Row(exp.normL1), exp.normL1)) + registerTest(s"$name - max only w/o weight") { + val (df, c, _) = wrappedInit() + compareRow(df.select(metrics("max").summary(c), max(c)).first(), + Row(Row(summarizerWithoutWeight.max), expWithoutWeight.max)) } - registerTest(s"$name - normL2 only") { - val (df, c) = wrappedInit() - compare(df.select(metrics("normL2").summary(c), normL2(c)), - Seq(Row(exp.normL2), exp.normL2)) + registerTest(s"$name - normL1 only") { + val (df, c, w) = wrappedInit() + compareRow(df.select(metrics("normL1").summary(c, w), normL1(c, w)).first(), + Row(Row(summarizer.normL1), exp.normL1)) } - registerTest(s"$name - all metrics at once") { - val (df, c) = wrappedInit() - compare(df.select( - metrics("mean", "variance", "count", "numNonZeros").summary(c), - mean(c), variance(c), count(c), numNonZeros(c)), - Seq(Row(exp.mean, exp.variance, exp.count, exp.numNonZeros), - exp.mean, exp.variance, exp.count, exp.numNonZeros)) + registerTest(s"$name - normL1 only w/o weight") { + val (df, c, _) = wrappedInit() + compareRow(df.select(metrics("normL1").summary(c), normL1(c)).first(), + Row(Row(summarizerWithoutWeight.normL1), expWithoutWeight.normL1)) } - } - private def denseData(input: Seq[Seq[Double]]): DataFrame = { - input.map(_.toArray).map(Vectors.dense).map(Tuple1.apply).toDF("features") - } + registerTest(s"$name - normL2 only") { + val (df, c, w) = wrappedInit() + compareRow(df.select(metrics("normL2").summary(c, w), normL2(c, w)).first(), + Row(Row(summarizer.normL2), exp.normL2)) + } - private def compare(df: DataFrame, exp: Seq[Any]): Unit = { - val coll = df.collect().toSeq - val Seq(row) = coll - val res = row.toSeq - val names = df.schema.fieldNames.zipWithIndex.map { case (n, idx) => s"$n ($idx)" } - assert(res.size === exp.size, (res.size, exp.size)) - for (((x1, x2), name) <- res.zip(exp).zip(names)) { - compareStructures(x1, x2, name) + registerTest(s"$name - normL2 only w/o weight") { + val (df, c, _) = wrappedInit() + compareRow(df.select(metrics("normL2").summary(c), normL2(c)).first(), + Row(Row(summarizerWithoutWeight.normL2), expWithoutWeight.normL2)) } - } - // Compares structured content. 
- private def compareStructures(x1: Any, x2: Any, name: String): Unit = (x1, x2) match { - case (y1: Seq[Double @unchecked], v1: OldVector) => - compareStructures(y1, v1.toArray.toSeq, name) - case (d1: Double, d2: Double) => - assert2(Vectors.dense(d1) ~== Vectors.dense(d2) absTol 1e-4, name) - case (r1: GenericRowWithSchema, r2: Row) => - assert(r1.size === r2.size, (r1, r2)) - for (((fname, x1), x2) <- r1.schema.fieldNames.zip(r1.toSeq).zip(r2.toSeq)) { - compareStructures(x1, x2, s"$name.$fname") - } - case (r1: Row, r2: Row) => - assert(r1.size === r2.size, (r1, r2)) - for ((x1, x2) <- r1.toSeq.zip(r2.toSeq)) { compareStructures(x1, x2, name) } - case (v1: Vector, v2: Vector) => - assert2(v1 ~== v2 absTol 1e-4, name) - case (l1: Long, l2: Long) => assert(l1 === l2) - case (s1: Seq[_], s2: Seq[_]) => - assert(s1.size === s2.size, s"$name ${(s1, s2)}") - for (((x1, idx), x2) <- s1.zipWithIndex.zip(s2)) { - compareStructures(x1, x2, s"$name.$idx") - } - case (arr1: Array[_], arr2: Array[_]) => - assert(arr1.toSeq === arr2.toSeq) - case _ => throw new Exception(s"$name: ${x1.getClass} ${x2.getClass} $x1 $x2") - } + registerTest(s"$name - multiple metrics at once") { + val (df, c, w) = wrappedInit() + compareRow(df.select( + metrics("mean", "variance", "count", "numNonZeros").summary(c, w)).first(), + Row(Row(exp.mean, exp.variance, exp.count, exp.numNonZeros)) + ) + } - private def assert2(x: => Boolean, hint: String): Unit = { - try { - assert(x, hint) - } catch { - case tfe: TestFailedException => - throw new TestFailedException(Some(s"Failure with hint $hint"), Some(tfe), 1) + registerTest(s"$name - multiple metrics at once w/o weight") { + val (df, c, _) = wrappedInit() + compareRow(df.select( + metrics("mean", "variance", "count", "numNonZeros").summary(c)).first(), + Row(Row(expWithoutWeight.mean, expWithoutWeight.variance, + expWithoutWeight.count, expWithoutWeight.numNonZeros)) + ) } } - test("debugging test") { - val df = denseData(Nil) - val c = df.col("features") - val c1 = metrics("mean").summary(c) - val res = df.select(c1) - intercept[SparkException] { - compare(res, Seq.empty) + private def compareRow(r1: Row, r2: Row): Unit = { + assert(r1.size === r2.size, (r1, r2)) + r1.toSeq.zip(r2.toSeq).foreach { + case (v1: Vector, v2: Vector) => + assert(v1 ~== v2 absTol 1e-4) + case (v1: Vector, v2: OldVector) => + assert(v1 ~== v2.asML absTol 1e-4) + case (l1: Long, l2: Long) => + assert(l1 === l2) + case (r1: Row, r2: Row) => + compareRow(r1, r2) + case (x1: Any, x2: Any) => + throw new Exception(s"type mismatch: ${x1.getClass} ${x2.getClass} $x1 $x2") } } - test("basic error handling") { - val df = denseData(Nil) + test("no element") { + val df = Seq[Tuple1[Vector]]().toDF("features") val c = df.col("features") - val res = df.select(metrics("mean").summary(c), mean(c)) intercept[SparkException] { - compare(res, Seq.empty) + df.select(metrics("mean").summary(c), mean(c)).first() } + compareRow(df.select(metrics("count").summary(c), count(c)).first(), + Row(Row(0L), 0L)) } - test("no element, working metrics") { - val df = denseData(Nil) - val c = df.col("features") - val res = df.select(metrics("count").summary(c), count(c)) - compare(res, Seq(Row(0L), 0L)) - } + val singleElem = Vectors.dense(0.0, 1.0, 2.0) + testExample("single element", Seq((singleElem, 2.0)), + ExpectedMetrics( + mean = singleElem, + variance = Vectors.dense(0.0, 0.0, 0.0), + count = 1L, + numNonZeros = Vectors.dense(0.0, 1.0, 1.0), + max = singleElem, + min = singleElem, + normL1 = Vectors.dense(0.0, 2.0, 
4.0), + normL2 = Vectors.dense(0.0, 1.414213, 2.828427) + ), + ExpectedMetrics( + mean = singleElem, + variance = Vectors.dense(0.0, 0.0, 0.0), + count = 1L, + numNonZeros = Vectors.dense(0.0, 1.0, 1.0), + max = singleElem, + min = singleElem, + normL1 = singleElem, + normL2 = singleElem + ) + ) + + testExample("multiple elements (dense)", + Seq( + (Vectors.dense(-1.0, 0.0, 6.0), 0.5), + (Vectors.dense(3.0, -3.0, 0.0), 2.8), + (Vectors.dense(1.0, -3.0, 0.0), 0.0) + ), + ExpectedMetrics( + mean = Vectors.dense(2.393939, -2.545454, 0.909090), + variance = Vectors.dense(8.0, 4.5, 18.0), + count = 2L, + numNonZeros = Vectors.dense(2.0, 1.0, 1.0), + max = Vectors.dense(3.0, 0.0, 6.0), + min = Vectors.dense(-1.0, -3.0, 0.0), + normL1 = Vectors.dense(8.9, 8.4, 3.0), + normL2 = Vectors.dense(5.069516, 5.019960, 4.242640) + ), + ExpectedMetrics( + mean = Vectors.dense(1.0, -2.0, 2.0), + variance = Vectors.dense(4.0, 3.0, 12.0), + count = 3L, + numNonZeros = Vectors.dense(3.0, 2.0, 1.0), + max = Vectors.dense(3.0, 0.0, 6.0), + min = Vectors.dense(-1.0, -3.0, 0.0), + normL1 = Vectors.dense(5.0, 6.0, 6.0), + normL2 = Vectors.dense(3.316624, 4.242640, 6.0) + ) + ) - val singleElem = Seq(0.0, 1.0, 2.0) - testExample("single element", Seq(singleElem), ExpectedMetrics( - mean = singleElem, - variance = Seq(0.0, 0.0, 0.0), - count = 1, - numNonZeros = Seq(0, 1, 1), - max = singleElem, - min = singleElem, - normL1 = singleElem, - normL2 = singleElem - )) - - testExample("two elements", Seq(Seq(0.0, 1.0, 2.0), Seq(0.0, -1.0, -2.0)), ExpectedMetrics( - mean = Seq(0.0, 0.0, 0.0), - // TODO: I have a doubt about these values, they are not normalized. - variance = Seq(0.0, 2.0, 8.0), - count = 2, - numNonZeros = Seq(0, 2, 2), - max = Seq(0.0, 1.0, 2.0), - min = Seq(0.0, -1.0, -2.0), - normL1 = Seq(0.0, 2.0, 4.0), - normL2 = Seq(0.0, math.sqrt(2.0), math.sqrt(2.0) * 2.0) - )) - - testExample("dense vector input", - Seq(Seq(-1.0, 0.0, 6.0), Seq(3.0, -3.0, 0.0)), + testExample("multiple elements (sparse)", + Seq( + (Vectors.dense(-1.0, 0.0, 6.0).toSparse, 0.5), + (Vectors.dense(3.0, -3.0, 0.0).toSparse, 2.8), + (Vectors.dense(1.0, -3.0, 0.0).toSparse, 0.0) + ), + ExpectedMetrics( + mean = Vectors.dense(2.393939, -2.545454, 0.909090), + variance = Vectors.dense(8.0, 4.5, 18.0), + count = 2L, + numNonZeros = Vectors.dense(2.0, 1.0, 1.0), + max = Vectors.dense(3.0, 0.0, 6.0), + min = Vectors.dense(-1.0, -3.0, 0.0), + normL1 = Vectors.dense(8.9, 8.4, 3.0), + normL2 = Vectors.dense(5.069516, 5.019960, 4.242640) + ), ExpectedMetrics( - mean = Seq(1.0, -1.5, 3.0), - variance = Seq(8.0, 4.5, 18.0), - count = 2, - numNonZeros = Seq(2, 1, 1), - max = Seq(3.0, 0.0, 6.0), - min = Seq(-1.0, -3, 0.0), - normL1 = Seq(4.0, 3.0, 6.0), - normL2 = Seq(math.sqrt(10), 3, 6.0) + mean = Vectors.dense(1.0, -2.0, 2.0), + variance = Vectors.dense(4.0, 3.0, 12.0), + count = 3L, + numNonZeros = Vectors.dense(3.0, 2.0, 1.0), + max = Vectors.dense(3.0, 0.0, 6.0), + min = Vectors.dense(-1.0, -3.0, 0.0), + normL1 = Vectors.dense(5.0, 6.0, 6.0), + normL2 = Vectors.dense(3.316624, 4.242640, 6.0) ) ) From cb9fc8d9b6d385c9d193801edaea2d52e29a90fb Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 21 Dec 2017 14:54:38 +0800 Subject: [PATCH 163/356] [SPARK-22848][SQL] Eliminate mutable state from Stack ## What changes were proposed in this pull request? This PR eliminates mutable states from the generated code for `Stack`. ## How was this patch tested? Existing test suites Author: Kazuaki Ishizaki Closes #20035 from kiszk/SPARK-22848. 
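For context on the `stack` generator touched by the change below, a quick behavioral sketch; the patch only changes the generated code, not the result. This assumes a SparkSession named `spark`:
```
// stack(n, v1, ..., vk) lays the k values out across n rows, padding the last row with nulls.
spark.sql("SELECT stack(2, 1, 2, 3)").show()
// +----+----+
// |col0|col1|
// +----+----+
// |   1|   2|
// |   3|null|
// +----+----+
```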
--- .../spark/sql/catalyst/expressions/generators.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala index 1cd73a92a8635..69af7a250a5ac 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala @@ -214,11 +214,11 @@ case class Stack(children: Seq[Expression]) extends Generator { // Create the collection. val wrapperClass = classOf[mutable.WrappedArray[_]].getName - ctx.addMutableState( - s"$wrapperClass", - ev.value, - v => s"$v = $wrapperClass$$.MODULE$$.make($rowData);", useFreshName = false) - ev.copy(code = code, isNull = "false") + ev.copy(code = + s""" + |$code + |$wrapperClass ${ev.value} = $wrapperClass$$.MODULE$$.make($rowData); + """.stripMargin, isNull = "false") } } From 59d52631eb86394f1d981419cb744c20bd4e0b87 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 21 Dec 2017 20:43:56 +0900 Subject: [PATCH 164/356] [SPARK-22324][SQL][PYTHON] Upgrade Arrow to 0.8.0 ## What changes were proposed in this pull request? Upgrade Spark to Arrow 0.8.0 for Java and Python. Also includes an upgrade of Netty to 4.1.17 to resolve dependency requirements. The highlights that pertain to Spark for the update from Arrow version 0.4.1 to 0.8.0 include: * Java refactoring for a simpler API * Java reduced heap usage and streamlined hot code paths * Type support for DecimalType, ArrayType * Improved type casting support in Python * Simplified type checking in Python ## How was this patch tested? Existing tests Author: Bryan Cutler Author: Shixiong Zhu Closes #19884 from BryanCutler/arrow-upgrade-080-SPARK-22324.
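As a rough sketch of the Java API simplification: Arrow 0.8 drops the `Nullable*` vector classes, so `ArrowColumnVector` now matches on the plain vector types, as the updated accessors below show. The snippet mirrors the updated test suites; the allocator limit, vector name, and values are arbitrary placeholders:
```
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.IntVector
import org.apache.spark.sql.execution.vectorized.ArrowColumnVector

val allocator = new RootAllocator(Long.MaxValue)
// Arrow 0.8: IntVector replaces NullableIntVector.
val vector = new IntVector("i", allocator)
vector.allocateNew(3)
vector.setSafe(0, 1)
vector.setNull(1)
vector.setSafe(2, 3)
vector.setValueCount(3)

// Wrap the Arrow vector so Spark can read it through the columnar API.
val column = new ArrowColumnVector(vector)
assert(column.getInt(0) == 1)
assert(column.isNullAt(1))

vector.close()
allocator.close()
```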
--- .../spark/network/crypto/TransportCipher.java | 41 +- .../network/protocol/MessageWithHeader.java | 39 +- .../spark/network/sasl/SaslEncryption.java | 41 +- .../network/util/AbstractFileRegion.java | 53 ++ .../apache/spark/network/ProtocolSuite.java | 4 +- .../protocol/MessageWithHeaderSuite.java | 7 +- .../org/apache/spark/storage/DiskStore.scala | 9 +- dev/deps/spark-deps-hadoop-2.6 | 10 +- dev/deps/spark-deps-hadoop-2.7 | 10 +- pom.xml | 12 +- python/pyspark/serializers.py | 27 +- python/pyspark/sql/dataframe.py | 2 + python/pyspark/sql/functions.py | 13 +- python/pyspark/sql/group.py | 2 +- python/pyspark/sql/session.py | 3 + python/pyspark/sql/tests.py | 12 +- python/pyspark/sql/types.py | 25 +- python/pyspark/sql/udf.py | 16 +- python/pyspark/sql/utils.py | 9 + .../vectorized/ArrowColumnVector.java | 136 ++--- .../sql/execution/arrow/ArrowConverters.scala | 13 +- .../sql/execution/arrow/ArrowWriter.scala | 132 ++-- .../execution/python/ArrowPythonRunner.scala | 27 +- .../arrow/ArrowConvertersSuite.scala | 571 ++---------------- .../vectorized/ArrowColumnVectorSuite.scala | 150 +++-- .../vectorized/ColumnarBatchSuite.scala | 20 +- 26 files changed, 515 insertions(+), 869 deletions(-) create mode 100644 common/network-common/src/main/java/org/apache/spark/network/util/AbstractFileRegion.java diff --git a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java index 7376d1ddc4818..e04524dde0a75 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java +++ b/common/network-common/src/main/java/org/apache/spark/network/crypto/TransportCipher.java @@ -30,10 +30,10 @@ import io.netty.buffer.ByteBuf; import io.netty.buffer.Unpooled; import io.netty.channel.*; -import io.netty.util.AbstractReferenceCounted; import org.apache.commons.crypto.stream.CryptoInputStream; import org.apache.commons.crypto.stream.CryptoOutputStream; +import org.apache.spark.network.util.AbstractFileRegion; import org.apache.spark.network.util.ByteArrayReadableChannel; import org.apache.spark.network.util.ByteArrayWritableChannel; @@ -161,7 +161,7 @@ public void channelInactive(ChannelHandlerContext ctx) throws Exception { } } - private static class EncryptedMessage extends AbstractReferenceCounted implements FileRegion { + private static class EncryptedMessage extends AbstractFileRegion { private final boolean isByteBuf; private final ByteBuf buf; private final FileRegion region; @@ -199,10 +199,45 @@ public long position() { } @Override - public long transfered() { + public long transferred() { return transferred; } + @Override + public EncryptedMessage touch(Object o) { + super.touch(o); + if (region != null) { + region.touch(o); + } + if (buf != null) { + buf.touch(o); + } + return this; + } + + @Override + public EncryptedMessage retain(int increment) { + super.retain(increment); + if (region != null) { + region.retain(increment); + } + if (buf != null) { + buf.retain(increment); + } + return this; + } + + @Override + public boolean release(int decrement) { + if (region != null) { + region.release(decrement); + } + if (buf != null) { + buf.release(decrement); + } + return super.release(decrement); + } + @Override public long transferTo(WritableByteChannel target, long position) throws IOException { Preconditions.checkArgument(position == transfered(), "Invalid position."); diff --git 
a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java index 4f8781b42a0e4..897d0f9e4fb89 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java @@ -25,17 +25,17 @@ import com.google.common.base.Preconditions; import io.netty.buffer.ByteBuf; import io.netty.channel.FileRegion; -import io.netty.util.AbstractReferenceCounted; import io.netty.util.ReferenceCountUtil; import org.apache.spark.network.buffer.ManagedBuffer; +import org.apache.spark.network.util.AbstractFileRegion; /** * A wrapper message that holds two separate pieces (a header and a body). * * The header must be a ByteBuf, while the body can be a ByteBuf or a FileRegion. */ -class MessageWithHeader extends AbstractReferenceCounted implements FileRegion { +class MessageWithHeader extends AbstractFileRegion { @Nullable private final ManagedBuffer managedBuffer; private final ByteBuf header; @@ -91,7 +91,7 @@ public long position() { } @Override - public long transfered() { + public long transferred() { return totalBytesTransferred; } @@ -160,4 +160,37 @@ private int writeNioBuffer( return ret; } + + @Override + public MessageWithHeader touch(Object o) { + super.touch(o); + header.touch(o); + ReferenceCountUtil.touch(body, o); + return this; + } + + @Override + public MessageWithHeader retain(int increment) { + super.retain(increment); + header.retain(increment); + ReferenceCountUtil.retain(body, increment); + if (managedBuffer != null) { + for (int i = 0; i < increment; i++) { + managedBuffer.retain(); + } + } + return this; + } + + @Override + public boolean release(int decrement) { + header.release(decrement); + ReferenceCountUtil.release(body, decrement); + if (managedBuffer != null) { + for (int i = 0; i < decrement; i++) { + managedBuffer.release(); + } + } + return super.release(decrement); + } } diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslEncryption.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslEncryption.java index 3d71ebaa7ea0c..16ab4efcd4f5f 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslEncryption.java +++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslEncryption.java @@ -32,8 +32,8 @@ import io.netty.channel.ChannelPromise; import io.netty.channel.FileRegion; import io.netty.handler.codec.MessageToMessageDecoder; -import io.netty.util.AbstractReferenceCounted; +import org.apache.spark.network.util.AbstractFileRegion; import org.apache.spark.network.util.ByteArrayWritableChannel; import org.apache.spark.network.util.NettyUtils; @@ -129,7 +129,7 @@ protected void decode(ChannelHandlerContext ctx, ByteBuf msg, List out) } @VisibleForTesting - static class EncryptedMessage extends AbstractReferenceCounted implements FileRegion { + static class EncryptedMessage extends AbstractFileRegion { private final SaslEncryptionBackend backend; private final boolean isByteBuf; @@ -183,10 +183,45 @@ public long position() { * Returns an approximation of the amount of data transferred. See {@link #count()}. 
*/ @Override - public long transfered() { + public long transferred() { return transferred; } + @Override + public EncryptedMessage touch(Object o) { + super.touch(o); + if (buf != null) { + buf.touch(o); + } + if (region != null) { + region.touch(o); + } + return this; + } + + @Override + public EncryptedMessage retain(int increment) { + super.retain(increment); + if (buf != null) { + buf.retain(increment); + } + if (region != null) { + region.retain(increment); + } + return this; + } + + @Override + public boolean release(int decrement) { + if (region != null) { + region.release(decrement); + } + if (buf != null) { + buf.release(decrement); + } + return super.release(decrement); + } + /** * Transfers data from the original message to the channel, encrypting it in the process. * diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/AbstractFileRegion.java b/common/network-common/src/main/java/org/apache/spark/network/util/AbstractFileRegion.java new file mode 100644 index 0000000000000..8651297d97ec2 --- /dev/null +++ b/common/network-common/src/main/java/org/apache/spark/network/util/AbstractFileRegion.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.network.util; + +import io.netty.channel.FileRegion; +import io.netty.util.AbstractReferenceCounted; + +public abstract class AbstractFileRegion extends AbstractReferenceCounted implements FileRegion { + + @Override + @SuppressWarnings("deprecation") + public final long transfered() { + return transferred(); + } + + @Override + public AbstractFileRegion retain() { + super.retain(); + return this; + } + + @Override + public AbstractFileRegion retain(int increment) { + super.retain(increment); + return this; + } + + @Override + public AbstractFileRegion touch() { + super.touch(); + return this; + } + + @Override + public AbstractFileRegion touch(Object o) { + return this; + } +} diff --git a/common/network-common/src/test/java/org/apache/spark/network/ProtocolSuite.java b/common/network-common/src/test/java/org/apache/spark/network/ProtocolSuite.java index bb1c40c4b0e06..bc94f7ca63a96 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/ProtocolSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/ProtocolSuite.java @@ -56,7 +56,7 @@ private void testServerToClient(Message msg) { NettyUtils.createFrameDecoder(), MessageDecoder.INSTANCE); while (!serverChannel.outboundMessages().isEmpty()) { - clientChannel.writeInbound(serverChannel.readOutbound()); + clientChannel.writeOneInbound(serverChannel.readOutbound()); } assertEquals(1, clientChannel.inboundMessages().size()); @@ -72,7 +72,7 @@ private void testClientToServer(Message msg) { NettyUtils.createFrameDecoder(), MessageDecoder.INSTANCE); while (!clientChannel.outboundMessages().isEmpty()) { - serverChannel.writeInbound(clientChannel.readOutbound()); + serverChannel.writeOneInbound(clientChannel.readOutbound()); } assertEquals(1, serverChannel.inboundMessages().size()); diff --git a/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java b/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java index b341c5681e00c..ecb66fcf2ff76 100644 --- a/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java +++ b/common/network-common/src/test/java/org/apache/spark/network/protocol/MessageWithHeaderSuite.java @@ -23,8 +23,7 @@ import io.netty.buffer.ByteBuf; import io.netty.buffer.Unpooled; -import io.netty.channel.FileRegion; -import io.netty.util.AbstractReferenceCounted; +import org.apache.spark.network.util.AbstractFileRegion; import org.junit.Test; import org.mockito.Mockito; @@ -108,7 +107,7 @@ private ByteBuf doWrite(MessageWithHeader msg, int minExpectedWrites) throws Exc return Unpooled.wrappedBuffer(channel.getData()); } - private static class TestFileRegion extends AbstractReferenceCounted implements FileRegion { + private static class TestFileRegion extends AbstractFileRegion { private final int writeCount; private final int writesPerCall; @@ -130,7 +129,7 @@ public long position() { } @Override - public long transfered() { + public long transferred() { return 8 * written; } diff --git a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala index 97abd92d4b70f..39249d411b582 100644 --- a/core/src/main/scala/org/apache/spark/storage/DiskStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/DiskStore.scala @@ -26,12 +26,11 @@ import java.util.concurrent.ConcurrentHashMap import scala.collection.mutable.ListBuffer import com.google.common.io.Closeables -import 
io.netty.channel.{DefaultFileRegion, FileRegion} -import io.netty.util.AbstractReferenceCounted +import io.netty.channel.DefaultFileRegion import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.internal.Logging -import org.apache.spark.network.util.JavaUtils +import org.apache.spark.network.util.{AbstractFileRegion, JavaUtils} import org.apache.spark.security.CryptoStreamUtils import org.apache.spark.util.Utils import org.apache.spark.util.io.ChunkedByteBuffer @@ -266,7 +265,7 @@ private class EncryptedBlockData( } private class ReadableChannelFileRegion(source: ReadableByteChannel, blockSize: Long) - extends AbstractReferenceCounted with FileRegion { + extends AbstractFileRegion { private var _transferred = 0L @@ -277,7 +276,7 @@ private class ReadableChannelFileRegion(source: ReadableByteChannel, blockSize: override def position(): Long = 0 - override def transfered(): Long = _transferred + override def transferred(): Long = _transferred override def transferTo(target: WritableByteChannel, pos: Long): Long = { assert(pos == transfered(), "Invalid position.") diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 1831f3378e852..fea642737cc0c 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -14,9 +14,9 @@ apacheds-kerberos-codec-2.0.0-M15.jar api-asn1-api-1.0.0-M20.jar api-util-1.0.0-M20.jar arpack_combined_all-0.1.jar -arrow-format-0.4.0.jar -arrow-memory-0.4.0.jar -arrow-vector-0.4.0.jar +arrow-format-0.8.0.jar +arrow-memory-0.8.0.jar +arrow-vector-0.8.0.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -82,7 +82,7 @@ hadoop-yarn-server-web-proxy-2.6.5.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar -hppc-0.7.1.jar +hppc-0.7.2.jar htrace-core-3.0.4.jar httpclient-4.5.2.jar httpcore-4.4.4.jar @@ -144,7 +144,7 @@ metrics-json-3.1.5.jar metrics-jvm-3.1.5.jar minlog-1.3.0.jar netty-3.9.9.Final.jar -netty-all-4.0.47.Final.jar +netty-all-4.1.17.Final.jar objenesis-2.1.jar opencsv-2.3.jar orc-core-1.4.1-nohive.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index fe14c05987327..6dd44333f21ca 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -14,9 +14,9 @@ apacheds-kerberos-codec-2.0.0-M15.jar api-asn1-api-1.0.0-M20.jar api-util-1.0.0-M20.jar arpack_combined_all-0.1.jar -arrow-format-0.4.0.jar -arrow-memory-0.4.0.jar -arrow-vector-0.4.0.jar +arrow-format-0.8.0.jar +arrow-memory-0.8.0.jar +arrow-vector-0.8.0.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -82,7 +82,7 @@ hadoop-yarn-server-web-proxy-2.7.3.jar hk2-api-2.4.0-b34.jar hk2-locator-2.4.0-b34.jar hk2-utils-2.4.0-b34.jar -hppc-0.7.1.jar +hppc-0.7.2.jar htrace-core-3.1.0-incubating.jar httpclient-4.5.2.jar httpcore-4.4.4.jar @@ -145,7 +145,7 @@ metrics-json-3.1.5.jar metrics-jvm-3.1.5.jar minlog-1.3.0.jar netty-3.9.9.Final.jar -netty-all-4.0.47.Final.jar +netty-all-4.1.17.Final.jar objenesis-2.1.jar opencsv-2.3.jar orc-core-1.4.1-nohive.jar diff --git a/pom.xml b/pom.xml index 52db79eaf036b..92f897095f087 100644 --- a/pom.xml +++ b/pom.xml @@ -185,7 +185,7 @@ 2.8 1.8 1.0.0 - 0.4.0 + 0.8.0 ${java.home} @@ -580,7 +580,7 @@ io.netty netty-all - 4.0.47.Final + 4.1.17.Final io.netty @@ -1972,6 +1972,14 @@ com.fasterxml.jackson.core jackson-databind + + io.netty + netty-buffer + + + io.netty + netty-common + io.netty netty-handler diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 
37e7cf3fa662e..88d6a191babca 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -223,27 +223,14 @@ def _create_batch(series, timezone): series = [series] series = ((s, None) if not isinstance(s, (list, tuple)) else s for s in series) - # If a nullable integer series has been promoted to floating point with NaNs, need to cast - # NOTE: this is not necessary with Arrow >= 0.7 - def cast_series(s, t): - if type(t) == pa.TimestampType: - # NOTE: convert to 'us' with astype here, unit ignored in `from_pandas` see ARROW-1680 - return _check_series_convert_timestamps_internal(s.fillna(0), timezone)\ - .values.astype('datetime64[us]', copy=False) - # NOTE: can not compare None with pyarrow.DataType(), fixed with Arrow >= 0.7.1 - elif t is not None and t == pa.date32(): - # TODO: this converts the series to Python objects, possibly avoid with Arrow >= 0.8 - return s.dt.date - elif t is None or s.dtype == t.to_pandas_dtype(): - return s - else: - return s.fillna(0).astype(t.to_pandas_dtype(), copy=False) - - # Some object types don't support masks in Arrow, see ARROW-1721 def create_array(s, t): - casted = cast_series(s, t) - mask = None if casted.dtype == 'object' else s.isnull() - return pa.Array.from_pandas(casted, mask=mask, type=t) + mask = s.isnull() + # Ensure timestamp series are in expected form for Spark internal representation + if t is not None and pa.types.is_timestamp(t): + s = _check_series_convert_timestamps_internal(s.fillna(0), timezone) + # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2 + return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False) + return pa.Array.from_pandas(s, mask=mask, type=t) arrs = [create_array(s, t) for s, t in series] return pa.RecordBatch.from_arrays(arrs, ["_%d" % i for i in xrange(len(arrs))]) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 75395a754a831..440684d3edfa6 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1906,7 +1906,9 @@ def toPandas(self): if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true": try: from pyspark.sql.types import _check_dataframe_localize_timestamps + from pyspark.sql.utils import _require_minimum_pyarrow_version import pyarrow + _require_minimum_pyarrow_version() tables = self._collectAsArrow() if tables: table = pyarrow.concat_tables(tables) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 54530055dfa85..ddd8df3b15bf6 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2159,16 +2159,17 @@ def pandas_udf(f=None, returnType=None, functionType=None): >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> from pyspark.sql.types import IntegerType, StringType - >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) - >>> @pandas_udf(StringType()) + >>> slen = pandas_udf(lambda s: s.str.len(), IntegerType()) # doctest: +SKIP + >>> @pandas_udf(StringType()) # doctest: +SKIP ... def to_upper(s): ... return s.str.upper() ... - >>> @pandas_udf("integer", PandasUDFType.SCALAR) + >>> @pandas_udf("integer", PandasUDFType.SCALAR) # doctest: +SKIP ... def add_one(x): ... return x + 1 ... - >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age")) + >>> df = spark.createDataFrame([(1, "John Doe", 21)], + ... ("id", "name", "age")) # doctest: +SKIP >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")) \\ ... 
.show() # doctest: +SKIP +----------+--------------+------------+ @@ -2189,8 +2190,8 @@ def pandas_udf(f=None, returnType=None, functionType=None): >>> from pyspark.sql.functions import pandas_udf, PandasUDFType >>> df = spark.createDataFrame( ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], - ... ("id", "v")) - >>> @pandas_udf("id long, v double", PandasUDFType.GROUP_MAP) + ... ("id", "v")) # doctest: +SKIP + >>> @pandas_udf("id long, v double", PandasUDFType.GROUP_MAP) # doctest: +SKIP ... def normalize(pdf): ... v = pdf.v ... return pdf.assign(v=(v - v.mean()) / v.std()) diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 4d47dd6a3e878..09fae46adf014 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -218,7 +218,7 @@ def apply(self, udf): >>> df = spark.createDataFrame( ... [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)], ... ("id", "v")) - >>> @pandas_udf("id long, v double", PandasUDFType.GROUP_MAP) + >>> @pandas_udf("id long, v double", PandasUDFType.GROUP_MAP) # doctest: +SKIP ... def normalize(pdf): ... v = pdf.v ... return pdf.assign(v=(v - v.mean()) / v.std()) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index e2435e09af23d..86db16eca7889 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -495,11 +495,14 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone): from pyspark.serializers import ArrowSerializer, _create_batch from pyspark.sql.types import from_arrow_schema, to_arrow_type, \ _old_pandas_exception_message, TimestampType + from pyspark.sql.utils import _require_minimum_pyarrow_version try: from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype except ImportError as e: raise ImportError(_old_pandas_exception_message(e)) + _require_minimum_pyarrow_version() + # Determine arrow types to coerce data when creating batches if isinstance(schema, StructType): arrow_types = [to_arrow_type(f.dataType) for f in schema.fields] diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index b4d32d8de8a22..6fdfda1cc831b 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3339,10 +3339,11 @@ def test_createDataFrame_with_single_data_type(self): self.spark.createDataFrame(pd.DataFrame({"a": [1]}), schema="int") def test_createDataFrame_does_not_modify_input(self): + import pandas as pd # Some series get converted for Spark to consume, this makes sure input is unchanged pdf = self.create_pandas_data_frame() # Use a nanosecond value to make sure it is not truncated - pdf.ix[0, '7_timestamp_t'] = 1 + pdf.ix[0, '7_timestamp_t'] = pd.Timestamp(1) # Integers with nulls will get NaNs filled with 0 and will be casted pdf.ix[1, '2_int_t'] = None pdf_copy = pdf.copy(deep=True) @@ -3356,6 +3357,7 @@ def test_schema_conversion_roundtrip(self): self.assertEquals(self.schema, schema_rt) +@unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class PandasUDFTests(ReusedSQLTestCase): def test_pandas_udf_basic(self): from pyspark.rdd import PythonEvalType @@ -3671,9 +3673,9 @@ def test_vectorized_udf_chained(self): def test_vectorized_udf_wrong_return_type(self): from pyspark.sql.functions import pandas_udf, col df = self.spark.range(10) - f = pandas_udf(lambda x: x * 1.0, StringType()) + f = pandas_udf(lambda x: x * 1.0, ArrayType(LongType())) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Invalid.*type'): + with self.assertRaisesRegexp(Exception, 
'Unsupported.*type.*conversion'): df.select(f(col('id'))).collect() def test_vectorized_udf_return_scalar(self): @@ -3974,12 +3976,12 @@ def test_wrong_return_type(self): foo = pandas_udf( lambda pdf: pdf, - 'id long, v string', + 'id long, v array', PandasUDFType.GROUP_MAP ) with QuietTest(self.sc): - with self.assertRaisesRegexp(Exception, 'Invalid.*type'): + with self.assertRaisesRegexp(Exception, 'Unsupported.*type.*conversion'): df.groupby('id').apply(foo).sort('id').toPandas() def test_wrong_args(self): diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 78abc32a35a1c..46d9a417414b5 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1642,29 +1642,28 @@ def to_arrow_schema(schema): def from_arrow_type(at): """ Convert pyarrow type to Spark data type. """ - # TODO: newer pyarrow has is_boolean(at) functions that would be better to check type - import pyarrow as pa - if at == pa.bool_(): + import pyarrow.types as types + if types.is_boolean(at): spark_type = BooleanType() - elif at == pa.int8(): + elif types.is_int8(at): spark_type = ByteType() - elif at == pa.int16(): + elif types.is_int16(at): spark_type = ShortType() - elif at == pa.int32(): + elif types.is_int32(at): spark_type = IntegerType() - elif at == pa.int64(): + elif types.is_int64(at): spark_type = LongType() - elif at == pa.float32(): + elif types.is_float32(at): spark_type = FloatType() - elif at == pa.float64(): + elif types.is_float64(at): spark_type = DoubleType() - elif type(at) == pa.DecimalType: + elif types.is_decimal(at): spark_type = DecimalType(precision=at.precision, scale=at.scale) - elif at == pa.string(): + elif types.is_string(at): spark_type = StringType() - elif at == pa.date32(): + elif types.is_date32(at): spark_type = DateType() - elif type(at) == pa.TimestampType: + elif types.is_timestamp(at): spark_type = TimestampType() else: raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index c3301a41ccd5a..50c87ba1ac882 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -33,19 +33,23 @@ def _wrap_function(sc, func, returnType): def _create_udf(f, returnType, evalType): - if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF: + + if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF or \ + evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF: import inspect + from pyspark.sql.utils import _require_minimum_pyarrow_version + + _require_minimum_pyarrow_version() argspec = inspect.getargspec(f) - if len(argspec.args) == 0 and argspec.varargs is None: + + if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF and len(argspec.args) == 0 and \ + argspec.varargs is None: raise ValueError( "Invalid function: 0-arg pandas_udfs are not supported. " "Instead, create a 1-arg pandas_udf and ignore the arg in your function." ) - elif evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF: - import inspect - argspec = inspect.getargspec(f) - if len(argspec.args) != 1: + if evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF and len(argspec.args) != 1: raise ValueError( "Invalid function: pandas_udfs with function type GROUP_MAP " "must take a single arg that is a pandas DataFrame." 
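The rewritten `create_array` above leans on two pyarrow 0.8 behaviors: `Array.from_pandas` accepts an explicit null mask, and the resulting array can be cast to the target type with `safe=False`, so the nanosecond timestamps pandas produces can be truncated to the microsecond unit Spark uses internally. A rough standalone sketch of that pattern follows; the sample series, the `pd.Timestamp(0)` fill value, and the target types are illustrative only, and the timezone handling done by `_check_series_convert_timestamps_internal` is omitted.
```
# Minimal sketch of the mask-then-cast pattern used by create_array above.
# Sample data and target types are illustrative, not taken from the patch.
import pandas as pd
import pyarrow as pa

# Nullable integers: pandas promotes them to float64 with NaN for the null,
# so pass the null mask explicitly and let Arrow produce an int64 array.
s_int = pd.Series([1.0, None, 3.0])
arr_int = pa.Array.from_pandas(s_int, mask=s_int.isnull(), type=pa.int64())
print(arr_int)  # values 1, null, 3

# Timestamps: pandas stores nanoseconds; cast after conversion with
# safe=False to allow the ns -> us truncation.
s_ts = pd.Series(pd.to_datetime(["2017-12-21 10:00:00.123456789", None]))
mask = s_ts.isnull()
arr_ts = (pa.Array.from_pandas(s_ts.fillna(pd.Timestamp(0)), mask=mask)
          .cast(pa.timestamp("us"), safe=False))
print(arr_ts)  # 2017-12-21 10:00:00.123456, null
```
The same cast-after-conversion idea is why the unsupported-type tests later in this patch expect an "Unsupported type in conversion" error instead of the old "Invalid type" message.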
diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index 7bc6a59ad3b26..cc7dabb64b3ec 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -110,3 +110,12 @@ def toJArray(gateway, jtype, arr): for i in range(0, len(arr)): jarr[i] = arr[i] return jarr + + +def _require_minimum_pyarrow_version(): + """ Raise ImportError if minimum version of pyarrow is not installed + """ + from distutils.version import LooseVersion + import pyarrow + if LooseVersion(pyarrow.__version__) < LooseVersion('0.8.0'): + raise ImportError("pyarrow >= 0.8.0 must be installed on calling Python process") diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java index d53e1fcab0c5a..528f66f342dc9 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java @@ -275,30 +275,30 @@ public byte[] getBinary(int rowId) { public ArrowColumnVector(ValueVector vector) { super(ArrowUtils.fromArrowField(vector.getField())); - if (vector instanceof NullableBitVector) { - accessor = new BooleanAccessor((NullableBitVector) vector); - } else if (vector instanceof NullableTinyIntVector) { - accessor = new ByteAccessor((NullableTinyIntVector) vector); - } else if (vector instanceof NullableSmallIntVector) { - accessor = new ShortAccessor((NullableSmallIntVector) vector); - } else if (vector instanceof NullableIntVector) { - accessor = new IntAccessor((NullableIntVector) vector); - } else if (vector instanceof NullableBigIntVector) { - accessor = new LongAccessor((NullableBigIntVector) vector); - } else if (vector instanceof NullableFloat4Vector) { - accessor = new FloatAccessor((NullableFloat4Vector) vector); - } else if (vector instanceof NullableFloat8Vector) { - accessor = new DoubleAccessor((NullableFloat8Vector) vector); - } else if (vector instanceof NullableDecimalVector) { - accessor = new DecimalAccessor((NullableDecimalVector) vector); - } else if (vector instanceof NullableVarCharVector) { - accessor = new StringAccessor((NullableVarCharVector) vector); - } else if (vector instanceof NullableVarBinaryVector) { - accessor = new BinaryAccessor((NullableVarBinaryVector) vector); - } else if (vector instanceof NullableDateDayVector) { - accessor = new DateAccessor((NullableDateDayVector) vector); - } else if (vector instanceof NullableTimeStampMicroTZVector) { - accessor = new TimestampAccessor((NullableTimeStampMicroTZVector) vector); + if (vector instanceof BitVector) { + accessor = new BooleanAccessor((BitVector) vector); + } else if (vector instanceof TinyIntVector) { + accessor = new ByteAccessor((TinyIntVector) vector); + } else if (vector instanceof SmallIntVector) { + accessor = new ShortAccessor((SmallIntVector) vector); + } else if (vector instanceof IntVector) { + accessor = new IntAccessor((IntVector) vector); + } else if (vector instanceof BigIntVector) { + accessor = new LongAccessor((BigIntVector) vector); + } else if (vector instanceof Float4Vector) { + accessor = new FloatAccessor((Float4Vector) vector); + } else if (vector instanceof Float8Vector) { + accessor = new DoubleAccessor((Float8Vector) vector); + } else if (vector instanceof DecimalVector) { + accessor = new DecimalAccessor((DecimalVector) vector); + } else if (vector instanceof VarCharVector) { + accessor = new StringAccessor((VarCharVector) 
vector); + } else if (vector instanceof VarBinaryVector) { + accessor = new BinaryAccessor((VarBinaryVector) vector); + } else if (vector instanceof DateDayVector) { + accessor = new DateAccessor((DateDayVector) vector); + } else if (vector instanceof TimeStampMicroTZVector) { + accessor = new TimestampAccessor((TimeStampMicroTZVector) vector); } else if (vector instanceof ListVector) { ListVector listVector = (ListVector) vector; accessor = new ArrayAccessor(listVector); @@ -321,23 +321,21 @@ public ArrowColumnVector(ValueVector vector) { private abstract static class ArrowVectorAccessor { private final ValueVector vector; - private final ValueVector.Accessor nulls; ArrowVectorAccessor(ValueVector vector) { this.vector = vector; - this.nulls = vector.getAccessor(); } final boolean isNullAt(int rowId) { - return nulls.isNull(rowId); + return vector.isNull(rowId); } final int getValueCount() { - return nulls.getValueCount(); + return vector.getValueCount(); } final int getNullCount() { - return nulls.getNullCount(); + return vector.getNullCount(); } final void close() { @@ -395,11 +393,11 @@ int getArrayOffset(int rowId) { private static class BooleanAccessor extends ArrowVectorAccessor { - private final NullableBitVector.Accessor accessor; + private final BitVector accessor; - BooleanAccessor(NullableBitVector vector) { + BooleanAccessor(BitVector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -410,11 +408,11 @@ final boolean getBoolean(int rowId) { private static class ByteAccessor extends ArrowVectorAccessor { - private final NullableTinyIntVector.Accessor accessor; + private final TinyIntVector accessor; - ByteAccessor(NullableTinyIntVector vector) { + ByteAccessor(TinyIntVector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -425,11 +423,11 @@ final byte getByte(int rowId) { private static class ShortAccessor extends ArrowVectorAccessor { - private final NullableSmallIntVector.Accessor accessor; + private final SmallIntVector accessor; - ShortAccessor(NullableSmallIntVector vector) { + ShortAccessor(SmallIntVector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -440,11 +438,11 @@ final short getShort(int rowId) { private static class IntAccessor extends ArrowVectorAccessor { - private final NullableIntVector.Accessor accessor; + private final IntVector accessor; - IntAccessor(NullableIntVector vector) { + IntAccessor(IntVector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -455,11 +453,11 @@ final int getInt(int rowId) { private static class LongAccessor extends ArrowVectorAccessor { - private final NullableBigIntVector.Accessor accessor; + private final BigIntVector accessor; - LongAccessor(NullableBigIntVector vector) { + LongAccessor(BigIntVector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -470,11 +468,11 @@ final long getLong(int rowId) { private static class FloatAccessor extends ArrowVectorAccessor { - private final NullableFloat4Vector.Accessor accessor; + private final Float4Vector accessor; - FloatAccessor(NullableFloat4Vector vector) { + FloatAccessor(Float4Vector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -485,11 +483,11 @@ final float getFloat(int rowId) { private static class DoubleAccessor extends ArrowVectorAccessor { 
- private final NullableFloat8Vector.Accessor accessor; + private final Float8Vector accessor; - DoubleAccessor(NullableFloat8Vector vector) { + DoubleAccessor(Float8Vector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -500,11 +498,11 @@ final double getDouble(int rowId) { private static class DecimalAccessor extends ArrowVectorAccessor { - private final NullableDecimalVector.Accessor accessor; + private final DecimalVector accessor; - DecimalAccessor(NullableDecimalVector vector) { + DecimalAccessor(DecimalVector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -516,12 +514,12 @@ final Decimal getDecimal(int rowId, int precision, int scale) { private static class StringAccessor extends ArrowVectorAccessor { - private final NullableVarCharVector.Accessor accessor; + private final VarCharVector accessor; private final NullableVarCharHolder stringResult = new NullableVarCharHolder(); - StringAccessor(NullableVarCharVector vector) { + StringAccessor(VarCharVector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -539,11 +537,11 @@ final UTF8String getUTF8String(int rowId) { private static class BinaryAccessor extends ArrowVectorAccessor { - private final NullableVarBinaryVector.Accessor accessor; + private final VarBinaryVector accessor; - BinaryAccessor(NullableVarBinaryVector vector) { + BinaryAccessor(VarBinaryVector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -554,11 +552,11 @@ final byte[] getBinary(int rowId) { private static class DateAccessor extends ArrowVectorAccessor { - private final NullableDateDayVector.Accessor accessor; + private final DateDayVector accessor; - DateAccessor(NullableDateDayVector vector) { + DateAccessor(DateDayVector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -569,11 +567,11 @@ final int getInt(int rowId) { private static class TimestampAccessor extends ArrowVectorAccessor { - private final NullableTimeStampMicroTZVector.Accessor accessor; + private final TimeStampMicroTZVector accessor; - TimestampAccessor(NullableTimeStampMicroTZVector vector) { + TimestampAccessor(TimeStampMicroTZVector vector) { super(vector); - this.accessor = vector.getAccessor(); + this.accessor = vector; } @Override @@ -584,21 +582,21 @@ final long getLong(int rowId) { private static class ArrayAccessor extends ArrowVectorAccessor { - private final UInt4Vector.Accessor accessor; + private final ListVector accessor; ArrayAccessor(ListVector vector) { super(vector); - this.accessor = vector.getOffsetVector().getAccessor(); + this.accessor = vector; } @Override final int getArrayLength(int rowId) { - return accessor.get(rowId + 1) - accessor.get(rowId); + return accessor.getInnerValueCountAt(rowId); } @Override final int getArrayOffset(int rowId) { - return accessor.get(rowId); + return accessor.getOffsetBuffer().getInt(rowId * accessor.OFFSET_WIDTH); } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala index 3cafb344ef553..bcfc412430263 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala @@ -24,8 +24,8 @@ import 
scala.collection.JavaConverters._ import org.apache.arrow.memory.BufferAllocator import org.apache.arrow.vector._ -import org.apache.arrow.vector.file._ -import org.apache.arrow.vector.schema.ArrowRecordBatch +import org.apache.arrow.vector.ipc.{ArrowFileReader, ArrowFileWriter} +import org.apache.arrow.vector.ipc.message.ArrowRecordBatch import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel import org.apache.spark.TaskContext @@ -86,13 +86,9 @@ private[sql] object ArrowConverters { val root = VectorSchemaRoot.create(arrowSchema, allocator) val arrowWriter = ArrowWriter.create(root) - var closed = false - context.addTaskCompletionListener { _ => - if (!closed) { - root.close() - allocator.close() - } + root.close() + allocator.close() } new Iterator[ArrowPayload] { @@ -100,7 +96,6 @@ private[sql] object ArrowConverters { override def hasNext: Boolean = rowIter.hasNext || { root.close() allocator.close() - closed = true false } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala index e4af4f65da127..0258056d9de49 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala @@ -46,17 +46,17 @@ object ArrowWriter { private def createFieldWriter(vector: ValueVector): ArrowFieldWriter = { val field = vector.getField() (ArrowUtils.fromArrowField(field), vector) match { - case (BooleanType, vector: NullableBitVector) => new BooleanWriter(vector) - case (ByteType, vector: NullableTinyIntVector) => new ByteWriter(vector) - case (ShortType, vector: NullableSmallIntVector) => new ShortWriter(vector) - case (IntegerType, vector: NullableIntVector) => new IntegerWriter(vector) - case (LongType, vector: NullableBigIntVector) => new LongWriter(vector) - case (FloatType, vector: NullableFloat4Vector) => new FloatWriter(vector) - case (DoubleType, vector: NullableFloat8Vector) => new DoubleWriter(vector) - case (StringType, vector: NullableVarCharVector) => new StringWriter(vector) - case (BinaryType, vector: NullableVarBinaryVector) => new BinaryWriter(vector) - case (DateType, vector: NullableDateDayVector) => new DateWriter(vector) - case (TimestampType, vector: NullableTimeStampMicroTZVector) => new TimestampWriter(vector) + case (BooleanType, vector: BitVector) => new BooleanWriter(vector) + case (ByteType, vector: TinyIntVector) => new ByteWriter(vector) + case (ShortType, vector: SmallIntVector) => new ShortWriter(vector) + case (IntegerType, vector: IntVector) => new IntegerWriter(vector) + case (LongType, vector: BigIntVector) => new LongWriter(vector) + case (FloatType, vector: Float4Vector) => new FloatWriter(vector) + case (DoubleType, vector: Float8Vector) => new DoubleWriter(vector) + case (StringType, vector: VarCharVector) => new StringWriter(vector) + case (BinaryType, vector: VarBinaryVector) => new BinaryWriter(vector) + case (DateType, vector: DateDayVector) => new DateWriter(vector) + case (TimestampType, vector: TimeStampMicroTZVector) => new TimestampWriter(vector) case (ArrayType(_, _), vector: ListVector) => val elementVector = createFieldWriter(vector.getDataVector()) new ArrayWriter(vector, elementVector) @@ -103,7 +103,6 @@ class ArrowWriter(val root: VectorSchemaRoot, fields: Array[ArrowFieldWriter]) { private[arrow] abstract class ArrowFieldWriter { def valueVector: ValueVector - def valueMutator: ValueVector.Mutator def 
name: String = valueVector.getField().getName() def dataType: DataType = ArrowUtils.fromArrowField(valueVector.getField()) @@ -124,161 +123,144 @@ private[arrow] abstract class ArrowFieldWriter { } def finish(): Unit = { - valueMutator.setValueCount(count) + valueVector.setValueCount(count) } def reset(): Unit = { - valueMutator.reset() + // TODO: reset() should be in a common interface + valueVector match { + case fixedWidthVector: BaseFixedWidthVector => fixedWidthVector.reset() + case variableWidthVector: BaseVariableWidthVector => variableWidthVector.reset() + case _ => + } count = 0 } } -private[arrow] class BooleanWriter(val valueVector: NullableBitVector) extends ArrowFieldWriter { - - override def valueMutator: NullableBitVector#Mutator = valueVector.getMutator() +private[arrow] class BooleanWriter(val valueVector: BitVector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { - valueMutator.setSafe(count, if (input.getBoolean(ordinal)) 1 else 0) + valueVector.setSafe(count, if (input.getBoolean(ordinal)) 1 else 0) } } -private[arrow] class ByteWriter(val valueVector: NullableTinyIntVector) extends ArrowFieldWriter { - - override def valueMutator: NullableTinyIntVector#Mutator = valueVector.getMutator() +private[arrow] class ByteWriter(val valueVector: TinyIntVector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { - valueMutator.setSafe(count, input.getByte(ordinal)) + valueVector.setSafe(count, input.getByte(ordinal)) } } -private[arrow] class ShortWriter(val valueVector: NullableSmallIntVector) extends ArrowFieldWriter { - - override def valueMutator: NullableSmallIntVector#Mutator = valueVector.getMutator() +private[arrow] class ShortWriter(val valueVector: SmallIntVector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { - valueMutator.setSafe(count, input.getShort(ordinal)) + valueVector.setSafe(count, input.getShort(ordinal)) } } -private[arrow] class IntegerWriter(val valueVector: NullableIntVector) extends ArrowFieldWriter { - - override def valueMutator: NullableIntVector#Mutator = valueVector.getMutator() +private[arrow] class IntegerWriter(val valueVector: IntVector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { - valueMutator.setSafe(count, input.getInt(ordinal)) + valueVector.setSafe(count, input.getInt(ordinal)) } } -private[arrow] class LongWriter(val valueVector: NullableBigIntVector) extends ArrowFieldWriter { - - override def valueMutator: NullableBigIntVector#Mutator = valueVector.getMutator() +private[arrow] class LongWriter(val valueVector: BigIntVector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { - valueMutator.setSafe(count, input.getLong(ordinal)) + valueVector.setSafe(count, input.getLong(ordinal)) } } -private[arrow] class FloatWriter(val valueVector: NullableFloat4Vector) extends ArrowFieldWriter { - - override def valueMutator: 
NullableFloat4Vector#Mutator = valueVector.getMutator() +private[arrow] class FloatWriter(val valueVector: Float4Vector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { - valueMutator.setSafe(count, input.getFloat(ordinal)) + valueVector.setSafe(count, input.getFloat(ordinal)) } } -private[arrow] class DoubleWriter(val valueVector: NullableFloat8Vector) extends ArrowFieldWriter { - - override def valueMutator: NullableFloat8Vector#Mutator = valueVector.getMutator() +private[arrow] class DoubleWriter(val valueVector: Float8Vector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { - valueMutator.setSafe(count, input.getDouble(ordinal)) + valueVector.setSafe(count, input.getDouble(ordinal)) } } -private[arrow] class StringWriter(val valueVector: NullableVarCharVector) extends ArrowFieldWriter { - - override def valueMutator: NullableVarCharVector#Mutator = valueVector.getMutator() +private[arrow] class StringWriter(val valueVector: VarCharVector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { val utf8 = input.getUTF8String(ordinal) val utf8ByteBuffer = utf8.getByteBuffer // todo: for off-heap UTF8String, how to pass in to arrow without copy? - valueMutator.setSafe(count, utf8ByteBuffer, utf8ByteBuffer.position(), utf8.numBytes()) + valueVector.setSafe(count, utf8ByteBuffer, utf8ByteBuffer.position(), utf8.numBytes()) } } private[arrow] class BinaryWriter( - val valueVector: NullableVarBinaryVector) extends ArrowFieldWriter { - - override def valueMutator: NullableVarBinaryVector#Mutator = valueVector.getMutator() + val valueVector: VarBinaryVector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { val bytes = input.getBinary(ordinal) - valueMutator.setSafe(count, bytes, 0, bytes.length) + valueVector.setSafe(count, bytes, 0, bytes.length) } } -private[arrow] class DateWriter(val valueVector: NullableDateDayVector) extends ArrowFieldWriter { - - override def valueMutator: NullableDateDayVector#Mutator = valueVector.getMutator() +private[arrow] class DateWriter(val valueVector: DateDayVector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { - valueMutator.setSafe(count, input.getInt(ordinal)) + valueVector.setSafe(count, input.getInt(ordinal)) } } private[arrow] class TimestampWriter( - val valueVector: NullableTimeStampMicroTZVector) extends ArrowFieldWriter { - - override def valueMutator: NullableTimeStampMicroTZVector#Mutator = valueVector.getMutator() + val valueVector: TimeStampMicroTZVector) extends ArrowFieldWriter { override def setNull(): Unit = { - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { - valueMutator.setSafe(count, input.getLong(ordinal)) + valueVector.setSafe(count, input.getLong(ordinal)) } } @@ -286,20 +268,18 @@ private[arrow] class ArrayWriter( val valueVector: ListVector, 
val elementWriter: ArrowFieldWriter) extends ArrowFieldWriter { - override def valueMutator: ListVector#Mutator = valueVector.getMutator() - override def setNull(): Unit = { } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { val array = input.getArray(ordinal) var i = 0 - valueMutator.startNewValue(count) + valueVector.startNewValue(count) while (i < array.numElements()) { elementWriter.write(array, i) i += 1 } - valueMutator.endValue(count, array.numElements()) + valueVector.endValue(count, array.numElements()) } override def finish(): Unit = { @@ -317,8 +297,6 @@ private[arrow] class StructWriter( val valueVector: NullableMapVector, children: Array[ArrowFieldWriter]) extends ArrowFieldWriter { - override def valueMutator: NullableMapVector#Mutator = valueVector.getMutator() - override def setNull(): Unit = { var i = 0 while (i < children.length) { @@ -326,7 +304,7 @@ private[arrow] class StructWriter( children(i).count += 1 i += 1 } - valueMutator.setNull(count) + valueVector.setNull(count) } override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { @@ -336,7 +314,7 @@ private[arrow] class StructWriter( children(i).write(struct, i) i += 1 } - valueMutator.setIndexDefined(count) + valueVector.setIndexDefined(count) } override def finish(): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index 9a94d771a01b0..5cc8ed3535654 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -24,7 +24,7 @@ import java.util.concurrent.atomic.AtomicBoolean import scala.collection.JavaConverters._ import org.apache.arrow.vector.VectorSchemaRoot -import org.apache.arrow.vector.stream.{ArrowStreamReader, ArrowStreamWriter} +import org.apache.arrow.vector.ipc.{ArrowStreamReader, ArrowStreamWriter} import org.apache.spark._ import org.apache.spark.api.python._ @@ -74,13 +74,9 @@ class ArrowPythonRunner( val root = VectorSchemaRoot.create(arrowSchema, allocator) val arrowWriter = ArrowWriter.create(root) - var closed = false - context.addTaskCompletionListener { _ => - if (!closed) { - root.close() - allocator.close() - } + root.close() + allocator.close() } val writer = new ArrowStreamWriter(root, null, dataOut) @@ -102,7 +98,6 @@ class ArrowPythonRunner( writer.end() root.close() allocator.close() - closed = true } } } @@ -126,18 +121,11 @@ class ArrowPythonRunner( private var schema: StructType = _ private var vectors: Array[ColumnVector] = _ - private var closed = false - context.addTaskCompletionListener { _ => - // todo: we need something like `reader.end()`, which release all the resources, but leave - // the input stream open. `reader.close()` will close the socket and we can't reuse worker. - // So here we simply not close the reader, which is problematic. - if (!closed) { - if (root != null) { - root.close() - } - allocator.close() + if (reader != null) { + reader.close(false) } + allocator.close() } private var batchLoaded = true @@ -154,9 +142,8 @@ class ArrowPythonRunner( batch.setNumRows(root.getRowCount) batch } else { - root.close() + reader.close(false) allocator.close() - closed = true // Reach end of stream. Call `read()` again to read control data. 
read() } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala index 57958f7239224..fd5a3df6abc68 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala @@ -25,7 +25,7 @@ import java.util.Locale import com.google.common.io.Files import org.apache.arrow.memory.RootAllocator import org.apache.arrow.vector.{VectorLoader, VectorSchemaRoot} -import org.apache.arrow.vector.file.json.JsonFileReader +import org.apache.arrow.vector.ipc.JsonFileReader import org.apache.arrow.vector.util.Validator import org.scalatest.BeforeAndAfterAll @@ -76,16 +76,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 16 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 16 - | } ] - | } + | "children" : [ ] | }, { | "name" : "b_s", | "type" : { @@ -94,16 +85,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 16 | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 16 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -143,16 +125,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | }, { | "name" : "b_i", | "type" : { @@ -161,16 +134,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -210,16 +174,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 64 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } + | "children" : [ ] | }, { | "name" : "b_l", | "type" : { @@ -228,16 +183,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 64 | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -276,16 +222,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "precision" : "SINGLE" | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | }, { | "name" : "b_f", | "type" : { @@ -293,16 +230,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | 
"precision" : "SINGLE" | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -341,16 +269,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "precision" : "DOUBLE" | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } + | "children" : [ ] | }, { | "name" : "b_d", | "type" : { @@ -358,16 +277,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "precision" : "DOUBLE" | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -408,16 +318,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -449,16 +350,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 16 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 16 - | } ] - | } + | "children" : [ ] | }, { | "name" : "b", | "type" : { @@ -466,16 +358,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "precision" : "SINGLE" | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | }, { | "name" : "c", | "type" : { @@ -484,16 +367,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | }, { | "name" : "d", | "type" : { @@ -501,16 +375,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "precision" : "DOUBLE" | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } + | "children" : [ ] | }, { | "name" : "e", | "type" : { @@ -519,16 +384,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 64 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -583,57 +439,21 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "name" : "utf8" | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | 
"typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 8 - | } ] - | } + | "children" : [ ] | }, { | "name" : "lower_case", | "type" : { | "name" : "utf8" | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 8 - | } ] - | } + | "children" : [ ] | }, { | "name" : "null_str", | "type" : { | "name" : "utf8" | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 8 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -681,16 +501,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "name" : "bool" | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 1 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -721,16 +532,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 8 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 8 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -760,19 +562,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "name" : "binary" | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 8 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -807,16 +597,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "unit" : "DAY" | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -855,16 +636,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "timezone" : "America/Los_Angeles" | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -904,16 +676,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "precision" : "SINGLE" | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | }, { | "name" : "NaN_d", | "type" : { @@ -921,16 +684,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "precision" : "DOUBLE" | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 64 - | } ] - | } + | "children" : [ ] | } ] | 
}, | "batches" : [ { @@ -939,12 +693,12 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "name" : "NaN_f", | "count" : 2, | "VALIDITY" : [ 1, 1 ], - | "DATA" : [ 1.2000000476837158, "NaN" ] + | "DATA" : [ 1.2000000476837158, NaN ] | }, { | "name" : "NaN_d", | "count" : 2, | "VALIDITY" : [ 1, 1 ], - | "DATA" : [ "NaN", 1.2 ] + | "DATA" : [ NaN, 1.2 ] | } ] | } ] |} @@ -976,26 +730,8 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32, | "isSigned" : true | }, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] + | } ] | }, { | "name" : "b_arr", | "nullable" : true, @@ -1010,26 +746,8 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32, | "isSigned" : true | }, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] + | } ] | }, { | "name" : "c_arr", | "nullable" : true, @@ -1044,26 +762,8 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32, | "isSigned" : true | }, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] + | } ] | }, { | "name" : "d_arr", | "nullable" : true, @@ -1084,36 +784,9 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32, | "isSigned" : true | }, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 - | } ] - | } - | } ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "OFFSET", - | "typeBitWidth" : 32 + | "children" : [ ] | } ] - | } + | } ] | } ] | }, | "batches" : [ { @@ -1204,23 +877,8 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32, | "isSigned" : true | }, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | } ] - | } + | "children" : [ ] + | } ] | }, { | "name" : "b_struct", | "nullable" : true, @@ -1235,23 +893,8 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32, | "isSigned" : true | }, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : 
"VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | } ] - | } + | "children" : [ ] + | } ] | }, { | "name" : "c_struct", | "nullable" : false, @@ -1266,23 +909,8 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32, | "isSigned" : true | }, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | } ] - | } + | "children" : [ ] + | } ] | }, { | "name" : "d_struct", | "nullable" : true, @@ -1303,30 +931,9 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32, | "isSigned" : true | }, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } - | } ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | } ] - | } - | } ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 + | "children" : [ ] | } ] - | } + | } ] | } ] | }, | "batches" : [ { @@ -1413,16 +1020,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | }, { | "name" : "b", | "type" : { @@ -1431,16 +1029,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -1471,16 +1060,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | }, { | "name" : "b", | "type" : { @@ -1489,16 +1069,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -1600,16 +1171,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | }, { | "name" : "b_i", | "type" : { @@ -1618,16 +1180,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | 
"typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { @@ -1658,16 +1211,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : true, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | }, { | "name" : "a_i", | "type" : { @@ -1676,16 +1220,7 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { | "bitWidth" : 32 | }, | "nullable" : false, - | "children" : [ ], - | "typeLayout" : { - | "vectors" : [ { - | "type" : "VALIDITY", - | "typeBitWidth" : 1 - | }, { - | "type" : "DATA", - | "typeBitWidth" : 32 - | } ] - | } + | "children" : [ ] | } ] | }, | "batches" : [ { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index e460d0721e7bf..03490ad15a655 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -30,15 +30,14 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("boolean") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("boolean", 0, Long.MaxValue) val vector = ArrowUtils.toArrowField("boolean", BooleanType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableBitVector] + .createVector(allocator).asInstanceOf[BitVector] vector.allocateNew() - val mutator = vector.getMutator() (0 until 10).foreach { i => - mutator.setSafe(i, if (i % 2 == 0) 1 else 0) + vector.setSafe(i, if (i % 2 == 0) 1 else 0) } - mutator.setNull(10) - mutator.setValueCount(11) + vector.setNull(10) + vector.setValueCount(11) val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === BooleanType) @@ -58,15 +57,14 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("byte") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("byte", 0, Long.MaxValue) val vector = ArrowUtils.toArrowField("byte", ByteType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableTinyIntVector] + .createVector(allocator).asInstanceOf[TinyIntVector] vector.allocateNew() - val mutator = vector.getMutator() (0 until 10).foreach { i => - mutator.setSafe(i, i.toByte) + vector.setSafe(i, i.toByte) } - mutator.setNull(10) - mutator.setValueCount(11) + vector.setNull(10) + vector.setValueCount(11) val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === ByteType) @@ -86,15 +84,14 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("short") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("short", 0, Long.MaxValue) val vector = ArrowUtils.toArrowField("short", ShortType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableSmallIntVector] + .createVector(allocator).asInstanceOf[SmallIntVector] vector.allocateNew() - val mutator = vector.getMutator() (0 until 10).foreach { i => - mutator.setSafe(i, i.toShort) + vector.setSafe(i, i.toShort) } - mutator.setNull(10) - mutator.setValueCount(11) + vector.setNull(10) + vector.setValueCount(11) val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === ShortType) @@ -114,15 +111,14 @@ class 
ArrowColumnVectorSuite extends SparkFunSuite { test("int") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("int", 0, Long.MaxValue) val vector = ArrowUtils.toArrowField("int", IntegerType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableIntVector] + .createVector(allocator).asInstanceOf[IntVector] vector.allocateNew() - val mutator = vector.getMutator() (0 until 10).foreach { i => - mutator.setSafe(i, i) + vector.setSafe(i, i) } - mutator.setNull(10) - mutator.setValueCount(11) + vector.setNull(10) + vector.setValueCount(11) val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === IntegerType) @@ -142,15 +138,14 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("long") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("long", 0, Long.MaxValue) val vector = ArrowUtils.toArrowField("long", LongType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableBigIntVector] + .createVector(allocator).asInstanceOf[BigIntVector] vector.allocateNew() - val mutator = vector.getMutator() (0 until 10).foreach { i => - mutator.setSafe(i, i.toLong) + vector.setSafe(i, i.toLong) } - mutator.setNull(10) - mutator.setValueCount(11) + vector.setNull(10) + vector.setValueCount(11) val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === LongType) @@ -170,15 +165,14 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("float") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("float", 0, Long.MaxValue) val vector = ArrowUtils.toArrowField("float", FloatType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableFloat4Vector] + .createVector(allocator).asInstanceOf[Float4Vector] vector.allocateNew() - val mutator = vector.getMutator() (0 until 10).foreach { i => - mutator.setSafe(i, i.toFloat) + vector.setSafe(i, i.toFloat) } - mutator.setNull(10) - mutator.setValueCount(11) + vector.setNull(10) + vector.setValueCount(11) val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === FloatType) @@ -198,15 +192,14 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("double") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("double", 0, Long.MaxValue) val vector = ArrowUtils.toArrowField("double", DoubleType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableFloat8Vector] + .createVector(allocator).asInstanceOf[Float8Vector] vector.allocateNew() - val mutator = vector.getMutator() (0 until 10).foreach { i => - mutator.setSafe(i, i.toDouble) + vector.setSafe(i, i.toDouble) } - mutator.setNull(10) - mutator.setValueCount(11) + vector.setNull(10) + vector.setValueCount(11) val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === DoubleType) @@ -226,16 +219,15 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("string") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("string", 0, Long.MaxValue) val vector = ArrowUtils.toArrowField("string", StringType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableVarCharVector] + .createVector(allocator).asInstanceOf[VarCharVector] vector.allocateNew() - val mutator = vector.getMutator() (0 until 10).foreach { i => val utf8 = s"str$i".getBytes("utf8") - mutator.setSafe(i, utf8, 0, utf8.length) + vector.setSafe(i, utf8, 0, utf8.length) } - mutator.setNull(10) - mutator.setValueCount(11) + vector.setNull(10) + vector.setValueCount(11) val columnVector = new ArrowColumnVector(vector) 
assert(columnVector.dataType === StringType) @@ -253,16 +245,15 @@ class ArrowColumnVectorSuite extends SparkFunSuite { test("binary") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("binary", 0, Long.MaxValue) val vector = ArrowUtils.toArrowField("binary", BinaryType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableVarBinaryVector] + .createVector(allocator).asInstanceOf[VarBinaryVector] vector.allocateNew() - val mutator = vector.getMutator() (0 until 10).foreach { i => val utf8 = s"str$i".getBytes("utf8") - mutator.setSafe(i, utf8, 0, utf8.length) + vector.setSafe(i, utf8, 0, utf8.length) } - mutator.setNull(10) - mutator.setValueCount(11) + vector.setNull(10) + vector.setValueCount(11) val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === BinaryType) @@ -282,31 +273,29 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val vector = ArrowUtils.toArrowField("array", ArrayType(IntegerType), nullable = true, null) .createVector(allocator).asInstanceOf[ListVector] vector.allocateNew() - val mutator = vector.getMutator() - val elementVector = vector.getDataVector().asInstanceOf[NullableIntVector] - val elementMutator = elementVector.getMutator() + val elementVector = vector.getDataVector().asInstanceOf[IntVector] // [1, 2] - mutator.startNewValue(0) - elementMutator.setSafe(0, 1) - elementMutator.setSafe(1, 2) - mutator.endValue(0, 2) + vector.startNewValue(0) + elementVector.setSafe(0, 1) + elementVector.setSafe(1, 2) + vector.endValue(0, 2) // [3, null, 5] - mutator.startNewValue(1) - elementMutator.setSafe(2, 3) - elementMutator.setNull(3) - elementMutator.setSafe(4, 5) - mutator.endValue(1, 3) + vector.startNewValue(1) + elementVector.setSafe(2, 3) + elementVector.setNull(3) + elementVector.setSafe(4, 5) + vector.endValue(1, 3) // null // [] - mutator.startNewValue(3) - mutator.endValue(3, 0) + vector.startNewValue(3) + vector.endValue(3, 0) - elementMutator.setValueCount(5) - mutator.setValueCount(4) + elementVector.setValueCount(5) + vector.setValueCount(4) val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === ArrayType(IntegerType)) @@ -338,38 +327,35 @@ class ArrowColumnVectorSuite extends SparkFunSuite { val vector = ArrowUtils.toArrowField("struct", schema, nullable = true, null) .createVector(allocator).asInstanceOf[NullableMapVector] vector.allocateNew() - val mutator = vector.getMutator() - val intVector = vector.getChildByOrdinal(0).asInstanceOf[NullableIntVector] - val intMutator = intVector.getMutator() - val longVector = vector.getChildByOrdinal(1).asInstanceOf[NullableBigIntVector] - val longMutator = longVector.getMutator() + val intVector = vector.getChildByOrdinal(0).asInstanceOf[IntVector] + val longVector = vector.getChildByOrdinal(1).asInstanceOf[BigIntVector] // (1, 1L) - mutator.setIndexDefined(0) - intMutator.setSafe(0, 1) - longMutator.setSafe(0, 1L) + vector.setIndexDefined(0) + intVector.setSafe(0, 1) + longVector.setSafe(0, 1L) // (2, null) - mutator.setIndexDefined(1) - intMutator.setSafe(1, 2) - longMutator.setNull(1) + vector.setIndexDefined(1) + intVector.setSafe(1, 2) + longVector.setNull(1) // (null, 3L) - mutator.setIndexDefined(2) - intMutator.setNull(2) - longMutator.setSafe(2, 3L) + vector.setIndexDefined(2) + intVector.setNull(2) + longVector.setSafe(2, 3L) // null - mutator.setNull(3) + vector.setNull(3) // (5, 5L) - mutator.setIndexDefined(4) - intMutator.setSafe(4, 5) - longMutator.setSafe(4, 5L) + vector.setIndexDefined(4) + intVector.setSafe(4, 5) + 
longVector.setSafe(4, 5L) - intMutator.setValueCount(5) - longMutator.setValueCount(5) - mutator.setValueCount(5) + intVector.setValueCount(5) + longVector.setValueCount(5) + vector.setValueCount(5) val columnVector = new ArrowColumnVector(vector) assert(columnVector.dataType === schema) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index d3ed8276b8f10..7848ebdcab6d0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.Random -import org.apache.arrow.vector.NullableIntVector +import org.apache.arrow.vector.IntVector import org.apache.spark.SparkFunSuite import org.apache.spark.memory.MemoryMode @@ -1137,22 +1137,20 @@ class ColumnarBatchSuite extends SparkFunSuite { test("create columnar batch from Arrow column vectors") { val allocator = ArrowUtils.rootAllocator.newChildAllocator("int", 0, Long.MaxValue) val vector1 = ArrowUtils.toArrowField("int1", IntegerType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableIntVector] + .createVector(allocator).asInstanceOf[IntVector] vector1.allocateNew() - val mutator1 = vector1.getMutator() val vector2 = ArrowUtils.toArrowField("int2", IntegerType, nullable = true, null) - .createVector(allocator).asInstanceOf[NullableIntVector] + .createVector(allocator).asInstanceOf[IntVector] vector2.allocateNew() - val mutator2 = vector2.getMutator() (0 until 10).foreach { i => - mutator1.setSafe(i, i) - mutator2.setSafe(i + 1, i) + vector1.setSafe(i, i) + vector2.setSafe(i + 1, i) } - mutator1.setNull(10) - mutator1.setValueCount(11) - mutator2.setNull(0) - mutator2.setValueCount(11) + vector1.setNull(10) + vector1.setValueCount(11) + vector2.setNull(0) + vector2.setValueCount(11) val columnVectors = Seq(new ArrowColumnVector(vector1), new ArrowColumnVector(vector2)) From 0abaf31be7ab9e030ea9433938b9123596954814 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Thu, 21 Dec 2017 09:38:21 -0600 Subject: [PATCH 165/356] [SPARK-22852][BUILD] Exclude -Xlint:unchecked from sbt javadoc flags ## What changes were proposed in this pull request? Moves the -Xlint:unchecked flag in the sbt build configuration from Compile to (Compile, compile) scope, allowing publish and publishLocal commands to work. ## How was this patch tested? Successfully published the spark-launcher subproject from within sbt successfully, where it fails without this patch. Author: Erik LaBianca Closes #20040 from easel/javadoc-xlint. --- project/SparkBuild.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 75703380cdb4a..83054945afcf0 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -239,14 +239,14 @@ object SparkBuild extends PomBuild { javacOptions in Compile ++= Seq( "-encoding", "UTF-8", - "-source", javacJVMVersion.value, - "-Xlint:unchecked" + "-source", javacJVMVersion.value ), - // This -target option cannot be set in the Compile configuration scope since `javadoc` doesn't - // play nicely with it; see https://github.com/sbt/sbt/issues/355#issuecomment-3817629 for - // additional discussion and explanation. 
+ // This -target and Xlint:unchecked options cannot be set in the Compile configuration scope since + // `javadoc` doesn't play nicely with them; see https://github.com/sbt/sbt/issues/355#issuecomment-3817629 + // for additional discussion and explanation. javacOptions in (Compile, compile) ++= Seq( - "-target", javacJVMVersion.value + "-target", javacJVMVersion.value, + "-Xlint:unchecked" ), scalacOptions in Compile ++= Seq( From 4c2efde9314a5f67052ac87bfa1472ebb9aca74a Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Thu, 21 Dec 2017 10:08:38 -0600 Subject: [PATCH 166/356] [SPARK-22855][BUILD] Add -no-java-comments to sbt docs/scalacOptions Prevents Scala 2.12 scaladoc from blowing up attempting to parse java comments. ## What changes were proposed in this pull request? Adds -no-java-comments to docs/scalacOptions under Scala 2.12. Also moves scaladoc configs out of the TestSettings and into the standard sharedSettings section in SparkBuild.scala. ## How was this patch tested? SBT_OPTS=-Dscala-2.12 sbt ++2.12.4 tags/publishLocal Author: Erik LaBianca Closes #20042 from easel/scaladoc-212. --- project/SparkBuild.scala | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 83054945afcf0..7469f11df0294 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -254,6 +254,21 @@ object SparkBuild extends PomBuild { "-sourcepath", (baseDirectory in ThisBuild).value.getAbsolutePath // Required for relative source links in scaladoc ), + // Remove certain packages from Scaladoc + scalacOptions in (Compile, doc) := Seq( + "-groups", + "-skip-packages", Seq( + "org.apache.spark.api.python", + "org.apache.spark.network", + "org.apache.spark.deploy", + "org.apache.spark.util.collection" + ).mkString(":"), + "-doc-title", "Spark " + version.value.replaceAll("-SNAPSHOT", "") + " ScalaDoc" + ) ++ { + // Do not attempt to scaladoc javadoc comments under 2.12 since it can't handle inner classes + if (scalaBinaryVersion.value == "2.12") Seq("-no-java-comments") else Seq.empty + }, + // Implements -Xfatal-warnings, ignoring deprecation warnings. // Code snippet taken from https://issues.scala-lang.org/browse/SI-8410. compile in Compile := { @@ -828,18 +843,7 @@ object TestSettings { } Seq.empty[File] }).value, - concurrentRestrictions in Global += Tags.limit(Tags.Test, 1), - // Remove certain packages from Scaladoc - scalacOptions in (Compile, doc) := Seq( - "-groups", - "-skip-packages", Seq( - "org.apache.spark.api.python", - "org.apache.spark.network", - "org.apache.spark.deploy", - "org.apache.spark.util.collection" - ).mkString(":"), - "-doc-title", "Spark " + version.value.replaceAll("-SNAPSHOT", "") + " ScalaDoc" - ) + concurrentRestrictions in Global += Tags.limit(Tags.Test, 1) ) } From 8a0ed5a5ee64a6e854c516f80df5a9729435479b Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 22 Dec 2017 00:21:27 +0800 Subject: [PATCH 167/356] [SPARK-22668][SQL] Ensure no global variables in arguments of method split by CodegenContext.splitExpressions() ## What changes were proposed in this pull request? Passing global variables to the split method is dangerous, as any mutating to it is ignored and may lead to unexpected behavior. To prevent this, one approach is to make sure no expression would output global variables: Localizing lifetime of mutable states in expressions. 
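To make the hazard concrete, here is a minimal, self-contained sketch of the kind of guard this change adds (an editorial illustration, not Spark's real `CodegenContext`; only the assertion message mirrors the patch, and in the real code the check runs only under `Utils.isTesting`). In the generated Java, a split-out method receives its arguments by value, so an assignment to an argument that is actually a global field never reaches that field; tracking the global names and asserting on them fails fast instead of silently dropping the write.

```
// Illustrative sketch only -- not Spark's CodegenContext.
import scala.collection.mutable

object SplitArgumentGuardSketch {
  // Names of "global" mutable states, i.e. fields of the generated class.
  private val mutableStateNames = mutable.HashSet.empty[String]

  // Registers a global mutable state (as ctx.addMutableState does) and returns its name.
  def addMutableState(javaType: String, name: String): String = {
    mutableStateNames += name
    name
  }

  // Mimics ctx.splitExpressions: refuse to use a global as a split-function parameter,
  // because the generated helper would only see a by-value copy of it.
  def splitExpressions(arguments: Seq[(String, String)], blocks: Seq[String]): String = {
    arguments.foreach { case (_, name) =>
      assert(!mutableStateNames.contains(name),
        s"split function argument $name cannot be a global variable.")
    }
    blocks.mkString("\n") // real code would emit one helper method per block plus the calls
  }

  def main(args: Array[String]): Unit = {
    val globalIsNull = addMutableState("boolean", "globalIsNull")
    // OK: only a local name is used as the parameter.
    splitExpressions(Seq(("boolean", "localIsNull")), Seq("localIsNull = false;"))
    // Would fail fast instead of generating code whose write to the global is lost:
    // splitExpressions(Seq(("boolean", globalIsNull)), Seq(s"$globalIsNull = false;"))
  }
}
```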
Another approach is, when calling `ctx.splitExpression`, make sure we don't use children's output as parameter names. Approach 1 is actually hard to do, as we need to check all expressions and operators that support whole-stage codegen. Approach 2 is easier as the callers of `ctx.splitExpressions` are not too many. Besides, approach 2 is more flexible, as children's output may be other stuff that can't be parameter name: literal, inlined statement(a + 1), etc. close https://github.com/apache/spark/pull/19865 close https://github.com/apache/spark/pull/19938 ## How was this patch tested? existing tests Author: Wenchen Fan Closes #20021 from cloud-fan/codegen. --- .../sql/catalyst/expressions/arithmetic.scala | 18 +++++------ .../expressions/codegen/CodeGenerator.scala | 32 ++++++++++++++++--- .../expressions/conditionalExpressions.scala | 8 ++--- .../expressions/nullExpressions.scala | 9 +++--- .../sql/catalyst/expressions/predicates.scala | 2 +- 5 files changed, 43 insertions(+), 26 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index d3a8cb5804717..8bb14598a6d7b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -602,13 +602,13 @@ case class Least(children: Seq[Expression]) extends Expression { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val evalChildren = children.map(_.genCode(ctx)) - val tmpIsNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "leastTmpIsNull") + ev.isNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.isNull) val evals = evalChildren.map(eval => s""" |${eval.code} - |if (!${eval.isNull} && ($tmpIsNull || + |if (!${eval.isNull} && (${ev.isNull} || | ${ctx.genGreater(dataType, ev.value, eval.value)})) { - | $tmpIsNull = false; + | ${ev.isNull} = false; | ${ev.value} = ${eval.value}; |} """.stripMargin @@ -628,10 +628,9 @@ case class Least(children: Seq[Expression]) extends Expression { foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) ev.copy(code = s""" - |$tmpIsNull = true; + |${ev.isNull} = true; |${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; |$codes - |final boolean ${ev.isNull} = $tmpIsNull; """.stripMargin) } } @@ -682,13 +681,13 @@ case class Greatest(children: Seq[Expression]) extends Expression { override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val evalChildren = children.map(_.genCode(ctx)) - val tmpIsNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "greatestTmpIsNull") + ev.isNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.isNull) val evals = evalChildren.map(eval => s""" |${eval.code} - |if (!${eval.isNull} && ($tmpIsNull || + |if (!${eval.isNull} && (${ev.isNull} || | ${ctx.genGreater(dataType, eval.value, ev.value)})) { - | $tmpIsNull = false; + | ${ev.isNull} = false; | ${ev.value} = ${eval.value}; |} """.stripMargin @@ -708,10 +707,9 @@ case class Greatest(children: Seq[Expression]) extends Expression { foldFunctions = _.map(funcCall => s"${ev.value} = $funcCall;").mkString("\n")) ev.copy(code = s""" - |$tmpIsNull = true; + |${ev.isNull} = true; |${ctx.javaType(dataType)} ${ev.value} = ${ctx.defaultValue(dataType)}; |$codes - |final boolean ${ev.isNull} = $tmpIsNull; """.stripMargin) } } diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 41a920ba3d677..9adf632ddcde8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -128,7 +128,7 @@ class CodegenContext { * `currentVars` to null, or set `currentVars(i)` to null for certain columns, before calling * `Expression.genCode`. */ - final var INPUT_ROW = "i" + var INPUT_ROW = "i" /** * Holding a list of generated columns as input of current operator, will be used by @@ -146,22 +146,30 @@ class CodegenContext { * as a member variable * * They will be kept as member variables in generated classes like `SpecificProjection`. + * + * Exposed for tests only. */ - val inlinedMutableStates: mutable.ArrayBuffer[(String, String)] = + private[catalyst] val inlinedMutableStates: mutable.ArrayBuffer[(String, String)] = mutable.ArrayBuffer.empty[(String, String)] /** * The mapping between mutable state types and corrseponding compacted arrays. * The keys are java type string. The values are [[MutableStateArrays]] which encapsulates * the compacted arrays for the mutable states with the same java type. + * + * Exposed for tests only. */ - val arrayCompactedMutableStates: mutable.Map[String, MutableStateArrays] = + private[catalyst] val arrayCompactedMutableStates: mutable.Map[String, MutableStateArrays] = mutable.Map.empty[String, MutableStateArrays] // An array holds the code that will initialize each state - val mutableStateInitCode: mutable.ArrayBuffer[String] = + // Exposed for tests only. + private[catalyst] val mutableStateInitCode: mutable.ArrayBuffer[String] = mutable.ArrayBuffer.empty[String] + // Tracks the names of all the mutable states. + private val mutableStateNames: mutable.HashSet[String] = mutable.HashSet.empty + /** * This class holds a set of names of mutableStateArrays that is used for compacting mutable * states for a certain type, and holds the next available slot of the current compacted array. @@ -172,7 +180,11 @@ class CodegenContext { private[this] var currentIndex = 0 - private def createNewArray() = arrayNames.append(freshName("mutableStateArray")) + private def createNewArray() = { + val newArrayName = freshName("mutableStateArray") + mutableStateNames += newArrayName + arrayNames.append(newArrayName) + } def getCurrentIndex: Int = currentIndex @@ -241,6 +253,7 @@ class CodegenContext { val initCode = initFunc(varName) inlinedMutableStates += ((javaType, varName)) mutableStateInitCode += initCode + mutableStateNames += varName varName } else { val arrays = arrayCompactedMutableStates.getOrElseUpdate(javaType, new MutableStateArrays) @@ -930,6 +943,15 @@ class CodegenContext { // inline execution if only one block blocks.head } else { + if (Utils.isTesting) { + // Passing global variables to the split method is dangerous, as any mutating to it is + // ignored and may lead to unexpected behavior. 
+ arguments.foreach { case (_, name) => + assert(!mutableStateNames.contains(name), + s"split function argument $name cannot be a global variable.") + } + } + val func = freshName(funcName) val argString = arguments.map { case (t, name) => s"$t $name" }.mkString(", ") val functions = blocks.zipWithIndex.map { case (body, i) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala index 1a9b68222a7f4..142dfb02be0a8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala @@ -190,7 +190,7 @@ case class CaseWhen( // It is initialized to `NOT_MATCHED`, and if it's set to `HAS_NULL` or `HAS_NONNULL`, // We won't go on anymore on the computation. val resultState = ctx.freshName("caseWhenResultState") - val tmpResult = ctx.addMutableState(ctx.javaType(dataType), "caseWhenTmpResult") + ev.value = ctx.addMutableState(ctx.javaType(dataType), ev.value) // these blocks are meant to be inside a // do { @@ -205,7 +205,7 @@ case class CaseWhen( |if (!${cond.isNull} && ${cond.value}) { | ${res.code} | $resultState = (byte)(${res.isNull} ? $HAS_NULL : $HAS_NONNULL); - | $tmpResult = ${res.value}; + | ${ev.value} = ${res.value}; | continue; |} """.stripMargin @@ -216,7 +216,7 @@ case class CaseWhen( s""" |${res.code} |$resultState = (byte)(${res.isNull} ? $HAS_NULL : $HAS_NONNULL); - |$tmpResult = ${res.value}; + |${ev.value} = ${res.value}; """.stripMargin } @@ -264,13 +264,11 @@ case class CaseWhen( ev.copy(code = s""" |${ctx.JAVA_BYTE} $resultState = $NOT_MATCHED; - |$tmpResult = ${ctx.defaultValue(dataType)}; |do { | $codes |} while (false); |// TRUE if any condition is met and the result is null, or no any condition is met. |final boolean ${ev.isNull} = ($resultState != $HAS_NONNULL); - |final ${ctx.javaType(dataType)} ${ev.value} = $tmpResult; """.stripMargin) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala index b4f895fffda38..470d5da041ea5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala @@ -72,7 +72,7 @@ case class Coalesce(children: Seq[Expression]) extends Expression { } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val tmpIsNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, "coalesceTmpIsNull") + ev.isNull = ctx.addMutableState(ctx.JAVA_BOOLEAN, ev.isNull) // all the evals are meant to be in a do { ... 
} while (false); loop val evals = children.map { e => @@ -80,7 +80,7 @@ case class Coalesce(children: Seq[Expression]) extends Expression { s""" |${eval.code} |if (!${eval.isNull}) { - | $tmpIsNull = false; + | ${ev.isNull} = false; | ${ev.value} = ${eval.value}; | continue; |} @@ -103,7 +103,7 @@ case class Coalesce(children: Seq[Expression]) extends Expression { foldFunctions = _.map { funcCall => s""" |${ev.value} = $funcCall; - |if (!$tmpIsNull) { + |if (!${ev.isNull}) { | continue; |} """.stripMargin @@ -112,12 +112,11 @@ case class Coalesce(children: Seq[Expression]) extends Expression { ev.copy(code = s""" - |$tmpIsNull = true; + |${ev.isNull} = true; |$resultType ${ev.value} = ${ctx.defaultValue(dataType)}; |do { | $codes |} while (false); - |final boolean ${ev.isNull} = $tmpIsNull; """.stripMargin) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index ac9f56f78eb2e..f4ee3d10f3f43 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -285,7 +285,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { |${valueGen.code} |byte $tmpResult = $HAS_NULL; |if (!${valueGen.isNull}) { - | $tmpResult = 0; + | $tmpResult = $NOT_MATCHED; | $javaDataType $valueArg = ${valueGen.value}; | do { | $codes From d3a1d9527bcd6675cc45773f01d4558cf4b46b3d Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Fri, 22 Dec 2017 01:08:13 +0800 Subject: [PATCH 168/356] [SPARK-22786][SQL] only use AppStatusPlugin in history server ## What changes were proposed in this pull request? In https://github.com/apache/spark/pull/19681 we introduced a new interface called `AppStatusPlugin`, to register listeners and set up the UI for both live and history UI. However I think it's an overkill for live UI. For example, we should not register `SQLListener` if users are not using SQL functions. Previously we register the `SQLListener` and set up SQL tab when `SparkSession` is firstly created, which indicates users are going to use SQL functions. But in #19681 , we register the SQL functions during `SparkContext` creation. The same thing should apply to streaming too. I think we should keep the previous behavior, and only use this new interface for history server. To reflect this change, I also rename the new interface to `SparkHistoryUIPlugin` This PR also refines the tests for sql listener. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #19981 from cloud-fan/listener. 
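As background for readers unfamiliar with the discovery mechanism: the history server finds these plugins through `java.util.ServiceLoader`, driven by the one-line `META-INF/services` resource this patch adds for the SQL module. A rough sketch of that pattern follows (the trait and object names are illustrative stand-ins, not Spark classes):

```
// Illustrative sketch of ServiceLoader-based plugin discovery.
import java.util.ServiceLoader
import scala.collection.JavaConverters._

trait HistoryPlugin {
  def setup(): Unit
}

object PluginDiscovery {
  // ServiceLoader scans every META-INF/services/<fully qualified HistoryPlugin name>
  // resource on the classpath and instantiates the classes listed there via their
  // no-argument constructors.
  def loadAll(loader: ClassLoader): Seq[HistoryPlugin] =
    ServiceLoader.load(classOf[HistoryPlugin], loader).asScala.toSeq

  // e.g. PluginDiscovery.loadAll(Thread.currentThread().getContextClassLoader)
}
```

The live path, by contrast, skips ServiceLoader entirely: as the `SharedState` change below shows, the SQL listener and status store are constructed directly when a `SparkSession` is created.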
--- .../scala/org/apache/spark/SparkContext.scala | 16 +- .../deploy/history/FsHistoryProvider.scala | 14 +- .../spark/status/AppHistoryServerPlugin.scala | 38 +++ .../spark/status/AppStatusListener.scala | 2 +- .../apache/spark/status/AppStatusPlugin.scala | 71 ------ .../apache/spark/status/AppStatusStore.scala | 17 +- .../org/apache/spark/ui/StagePageSuite.scala | 20 +- ...apache.spark.status.AppHistoryServerPlugin | 1 + .../org.apache.spark.status.AppStatusPlugin | 1 - .../execution/ui/SQLAppStatusListener.scala | 23 +- .../sql/execution/ui/SQLAppStatusStore.scala | 62 +---- .../execution/ui/SQLHistoryServerPlugin.scala | 36 +++ .../spark/sql/internal/SharedState.scala | 18 +- .../execution/metric/SQLMetricsSuite.scala | 8 - .../metric/SQLMetricsTestUtils.scala | 10 +- ....scala => SQLAppStatusListenerSuite.scala} | 238 +++++++++--------- 16 files changed, 256 insertions(+), 319 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/status/AppHistoryServerPlugin.scala delete mode 100644 core/src/main/scala/org/apache/spark/status/AppStatusPlugin.scala create mode 100644 sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin delete mode 100644 sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppStatusPlugin create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLHistoryServerPlugin.scala rename sql/core/src/test/scala/org/apache/spark/sql/execution/ui/{SQLListenerSuite.scala => SQLAppStatusListenerSuite.scala} (71%) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 92e13ce1ba042..fcbeddd2a9ac3 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -53,7 +53,7 @@ import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, StandaloneSchedulerBackend} import org.apache.spark.scheduler.local.LocalSchedulerBackend -import org.apache.spark.status.{AppStatusPlugin, AppStatusStore} +import org.apache.spark.status.AppStatusStore import org.apache.spark.storage._ import org.apache.spark.storage.BlockManagerMessages.TriggerThreadDump import org.apache.spark.ui.{ConsoleProgressBar, SparkUI} @@ -416,7 +416,8 @@ class SparkContext(config: SparkConf) extends Logging { // Initialize the app status store and listener before SparkEnv is created so that it gets // all events. - _statusStore = AppStatusStore.createLiveStore(conf, l => listenerBus.addToStatusQueue(l)) + _statusStore = AppStatusStore.createLiveStore(conf) + listenerBus.addToStatusQueue(_statusStore.listener.get) // Create the Spark execution environment (cache, map output tracker, etc) _env = createSparkEnv(_conf, isLocal, listenerBus) @@ -445,14 +446,9 @@ class SparkContext(config: SparkConf) extends Logging { // For tests, do not enable the UI None } - _ui.foreach { ui => - // Load any plugins that might want to modify the UI. 
- AppStatusPlugin.loadPlugins().foreach(_.setupUI(ui)) - - // Bind the UI before starting the task scheduler to communicate - // the bound port to the cluster manager properly - ui.bind() - } + // Bind the UI before starting the task scheduler to communicate + // the bound port to the cluster manager properly + _ui.foreach(_.bind()) _hadoopConfiguration = SparkHadoopUtil.get.newConfiguration(_conf) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index fa2c5194aa41b..a299b79850613 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -44,7 +44,6 @@ import org.apache.spark.scheduler.ReplayListenerBus._ import org.apache.spark.status._ import org.apache.spark.status.KVUtils._ import org.apache.spark.status.api.v1.{ApplicationAttemptInfo, ApplicationInfo} -import org.apache.spark.status.config._ import org.apache.spark.ui.SparkUI import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils} import org.apache.spark.util.kvstore._ @@ -322,15 +321,18 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) (new InMemoryStore(), true) } + val plugins = ServiceLoader.load( + classOf[AppHistoryServerPlugin], Utils.getContextOrSparkClassLoader).asScala val trackingStore = new ElementTrackingStore(kvstore, conf) if (needReplay) { val replayBus = new ReplayListenerBus() val listener = new AppStatusListener(trackingStore, conf, false, lastUpdateTime = Some(attempt.info.lastUpdated.getTime())) replayBus.addListener(listener) - AppStatusPlugin.loadPlugins().foreach { plugin => - plugin.setupListeners(conf, trackingStore, l => replayBus.addListener(l), false) - } + for { + plugin <- plugins + listener <- plugin.createListeners(conf, trackingStore) + } replayBus.addListener(listener) try { val fileStatus = fs.getFileStatus(new Path(logDir, attempt.logPath)) replay(fileStatus, isApplicationCompleted(fileStatus), replayBus) @@ -353,9 +355,7 @@ private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock) HistoryServer.getAttemptURI(appId, attempt.info.attemptId), attempt.info.startTime.getTime(), attempt.info.appSparkVersion) - AppStatusPlugin.loadPlugins().foreach { plugin => - plugin.setupUI(ui) - } + plugins.foreach(_.setupUI(ui)) val loadedUI = LoadedAppUI(ui) diff --git a/core/src/main/scala/org/apache/spark/status/AppHistoryServerPlugin.scala b/core/src/main/scala/org/apache/spark/status/AppHistoryServerPlugin.scala new file mode 100644 index 0000000000000..d144a0e998fa1 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/AppHistoryServerPlugin.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import org.apache.spark.SparkConf +import org.apache.spark.scheduler.SparkListener +import org.apache.spark.ui.SparkUI + +/** + * An interface for creating history listeners(to replay event logs) defined in other modules like + * SQL, and setup the UI of the plugin to rebuild the history UI. + */ +private[spark] trait AppHistoryServerPlugin { + /** + * Creates listeners to replay the event logs. + */ + def createListeners(conf: SparkConf, store: ElementTrackingStore): Seq[SparkListener] + + /** + * Sets up UI of this plugin to rebuild the history UI. + */ + def setupUI(ui: SparkUI): Unit +} diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 87eb84d94c005..4db797e1d24c6 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -48,7 +48,7 @@ private[spark] class AppStatusListener( import config._ - private var sparkVersion = SPARK_VERSION + private val sparkVersion = SPARK_VERSION private var appInfo: v1.ApplicationInfo = null private var appSummary = new AppSummary(0, 0) private var coresPerTask: Int = 1 diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusPlugin.scala b/core/src/main/scala/org/apache/spark/status/AppStatusPlugin.scala deleted file mode 100644 index 4cada5c7b0de4..0000000000000 --- a/core/src/main/scala/org/apache/spark/status/AppStatusPlugin.scala +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.status - -import java.util.ServiceLoader - -import scala.collection.JavaConverters._ - -import org.apache.spark.SparkConf -import org.apache.spark.scheduler.SparkListener -import org.apache.spark.ui.SparkUI -import org.apache.spark.util.Utils -import org.apache.spark.util.kvstore.KVStore - -/** - * An interface that defines plugins for collecting and storing application state. - * - * The plugin implementations are invoked for both live and replayed applications. For live - * applications, it's recommended that plugins defer creation of UI tabs until there's actual - * data to be shown. - */ -private[spark] trait AppStatusPlugin { - - /** - * Install listeners to collect data about the running application and populate the given - * store. - * - * @param conf The Spark configuration. - * @param store The KVStore where to keep application data. - * @param addListenerFn Function to register listeners with a bus. 
- * @param live Whether this is a live application (or an application being replayed by the - * HistoryServer). - */ - def setupListeners( - conf: SparkConf, - store: ElementTrackingStore, - addListenerFn: SparkListener => Unit, - live: Boolean): Unit - - /** - * Install any needed extensions (tabs, pages, etc) to a Spark UI. The plugin can detect whether - * the app is live or replayed by looking at the UI's SparkContext field `sc`. - * - * @param ui The Spark UI instance for the application. - */ - def setupUI(ui: SparkUI): Unit - -} - -private[spark] object AppStatusPlugin { - - def loadPlugins(): Iterable[AppStatusPlugin] = { - ServiceLoader.load(classOf[AppStatusPlugin], Utils.getContextOrSparkClassLoader).asScala - } - -} diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 9987419b170f6..5a942f5284018 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -17,16 +17,14 @@ package org.apache.spark.status -import java.io.File -import java.util.{Arrays, List => JList} +import java.util.{List => JList} import scala.collection.JavaConverters._ import org.apache.spark.{JobExecutionStatus, SparkConf} -import org.apache.spark.scheduler.SparkListener import org.apache.spark.status.api.v1 import org.apache.spark.ui.scope._ -import org.apache.spark.util.{Distribution, Utils} +import org.apache.spark.util.Distribution import org.apache.spark.util.kvstore.{InMemoryStore, KVStore} /** @@ -34,7 +32,7 @@ import org.apache.spark.util.kvstore.{InMemoryStore, KVStore} */ private[spark] class AppStatusStore( val store: KVStore, - listener: Option[AppStatusListener] = None) { + val listener: Option[AppStatusListener] = None) { def applicationInfo(): v1.ApplicationInfo = { store.view(classOf[ApplicationInfoWrapper]).max(1).iterator().next().info @@ -346,17 +344,10 @@ private[spark] object AppStatusStore { /** * Create an in-memory store for a live application. - * - * @param conf Configuration. - * @param addListenerFn Function to register a listener with a bus. 
*/ - def createLiveStore(conf: SparkConf, addListenerFn: SparkListener => Unit): AppStatusStore = { + def createLiveStore(conf: SparkConf): AppStatusStore = { val store = new ElementTrackingStore(new InMemoryStore(), conf) val listener = new AppStatusListener(store, conf, true) - addListenerFn(listener) - AppStatusPlugin.loadPlugins().foreach { p => - p.setupListeners(conf, store, addListenerFn, true) - } new AppStatusStore(store, listener = Some(listener)) } diff --git a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala index 46932a02f1a1b..661d0d48d2f37 100644 --- a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -22,7 +22,6 @@ import javax.servlet.http.HttpServletRequest import scala.xml.Node -import org.mockito.Matchers.anyString import org.mockito.Mockito.{mock, when, RETURNS_SMART_NULLS} import org.apache.spark._ @@ -30,7 +29,6 @@ import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ import org.apache.spark.status.AppStatusStore import org.apache.spark.ui.jobs.{StagePage, StagesTab} -import org.apache.spark.util.Utils class StagePageSuite extends SparkFunSuite with LocalSparkContext { @@ -55,12 +53,12 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { * This also runs a dummy stage to populate the page with useful content. */ private def renderStagePage(conf: SparkConf): Seq[Node] = { - val bus = new ReplayListenerBus() - val store = AppStatusStore.createLiveStore(conf, l => bus.addListener(l)) + val statusStore = AppStatusStore.createLiveStore(conf) + val listener = statusStore.listener.get try { val tab = mock(classOf[StagesTab], RETURNS_SMART_NULLS) - when(tab.store).thenReturn(store) + when(tab.store).thenReturn(statusStore) val request = mock(classOf[HttpServletRequest]) when(tab.conf).thenReturn(conf) @@ -68,7 +66,7 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { when(tab.headerTabs).thenReturn(Seq.empty) when(request.getParameter("id")).thenReturn("0") when(request.getParameter("attempt")).thenReturn("0") - val page = new StagePage(tab, store) + val page = new StagePage(tab, statusStore) // Simulate a stage in job progress listener val stageInfo = new StageInfo(0, 0, "dummy", 1, Seq.empty, Seq.empty, "details") @@ -77,17 +75,17 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { taskId => val taskInfo = new TaskInfo(taskId, taskId, 0, 0, "0", "localhost", TaskLocality.ANY, false) - bus.postToAll(SparkListenerStageSubmitted(stageInfo)) - bus.postToAll(SparkListenerTaskStart(0, 0, taskInfo)) + listener.onStageSubmitted(SparkListenerStageSubmitted(stageInfo)) + listener.onTaskStart(SparkListenerTaskStart(0, 0, taskInfo)) taskInfo.markFinished(TaskState.FINISHED, System.currentTimeMillis()) val taskMetrics = TaskMetrics.empty taskMetrics.incPeakExecutionMemory(peakExecutionMemory) - bus.postToAll(SparkListenerTaskEnd(0, 0, "result", Success, taskInfo, taskMetrics)) + listener.onTaskEnd(SparkListenerTaskEnd(0, 0, "result", Success, taskInfo, taskMetrics)) } - bus.postToAll(SparkListenerStageCompleted(stageInfo)) + listener.onStageCompleted(SparkListenerStageCompleted(stageInfo)) page.render(request) } finally { - store.close() + statusStore.close() } } diff --git a/sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin 
b/sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin new file mode 100644 index 0000000000000..0bba2f88b92a5 --- /dev/null +++ b/sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppHistoryServerPlugin @@ -0,0 +1 @@ +org.apache.spark.sql.execution.ui.SQLHistoryServerPlugin diff --git a/sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppStatusPlugin b/sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppStatusPlugin deleted file mode 100644 index ac6d7f6962f85..0000000000000 --- a/sql/core/src/main/resources/META-INF/services/org.apache.spark.status.AppStatusPlugin +++ /dev/null @@ -1 +0,0 @@ -org.apache.spark.sql.execution.ui.SQLAppStatusPlugin diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index cf0000c6393a3..aa78fa015dbef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -30,15 +30,11 @@ import org.apache.spark.sql.execution.metric._ import org.apache.spark.sql.internal.StaticSQLConf._ import org.apache.spark.status.{ElementTrackingStore, KVUtils, LiveEntity} import org.apache.spark.status.config._ -import org.apache.spark.ui.SparkUI -import org.apache.spark.util.kvstore.KVStore -private[sql] class SQLAppStatusListener( +class SQLAppStatusListener( conf: SparkConf, kvstore: ElementTrackingStore, - live: Boolean, - ui: Option[SparkUI] = None) - extends SparkListener with Logging { + live: Boolean) extends SparkListener with Logging { // How often to flush intermediate state of a live execution to the store. When replaying logs, // never flush (only do the very last write). @@ -50,7 +46,10 @@ private[sql] class SQLAppStatusListener( private val liveExecutions = new ConcurrentHashMap[Long, LiveExecutionData]() private val stageMetrics = new ConcurrentHashMap[Int, LiveStageMetrics]() - private var uiInitialized = false + // Returns true if this listener has no live data. Exposed for tests only. + private[sql] def noLiveData(): Boolean = { + liveExecutions.isEmpty && stageMetrics.isEmpty + } kvstore.addTrigger(classOf[SQLExecutionUIData], conf.get(UI_RETAINED_EXECUTIONS)) { count => cleanupExecutions(count) @@ -230,14 +229,6 @@ private[sql] class SQLAppStatusListener( } private def onExecutionStart(event: SparkListenerSQLExecutionStart): Unit = { - // Install the SQL tab in a live app if it hasn't been initialized yet. 
- if (!uiInitialized) { - ui.foreach { _ui => - new SQLTab(new SQLAppStatusStore(kvstore, Some(this)), _ui) - } - uiInitialized = true - } - val SparkListenerSQLExecutionStart(executionId, description, details, physicalPlanDescription, sparkPlanInfo, time) = event @@ -389,7 +380,7 @@ private class LiveStageMetrics( val accumulatorIds: Array[Long], val taskMetrics: ConcurrentHashMap[Long, LiveTaskMetrics]) -private[sql] class LiveTaskMetrics( +private class LiveTaskMetrics( val ids: Array[Long], val values: Array[Long], val succeeded: Boolean) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala index 7fd5f7395cdf3..910f2e52fdbb3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala @@ -25,21 +25,17 @@ import scala.collection.mutable.ArrayBuffer import com.fasterxml.jackson.databind.annotation.JsonDeserialize -import org.apache.spark.{JobExecutionStatus, SparkConf} -import org.apache.spark.scheduler.SparkListener -import org.apache.spark.status.{AppStatusPlugin, ElementTrackingStore} +import org.apache.spark.JobExecutionStatus import org.apache.spark.status.KVUtils.KVIndexParam -import org.apache.spark.ui.SparkUI -import org.apache.spark.util.Utils import org.apache.spark.util.kvstore.KVStore /** * Provides a view of a KVStore with methods that make it easy to query SQL-specific state. There's * no state kept in this class, so it's ok to have multiple instances of it in an application. */ -private[sql] class SQLAppStatusStore( +class SQLAppStatusStore( store: KVStore, - listener: Option[SQLAppStatusListener] = None) { + val listener: Option[SQLAppStatusListener] = None) { def executionsList(): Seq[SQLExecutionUIData] = { store.view(classOf[SQLExecutionUIData]).asScala.toSeq @@ -74,48 +70,9 @@ private[sql] class SQLAppStatusStore( def planGraph(executionId: Long): SparkPlanGraph = { store.read(classOf[SparkPlanGraphWrapper], executionId).toSparkPlanGraph() } - -} - -/** - * An AppStatusPlugin for handling the SQL UI and listeners. - */ -private[sql] class SQLAppStatusPlugin extends AppStatusPlugin { - - override def setupListeners( - conf: SparkConf, - store: ElementTrackingStore, - addListenerFn: SparkListener => Unit, - live: Boolean): Unit = { - // For live applications, the listener is installed in [[setupUI]]. This also avoids adding - // the listener when the UI is disabled. Force installation during testing, though. - if (!live || Utils.isTesting) { - val listener = new SQLAppStatusListener(conf, store, live, None) - addListenerFn(listener) - } - } - - override def setupUI(ui: SparkUI): Unit = { - ui.sc match { - case Some(sc) => - // If this is a live application, then install a listener that will enable the SQL - // tab as soon as there's a SQL event posted to the bus. - val listener = new SQLAppStatusListener(sc.conf, - ui.store.store.asInstanceOf[ElementTrackingStore], true, Some(ui)) - sc.listenerBus.addToStatusQueue(listener) - - case _ => - // For a replayed application, only add the tab if the store already contains SQL data. 
- val sqlStore = new SQLAppStatusStore(ui.store.store) - if (sqlStore.executionsCount() > 0) { - new SQLTab(sqlStore, ui) - } - } - } - } -private[sql] class SQLExecutionUIData( +class SQLExecutionUIData( @KVIndexParam val executionId: Long, val description: String, val details: String, @@ -133,10 +90,9 @@ private[sql] class SQLExecutionUIData( * from the SQL listener instance. */ @JsonDeserialize(keyAs = classOf[JLong]) - val metricValues: Map[Long, String] - ) + val metricValues: Map[Long, String]) -private[sql] class SparkPlanGraphWrapper( +class SparkPlanGraphWrapper( @KVIndexParam val executionId: Long, val nodes: Seq[SparkPlanGraphNodeWrapper], val edges: Seq[SparkPlanGraphEdge]) { @@ -147,7 +103,7 @@ private[sql] class SparkPlanGraphWrapper( } -private[sql] class SparkPlanGraphClusterWrapper( +class SparkPlanGraphClusterWrapper( val id: Long, val name: String, val desc: String, @@ -163,7 +119,7 @@ private[sql] class SparkPlanGraphClusterWrapper( } /** Only one of the values should be set. */ -private[sql] class SparkPlanGraphNodeWrapper( +class SparkPlanGraphNodeWrapper( val node: SparkPlanGraphNode, val cluster: SparkPlanGraphClusterWrapper) { @@ -174,7 +130,7 @@ private[sql] class SparkPlanGraphNodeWrapper( } -private[sql] case class SQLPlanMetric( +case class SQLPlanMetric( name: String, accumulatorId: Long, metricType: String) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLHistoryServerPlugin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLHistoryServerPlugin.scala new file mode 100644 index 0000000000000..522d0cf79bffa --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLHistoryServerPlugin.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.ui + +import org.apache.spark.SparkConf +import org.apache.spark.scheduler.SparkListener +import org.apache.spark.status.{AppHistoryServerPlugin, ElementTrackingStore} +import org.apache.spark.ui.SparkUI + +class SQLHistoryServerPlugin extends AppHistoryServerPlugin { + override def createListeners(conf: SparkConf, store: ElementTrackingStore): Seq[SparkListener] = { + Seq(new SQLAppStatusListener(conf, store, live = false)) + } + + override def setupUI(ui: SparkUI): Unit = { + val sqlStatusStore = new SQLAppStatusStore(ui.store.store) + if (sqlStatusStore.executionsCount() > 0) { + new SQLTab(sqlStatusStore, ui) + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala index 3e479faed72ac..baea4ceebf8e3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SharedState.scala @@ -28,11 +28,12 @@ import org.apache.hadoop.fs.FsUrlStreamHandlerFactory import org.apache.spark.{SparkConf, SparkContext, SparkException} import org.apache.spark.internal.Logging -import org.apache.spark.scheduler.LiveListenerBus -import org.apache.spark.sql.{SparkSession, SQLContext} +import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.execution.CacheManager +import org.apache.spark.sql.execution.ui.{SQLAppStatusListener, SQLAppStatusStore, SQLTab} import org.apache.spark.sql.internal.StaticSQLConf._ +import org.apache.spark.status.ElementTrackingStore import org.apache.spark.util.{MutableURLClassLoader, Utils} @@ -82,6 +83,19 @@ private[sql] class SharedState(val sparkContext: SparkContext) extends Logging { */ val cacheManager: CacheManager = new CacheManager + /** + * A status store to query SQL status/metrics of this Spark application, based on SQL-specific + * [[org.apache.spark.scheduler.SparkListenerEvent]]s. + */ + val statusStore: SQLAppStatusStore = { + val kvStore = sparkContext.statusStore.store.asInstanceOf[ElementTrackingStore] + val listener = new SQLAppStatusListener(sparkContext.conf, kvStore, live = true) + sparkContext.listenerBus.addToStatusQueue(listener) + val statusStore = new SQLAppStatusStore(kvStore, Some(listener)) + sparkContext.ui.foreach(new SQLTab(statusStore, _)) + statusStore + } + /** * A catalog that interacts with external systems. */ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index d588af3e19dde..fc3483379c817 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -33,14 +33,6 @@ import org.apache.spark.util.{AccumulatorContext, JsonProtocol} class SQLMetricsSuite extends SparkFunSuite with SQLMetricsTestUtils with SharedSQLContext { import testImplicits._ - private def statusStore: SQLAppStatusStore = { - new SQLAppStatusStore(sparkContext.statusStore.store) - } - - private def currentExecutionIds(): Set[Long] = { - statusStore.executionsList.map(_.executionId).toSet - } - /** * Generates a `DataFrame` by filling randomly generated bytes for hash collision. 
*/ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala index d89c4b14619fa..122d28798136f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsTestUtils.scala @@ -31,17 +31,14 @@ import org.apache.spark.util.Utils trait SQLMetricsTestUtils extends SQLTestUtils { - import testImplicits._ - private def statusStore: SQLAppStatusStore = { - new SQLAppStatusStore(sparkContext.statusStore.store) - } - - private def currentExecutionIds(): Set[Long] = { + protected def currentExecutionIds(): Set[Long] = { statusStore.executionsList.map(_.executionId).toSet } + protected def statusStore: SQLAppStatusStore = spark.sharedState.statusStore + /** * Get execution metrics for the SQL execution and verify metrics values. * @@ -57,7 +54,6 @@ trait SQLMetricsTestUtils extends SQLTestUtils { assert(executionIds.size == 1) val executionId = executionIds.head - val executionData = statusStore.execution(executionId).get val executedNode = statusStore.planGraph(executionId).nodes.head val metricsNames = Seq( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala similarity index 71% rename from sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala index 932950687942c..5ebbeb4a7cb40 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala @@ -41,7 +41,8 @@ import org.apache.spark.status.config._ import org.apache.spark.util.{AccumulatorMetadata, JsonProtocol, LongAccumulator} import org.apache.spark.util.kvstore.InMemoryStore -class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTestUtils { + +class SQLAppStatusListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTestUtils { import testImplicits._ override protected def sparkConf = { @@ -61,21 +62,21 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest properties } - private def createStageInfo(stageId: Int, attemptId: Int): StageInfo = new StageInfo( - stageId = stageId, - attemptId = attemptId, - // The following fields are not used in tests - name = "", - numTasks = 0, - rddInfos = Nil, - parentIds = Nil, - details = "" - ) + private def createStageInfo(stageId: Int, attemptId: Int): StageInfo = { + new StageInfo(stageId = stageId, + attemptId = attemptId, + // The following fields are not used in tests + name = "", + numTasks = 0, + rddInfos = Nil, + parentIds = Nil, + details = "") + } private def createTaskInfo( taskId: Int, attemptNumber: Int, - accums: Map[Long, Long] = Map()): TaskInfo = { + accums: Map[Long, Long] = Map.empty): TaskInfo = { val info = new TaskInfo( taskId = taskId, attemptNumber = attemptNumber, @@ -99,29 +100,37 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest }.toSeq } - /** Return the shared SQL store from the active SparkSession. 
*/ - private def statusStore: SQLAppStatusStore = - new SQLAppStatusStore(spark.sparkContext.statusStore.store) - - /** - * Runs a test with a temporary SQLAppStatusStore tied to a listener bus. Events can be sent to - * the listener bus to update the store, and all data will be cleaned up at the end of the test. - */ - private def sqlStoreTest(name: String) - (fn: (SQLAppStatusStore, SparkListenerBus) => Unit): Unit = { - test(name) { - val conf = sparkConf - val store = new ElementTrackingStore(new InMemoryStore(), conf) - val bus = new ReplayListenerBus() - val listener = new SQLAppStatusListener(conf, store, true) - bus.addListener(listener) - store.close(false) - val sqlStore = new SQLAppStatusStore(store, Some(listener)) - fn(sqlStore, bus) + private def assertJobs( + exec: Option[SQLExecutionUIData], + running: Seq[Int] = Nil, + completed: Seq[Int] = Nil, + failed: Seq[Int] = Nil): Unit = { + val actualRunning = new ListBuffer[Int]() + val actualCompleted = new ListBuffer[Int]() + val actualFailed = new ListBuffer[Int]() + + exec.get.jobs.foreach { case (jobId, jobStatus) => + jobStatus match { + case JobExecutionStatus.RUNNING => actualRunning += jobId + case JobExecutionStatus.SUCCEEDED => actualCompleted += jobId + case JobExecutionStatus.FAILED => actualFailed += jobId + case _ => fail(s"Unexpected status $jobStatus") + } } + + assert(actualRunning.sorted === running) + assert(actualCompleted.sorted === completed) + assert(actualFailed.sorted === failed) } - sqlStoreTest("basic") { (store, bus) => + private def createStatusStore(): SQLAppStatusStore = { + val conf = sparkContext.conf + val store = new ElementTrackingStore(new InMemoryStore, conf) + val listener = new SQLAppStatusListener(conf, store, live = true) + new SQLAppStatusStore(store, Some(listener)) + } + + test("basic") { def checkAnswer(actual: Map[Long, String], expected: Map[Long, Long]): Unit = { assert(actual.size == expected.size) expected.foreach { case (id, value) => @@ -135,6 +144,9 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest } } + val statusStore = createStatusStore() + val listener = statusStore.listener.get + val executionId = 0 val df = createTestDataFrame val accumulatorIds = @@ -147,7 +159,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest (id, accumulatorValue) }.toMap - bus.postToAll(SparkListenerSQLExecutionStart( + listener.onOtherEvent(SparkListenerSQLExecutionStart( executionId, "test", "test", @@ -155,7 +167,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan), System.currentTimeMillis())) - bus.postToAll(SparkListenerJobStart( + listener.onJobStart(SparkListenerJobStart( jobId = 0, time = System.currentTimeMillis(), stageInfos = Seq( @@ -163,45 +175,45 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest createStageInfo(1, 0) ), createProperties(executionId))) - bus.postToAll(SparkListenerStageSubmitted(createStageInfo(0, 0))) + listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(0, 0))) - assert(store.executionMetrics(0).isEmpty) + assert(statusStore.executionMetrics(executionId).isEmpty) - bus.postToAll(SparkListenerExecutorMetricsUpdate("", Seq( + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( // (task id, stage id, stage attempt, accum updates) (0L, 0, 0, createAccumulatorInfos(accumulatorUpdates)), (1L, 0, 0, 
createAccumulatorInfos(accumulatorUpdates)) ))) - checkAnswer(store.executionMetrics(0), accumulatorUpdates.mapValues(_ * 2)) + checkAnswer(statusStore.executionMetrics(executionId), accumulatorUpdates.mapValues(_ * 2)) // Driver accumulator updates don't belong to this execution should be filtered and no // exception will be thrown. - bus.postToAll(SparkListenerDriverAccumUpdates(0, Seq((999L, 2L)))) + listener.onOtherEvent(SparkListenerDriverAccumUpdates(0, Seq((999L, 2L)))) - checkAnswer(store.executionMetrics(0), accumulatorUpdates.mapValues(_ * 2)) + checkAnswer(statusStore.executionMetrics(executionId), accumulatorUpdates.mapValues(_ * 2)) - bus.postToAll(SparkListenerExecutorMetricsUpdate("", Seq( + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( // (task id, stage id, stage attempt, accum updates) (0L, 0, 0, createAccumulatorInfos(accumulatorUpdates)), (1L, 0, 0, createAccumulatorInfos(accumulatorUpdates.mapValues(_ * 2))) ))) - checkAnswer(store.executionMetrics(0), accumulatorUpdates.mapValues(_ * 3)) + checkAnswer(statusStore.executionMetrics(executionId), accumulatorUpdates.mapValues(_ * 3)) // Retrying a stage should reset the metrics - bus.postToAll(SparkListenerStageSubmitted(createStageInfo(0, 1))) + listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(0, 1))) - bus.postToAll(SparkListenerExecutorMetricsUpdate("", Seq( + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( // (task id, stage id, stage attempt, accum updates) (0L, 0, 1, createAccumulatorInfos(accumulatorUpdates)), (1L, 0, 1, createAccumulatorInfos(accumulatorUpdates)) ))) - checkAnswer(store.executionMetrics(0), accumulatorUpdates.mapValues(_ * 2)) + checkAnswer(statusStore.executionMetrics(executionId), accumulatorUpdates.mapValues(_ * 2)) // Ignore the task end for the first attempt - bus.postToAll(SparkListenerTaskEnd( + listener.onTaskEnd(SparkListenerTaskEnd( stageId = 0, stageAttemptId = 0, taskType = "", @@ -209,17 +221,17 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest createTaskInfo(0, 0, accums = accumulatorUpdates.mapValues(_ * 100)), null)) - checkAnswer(store.executionMetrics(0), accumulatorUpdates.mapValues(_ * 2)) + checkAnswer(statusStore.executionMetrics(executionId), accumulatorUpdates.mapValues(_ * 2)) // Finish two tasks - bus.postToAll(SparkListenerTaskEnd( + listener.onTaskEnd(SparkListenerTaskEnd( stageId = 0, stageAttemptId = 1, taskType = "", reason = null, createTaskInfo(0, 0, accums = accumulatorUpdates.mapValues(_ * 2)), null)) - bus.postToAll(SparkListenerTaskEnd( + listener.onTaskEnd(SparkListenerTaskEnd( stageId = 0, stageAttemptId = 1, taskType = "", @@ -227,28 +239,28 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest createTaskInfo(1, 0, accums = accumulatorUpdates.mapValues(_ * 3)), null)) - checkAnswer(store.executionMetrics(0), accumulatorUpdates.mapValues(_ * 5)) + checkAnswer(statusStore.executionMetrics(executionId), accumulatorUpdates.mapValues(_ * 5)) // Summit a new stage - bus.postToAll(SparkListenerStageSubmitted(createStageInfo(1, 0))) + listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(1, 0))) - bus.postToAll(SparkListenerExecutorMetricsUpdate("", Seq( + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( // (task id, stage id, stage attempt, accum updates) (0L, 1, 0, createAccumulatorInfos(accumulatorUpdates)), (1L, 1, 0, createAccumulatorInfos(accumulatorUpdates)) ))) - 
checkAnswer(store.executionMetrics(0), accumulatorUpdates.mapValues(_ * 7)) + checkAnswer(statusStore.executionMetrics(executionId), accumulatorUpdates.mapValues(_ * 7)) // Finish two tasks - bus.postToAll(SparkListenerTaskEnd( + listener.onTaskEnd(SparkListenerTaskEnd( stageId = 1, stageAttemptId = 0, taskType = "", reason = null, createTaskInfo(0, 0, accums = accumulatorUpdates.mapValues(_ * 3)), null)) - bus.postToAll(SparkListenerTaskEnd( + listener.onTaskEnd(SparkListenerTaskEnd( stageId = 1, stageAttemptId = 0, taskType = "", @@ -256,127 +268,137 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest createTaskInfo(1, 0, accums = accumulatorUpdates.mapValues(_ * 3)), null)) - checkAnswer(store.executionMetrics(0), accumulatorUpdates.mapValues(_ * 11)) + checkAnswer(statusStore.executionMetrics(executionId), accumulatorUpdates.mapValues(_ * 11)) - assertJobs(store.execution(0), running = Seq(0)) + assertJobs(statusStore.execution(executionId), running = Seq(0)) - bus.postToAll(SparkListenerJobEnd( + listener.onJobEnd(SparkListenerJobEnd( jobId = 0, time = System.currentTimeMillis(), JobSucceeded )) - bus.postToAll(SparkListenerSQLExecutionEnd( + listener.onOtherEvent(SparkListenerSQLExecutionEnd( executionId, System.currentTimeMillis())) - assertJobs(store.execution(0), completed = Seq(0)) - checkAnswer(store.executionMetrics(0), accumulatorUpdates.mapValues(_ * 11)) + assertJobs(statusStore.execution(executionId), completed = Seq(0)) + + checkAnswer(statusStore.executionMetrics(executionId), accumulatorUpdates.mapValues(_ * 11)) } - sqlStoreTest("onExecutionEnd happens before onJobEnd(JobSucceeded)") { (store, bus) => + test("onExecutionEnd happens before onJobEnd(JobSucceeded)") { + val statusStore = createStatusStore() + val listener = statusStore.listener.get + val executionId = 0 val df = createTestDataFrame - bus.postToAll(SparkListenerSQLExecutionStart( + listener.onOtherEvent(SparkListenerSQLExecutionStart( executionId, "test", "test", df.queryExecution.toString, SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan), System.currentTimeMillis())) - bus.postToAll(SparkListenerJobStart( + listener.onJobStart(SparkListenerJobStart( jobId = 0, time = System.currentTimeMillis(), stageInfos = Nil, createProperties(executionId))) - bus.postToAll(SparkListenerSQLExecutionEnd( + listener.onOtherEvent(SparkListenerSQLExecutionEnd( executionId, System.currentTimeMillis())) - bus.postToAll(SparkListenerJobEnd( + listener.onJobEnd(SparkListenerJobEnd( jobId = 0, time = System.currentTimeMillis(), JobSucceeded )) - assertJobs(store.execution(0), completed = Seq(0)) + assertJobs(statusStore.execution(executionId), completed = Seq(0)) } - sqlStoreTest("onExecutionEnd happens before multiple onJobEnd(JobSucceeded)s") { (store, bus) => + test("onExecutionEnd happens before multiple onJobEnd(JobSucceeded)s") { + val statusStore = createStatusStore() + val listener = statusStore.listener.get + val executionId = 0 val df = createTestDataFrame - bus.postToAll(SparkListenerSQLExecutionStart( + listener.onOtherEvent(SparkListenerSQLExecutionStart( executionId, "test", "test", df.queryExecution.toString, SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan), System.currentTimeMillis())) - bus.postToAll(SparkListenerJobStart( + listener.onJobStart(SparkListenerJobStart( jobId = 0, time = System.currentTimeMillis(), stageInfos = Nil, createProperties(executionId))) - bus.postToAll(SparkListenerJobEnd( + listener.onJobEnd(SparkListenerJobEnd( jobId = 0, time = 
System.currentTimeMillis(), JobSucceeded )) - bus.postToAll(SparkListenerJobStart( + listener.onJobStart(SparkListenerJobStart( jobId = 1, time = System.currentTimeMillis(), stageInfos = Nil, createProperties(executionId))) - bus.postToAll(SparkListenerSQLExecutionEnd( + listener.onOtherEvent(SparkListenerSQLExecutionEnd( executionId, System.currentTimeMillis())) - bus.postToAll(SparkListenerJobEnd( + listener.onJobEnd(SparkListenerJobEnd( jobId = 1, time = System.currentTimeMillis(), JobSucceeded )) - assertJobs(store.execution(0), completed = Seq(0, 1)) + assertJobs(statusStore.execution(executionId), completed = Seq(0, 1)) } - sqlStoreTest("onExecutionEnd happens before onJobEnd(JobFailed)") { (store, bus) => + test("onExecutionEnd happens before onJobEnd(JobFailed)") { + val statusStore = createStatusStore() + val listener = statusStore.listener.get + val executionId = 0 val df = createTestDataFrame - bus.postToAll(SparkListenerSQLExecutionStart( + listener.onOtherEvent(SparkListenerSQLExecutionStart( executionId, "test", "test", df.queryExecution.toString, SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan), System.currentTimeMillis())) - bus.postToAll(SparkListenerJobStart( + listener.onJobStart(SparkListenerJobStart( jobId = 0, time = System.currentTimeMillis(), stageInfos = Seq.empty, createProperties(executionId))) - bus.postToAll(SparkListenerSQLExecutionEnd( + listener.onOtherEvent(SparkListenerSQLExecutionEnd( executionId, System.currentTimeMillis())) - bus.postToAll(SparkListenerJobEnd( + listener.onJobEnd(SparkListenerJobEnd( jobId = 0, time = System.currentTimeMillis(), JobFailed(new RuntimeException("Oops")) )) - assertJobs(store.execution(0), failed = Seq(0)) + assertJobs(statusStore.execution(executionId), failed = Seq(0)) } test("SPARK-11126: no memory leak when running non SQL jobs") { - val previousStageNumber = statusStore.executionsList().size + val listener = spark.sharedState.statusStore.listener.get + // At the beginning of this test case, there should be no live data in the listener. + assert(listener.noLiveData()) spark.sparkContext.parallelize(1 to 10).foreach(i => ()) spark.sparkContext.listenerBus.waitUntilEmpty(10000) - // listener should ignore the non SQL stage - assert(statusStore.executionsList().size == previousStageNumber) - - spark.sparkContext.parallelize(1 to 10).toDF().foreach(i => ()) - spark.sparkContext.listenerBus.waitUntilEmpty(10000) - // listener should save the SQL stage - assert(statusStore.executionsList().size == previousStageNumber + 1) + // Listener should ignore the non-SQL stages, as the stage data are only removed when SQL + // execution ends, which will not be triggered for non-SQL jobs. + assert(listener.noLiveData()) } test("driver side SQL metrics") { + val statusStore = spark.sharedState.statusStore val oldCount = statusStore.executionsList().size - val expectedAccumValue = 12345L + + val expectedAccumValue = 12345 val physicalPlan = MyPlan(sqlContext.sparkContext, expectedAccumValue) val dummyQueryExecution = new QueryExecution(spark, LocalRelation()) { override lazy val sparkPlan = physicalPlan @@ -387,7 +409,8 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest physicalPlan.execute().collect() } - while (statusStore.executionsList().size < oldCount) { + // Wait until the new execution is started and being tracked. 
+ while (statusStore.executionsCount() < oldCount) { Thread.sleep(100) } @@ -405,30 +428,6 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext with JsonTest assert(metrics(driverMetric.id) === expectedValue) } - private def assertJobs( - exec: Option[SQLExecutionUIData], - running: Seq[Int] = Nil, - completed: Seq[Int] = Nil, - failed: Seq[Int] = Nil): Unit = { - - val actualRunning = new ListBuffer[Int]() - val actualCompleted = new ListBuffer[Int]() - val actualFailed = new ListBuffer[Int]() - - exec.get.jobs.foreach { case (jobId, jobStatus) => - jobStatus match { - case JobExecutionStatus.RUNNING => actualRunning += jobId - case JobExecutionStatus.SUCCEEDED => actualCompleted += jobId - case JobExecutionStatus.FAILED => actualFailed += jobId - case _ => fail(s"Unexpected status $jobStatus") - } - } - - assert(actualRunning.toSeq.sorted === running) - assert(actualCompleted.toSeq.sorted === completed) - assert(actualFailed.toSeq.sorted === failed) - } - test("roundtripping SparkListenerDriverAccumUpdates through JsonProtocol (SPARK-18462)") { val event = SparkListenerDriverAccumUpdates(1L, Seq((2L, 3L))) val json = JsonProtocol.sparkEventToJson(event) @@ -494,7 +493,7 @@ private case class MyPlan(sc: SparkContext, expectedValue: Long) extends LeafExe } -class SQLListenerMemoryLeakSuite extends SparkFunSuite { +class SQLAppStatusListenerMemoryLeakSuite extends SparkFunSuite { test("no memory leak") { val conf = new SparkConf() @@ -522,9 +521,10 @@ class SQLListenerMemoryLeakSuite extends SparkFunSuite { } } sc.listenerBus.waitUntilEmpty(10000) - - val statusStore = new SQLAppStatusStore(sc.statusStore.store) - assert(statusStore.executionsList().size <= 50) + val statusStore = spark.sharedState.statusStore + assert(statusStore.executionsCount() <= 50) + // No live data should be left behind after all executions end. + assert(statusStore.listener.get.noLiveData()) } } } From 4e107fdb7463a67d9c77c4a3434dfe70c72982f4 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Thu, 21 Dec 2017 09:18:27 -0800 Subject: [PATCH 169/356] [SPARK-22822][TEST] Basic tests for WindowFrameCoercion and DecimalPrecision ## What changes were proposed in this pull request? Test Coverage for `WindowFrameCoercion` and `DecimalPrecision`, this is a Sub-tasks for [SPARK-22722](https://issues.apache.org/jira/browse/SPARK-22722). ## How was this patch tested? N/A Author: Yuming Wang Closes #20008 from wangyum/SPARK-22822. 
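
For context, the bulk of the new decimalPrecision.sql queries below exercise the binary-arithmetic promotion performed by the `DecimalPrecision` rule. The following is a minimal sketch, assuming the widening documented for `+`/`-` (integral types are read as decimal(3,0)/decimal(5,0)/decimal(10,0)/decimal(20,0) for tinyint/smallint/int/bigint, and the result is decimal(max(s1,s2) + max(p1-s1, p2-s2) + 1, max(s1,s2))); the `Dec`/`addResultType` names are illustrative only, not Spark APIs:

```
// Sketch of the precision/scale widening that the decimalPrecision.sql
// addition/subtraction queries exercise; not Spark's implementation.
object DecimalAddWidening {
  final case class Dec(precision: Int, scale: Int)

  // Result type of dec(p1,s1) + dec(p2,s2) under the assumed rule:
  // scale = max(s1,s2), precision = max(p1-s1, p2-s2) + scale + 1.
  def addResultType(a: Dec, b: Dec): Dec = {
    val scale = math.max(a.scale, b.scale)
    val integralDigits = math.max(a.precision - a.scale, b.precision - b.scale)
    Dec(integralDigits + scale + 1, scale)
  }

  def main(args: Array[String]): Unit = {
    // cast(1 as tinyint) + cast(1 as decimal(3, 0)): tinyint is treated as decimal(3,0)
    println(addResultType(Dec(3, 0), Dec(3, 0)))   // Dec(4,0)
    // cast(1 as bigint) + cast(1 as decimal(20, 0)): bigint is treated as decimal(20,0)
    println(addResultType(Dec(20, 0), Dec(20, 0))) // Dec(21,0)
  }
}
```

So, for example, `cast(1 as tinyint) + cast(1 as decimal(3, 0))` in the queries below is widened to decimal(4, 0) under this rule; the generated .sql.out files record the corresponding result schemas.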
--- .../expressions/windowExpressions.scala | 4 +- .../typeCoercion/native/decimalPrecision.sql | 1448 +++ .../native/windowFrameCoercion.sql | 44 + .../native/decimalPrecision.sql.out | 9514 +++++++++++++++++ .../native/windowFrameCoercion.sql.out | 206 + .../sql-tests/results/window.sql.out | 2 +- 6 files changed, 11215 insertions(+), 3 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalPrecision.sql create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/windowFrameCoercion.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala index e11e3a105f597..220cc4f885d7d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala @@ -251,8 +251,8 @@ case class SpecifiedWindowFrame( TypeCheckFailure(s"Window frame $location bound '$e' is not a literal.") case e: Expression if !frameType.inputType.acceptsType(e.dataType) => TypeCheckFailure( - s"The data type of the $location bound '${e.dataType} does not match " + - s"the expected data type '${frameType.inputType}'.") + s"The data type of the $location bound '${e.dataType}' does not match " + + s"the expected data type '${frameType.inputType.simpleString}'.") case _ => TypeCheckSuccess } diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalPrecision.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalPrecision.sql new file mode 100644 index 0000000000000..8b04864b18ce3 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalPrecision.sql @@ -0,0 +1,1448 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. 
+-- + +CREATE TEMPORARY VIEW t AS SELECT 1; + +SELECT cast(1 as tinyint) + cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) + cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) + cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) + cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) + cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) + cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) + cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) + cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) + cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) + cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) + cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) + cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) + cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) + cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) + cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) + cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) + cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) + cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) + cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) + cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) + cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast(1 as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast(1 as bigint) FROM t; +SELECT cast(1 
as decimal(5, 0)) + cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as decimal(3, 0)) + cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) + cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) + cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) + cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) - cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) - cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) - cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) - cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) - cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) - cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) - cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) - cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) - cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) - cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) - cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) - cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) - cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) - cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) - cast(1 as decimal(10, 0)) FROM t; +SELECT 
cast(1 as float) - cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) - cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) - cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) - cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) - cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) - cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) - cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast(1 as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) - 
cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as decimal(3, 0)) - cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) - cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) - cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) - cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) * cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) * cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) * cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) * cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) * cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) * cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) * cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) * cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) * cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) * cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) * cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) * cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) * cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) * cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) * cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) * cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) * cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) * cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) * cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) * cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) * cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017*12*11 09:30:00' as date) * 
cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast(1 as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as decimal(3, 0)) * cast('2017*12*11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) * cast('2017*12*11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) * cast('2017*12*11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) * cast('2017*12*11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) / cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) / 
cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) / cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) / cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) / cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) / cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) / cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) / cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) / cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) / cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) / cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) / cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) / cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) / cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) / cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) / cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) / cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) / cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) / cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) / cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) / cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast(1 as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 
0)) / cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as decimal(3, 0)) / cast('2017/12/11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) / cast('2017/12/11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) / cast('2017/12/11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) / cast('2017/12/11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) % cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) % cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) % cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) % cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) % cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) % cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) % cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) % cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) % cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) % cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) % cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) % cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) % cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) % cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) % cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) % cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) % 
cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) % cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) % cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) % cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) % cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast(1 as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast('1' as 
binary) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as decimal(3, 0)) % cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) % cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) % cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) % cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT pmod(cast(1 as tinyint), cast(1 as decimal(3, 0))) FROM t; +SELECT pmod(cast(1 as tinyint), cast(1 as decimal(5, 0))) FROM t; +SELECT pmod(cast(1 as tinyint), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast(1 as tinyint), cast(1 as decimal(20, 0))) FROM t; + +SELECT pmod(cast(1 as smallint), cast(1 as decimal(3, 0))) FROM t; +SELECT pmod(cast(1 as smallint), cast(1 as decimal(5, 0))) FROM t; +SELECT pmod(cast(1 as smallint), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast(1 as smallint), cast(1 as decimal(20, 0))) FROM t; + +SELECT pmod(cast(1 as int), cast(1 as decimal(3, 0))) FROM t; +SELECT pmod(cast(1 as int), cast(1 as decimal(5, 0))) FROM t; +SELECT pmod(cast(1 as int), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast(1 as int), cast(1 as decimal(20, 0))) FROM t; + +SELECT pmod(cast(1 as bigint), cast(1 as decimal(3, 0))) FROM t; +SELECT pmod(cast(1 as bigint), cast(1 as decimal(5, 0))) FROM t; +SELECT pmod(cast(1 as bigint), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast(1 as bigint), cast(1 as decimal(20, 0))) FROM t; + +SELECT pmod(cast(1 as float), cast(1 as decimal(3, 0))) FROM t; +SELECT pmod(cast(1 as float), cast(1 as decimal(5, 0))) FROM t; +SELECT pmod(cast(1 as float), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast(1 as float), cast(1 as decimal(20, 0))) FROM t; + +SELECT pmod(cast(1 as double), cast(1 as decimal(3, 0))) FROM t; +SELECT pmod(cast(1 as double), cast(1 as decimal(5, 0))) FROM t; +SELECT pmod(cast(1 as double), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast(1 as double), cast(1 as decimal(20, 0))) FROM t; + +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(3, 0))) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(5, 0))) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(20, 0))) FROM t; + +SELECT pmod(cast('1' as binary), cast(1 as decimal(3, 0))) FROM t; +SELECT pmod(cast('1' as binary), cast(1 as decimal(5, 0))) FROM t; +SELECT pmod(cast('1' as binary), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast('1' as binary), cast(1 as decimal(20, 0))) FROM t; + +SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(3, 0))) FROM t; +SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(5, 0))) FROM t; +SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(20, 0))) FROM t; + 
+SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(3, 0))) FROM t; +SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(5, 0))) FROM t; +SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(20, 0))) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as tinyint)) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as tinyint)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as tinyint)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as tinyint)) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as smallint)) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as smallint)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as smallint)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as smallint)) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as int)) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as int)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as int)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as int)) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as bigint)) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as bigint)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as bigint)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as bigint)) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as float)) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as float)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as float)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as float)) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as double)) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as double)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as double)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as double)) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(10, 0))) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as decimal(10, 0))) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as string)) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as string)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as string)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as string)) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast('1' as binary)) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast('1' as binary)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast('1' as binary)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast('1' as binary)) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as boolean)) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as boolean)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as boolean)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as boolean)) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT pmod(cast(1 as decimal(5, 0)) , cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; + +SELECT pmod(cast(1 as decimal(3, 0)) , cast('2017-12-11 09:30:00' as date)) FROM t; +SELECT pmod(cast(1 
as decimal(5, 0)) , cast('2017-12-11 09:30:00' as date)) FROM t; +SELECT pmod(cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00' as date)) FROM t; +SELECT pmod(cast(1 as decimal(20, 0)), cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT cast(1 as tinyint) = cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) = cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) = cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) = cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) = cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) = cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) = cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) = cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) = cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) = cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) = cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) = cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) = cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) = cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) = cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) = cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) = cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) = cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) = cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) = cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) = cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast(1 as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast(1 as int) FROM t; 
+SELECT cast(1 as decimal(10, 0)) = cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as decimal(3, 0)) = cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) = cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) = cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) = cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) <=> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) <=> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) <=> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) <=> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) <=> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) <=> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) <=> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) <=> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) <=> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) <=> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) <=> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) <=> 
cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) <=> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) <=> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) <=> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) <=> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) <=> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) <=> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) <=> cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) <=> cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) <=> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as 
decimal(20, 0)) <=> cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as decimal(3, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) < cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) < cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) < cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) < cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) < cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) < cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) < cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) < cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) < cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) < cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) < cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) < cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) < cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) < cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) < cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) < cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) < cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) < cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) < cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) < cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) < cast(1 as decimal(20, 0)) FROM t; + +SELECT 
cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast(1 as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as 
decimal(3, 0)) < cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) < cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) < cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) < cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) <= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) <= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) <= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) <= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) <= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) <= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) <= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) <= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) <= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) <= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) <= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) <= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) <= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) <= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) <= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) <= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) <= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) <= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) <= cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) <= cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) <= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast(1 as smallint) FROM t; + +SELECT 
cast(1 as decimal(3, 0)) <= cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as decimal(3, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) > cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) > cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) > cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) > cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) > cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) > cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) > cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) > cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) > cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) > cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) > cast(1 as 
decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) > cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) > cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) > cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) > cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) > cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) > cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) > cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) > cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) > cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) > cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast(1 as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as 
decimal(20, 0)) > cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as decimal(3, 0)) > cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) > cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) > cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) > cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) >= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) >= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) >= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) >= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) >= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) >= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) >= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) >= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) >= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) >= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) >= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) >= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) >= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) >= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) >= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) >= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) >= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) >= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) >= cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) >= cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) >= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 
09:30:00.0' as timestamp) >= cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast(1 as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) 
FROM t; + +SELECT cast(1 as decimal(3, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t; + +SELECT cast(1 as tinyint) <> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as tinyint) <> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as tinyint) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as tinyint) <> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as smallint) <> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as smallint) <> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as smallint) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as smallint) <> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as int) <> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as int) <> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as int) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as int) <> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as bigint) <> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as bigint) <> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as bigint) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as bigint) <> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as float) <> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as float) <> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as float) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as float) <> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as double) <> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as double) <> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as double) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as double) <> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(3, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(5, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('1' as binary) <> cast(1 as decimal(3, 0)) FROM t; +SELECT cast('1' as binary) <> cast(1 as decimal(5, 0)) FROM t; +SELECT cast('1' as binary) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast('1' as binary) <> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(3, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(5, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(20, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as tinyint) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast(1 as tinyint) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as smallint) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast(1 
as smallint) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast(1 as int) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast(1 as int) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as int) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast(1 as int) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as bigint) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast(1 as bigint) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast(1 as float) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast(1 as float) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as float) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast(1 as float) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast(1 as double) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast(1 as double) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as double) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast(1 as double) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(10, 0)) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast(1 as decimal(10, 0)) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast(1 as string) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast(1 as string) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as string) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast(1 as string) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast('1' as binary) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast('1' as binary) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast('1' as binary) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast('1' as binary) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast(1 as boolean) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast(1 as boolean) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t; + +SELECT cast(1 as decimal(3, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(5, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(10, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t; +SELECT cast(1 as decimal(20, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t; diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/windowFrameCoercion.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/windowFrameCoercion.sql new file mode 100644 index 0000000000000..5cd3538757499 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/windowFrameCoercion.sql @@ -0,0 +1,44 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +CREATE TEMPORARY VIEW t AS SELECT 1; + +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as tinyint)) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as smallint)) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as int)) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as bigint)) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as float)) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as double)) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as decimal(10, 0))) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as string)) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('1' as binary)) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as boolean)) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as timestamp)) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00' as date)) FROM t; + +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as tinyint) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as smallint) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as int) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as bigint) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as float) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as double) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as decimal(10, 0)) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as string) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('1' as binary) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as boolean) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as timestamp) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00' as date) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out new file mode 100644 index 0000000000000..ebc8201ed5a1d --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalPrecision.sql.out @@ -0,0 +1,9514 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 1145 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT cast(1 as tinyint) + cast(1 as decimal(3, 0)) 
FROM t +-- !query 1 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) + CAST(1 AS DECIMAL(3,0))):decimal(4,0)> +-- !query 1 output +2 + + +-- !query 2 +SELECT cast(1 as tinyint) + cast(1 as decimal(5, 0)) FROM t +-- !query 2 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(6,0)) + CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(6,0))):decimal(6,0)> +-- !query 2 output +2 + + +-- !query 3 +SELECT cast(1 as tinyint) + cast(1 as decimal(10, 0)) FROM t +-- !query 3 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 3 output +2 + + +-- !query 4 +SELECT cast(1 as tinyint) + cast(1 as decimal(20, 0)) FROM t +-- !query 4 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 4 output +2 + + +-- !query 5 +SELECT cast(1 as smallint) + cast(1 as decimal(3, 0)) FROM t +-- !query 5 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(6,0)) + CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(6,0))):decimal(6,0)> +-- !query 5 output +2 + + +-- !query 6 +SELECT cast(1 as smallint) + cast(1 as decimal(5, 0)) FROM t +-- !query 6 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) + CAST(1 AS DECIMAL(5,0))):decimal(6,0)> +-- !query 6 output +2 + + +-- !query 7 +SELECT cast(1 as smallint) + cast(1 as decimal(10, 0)) FROM t +-- !query 7 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 7 output +2 + + +-- !query 8 +SELECT cast(1 as smallint) + cast(1 as decimal(20, 0)) FROM t +-- !query 8 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 8 output +2 + + +-- !query 9 +SELECT cast(1 as int) + cast(1 as decimal(3, 0)) FROM t +-- !query 9 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 9 output +2 + + +-- !query 10 +SELECT cast(1 as int) + cast(1 as decimal(5, 0)) FROM t +-- !query 10 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 10 output +2 + + +-- !query 11 +SELECT cast(1 as int) + cast(1 as decimal(10, 0)) FROM t +-- !query 11 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) + CAST(1 AS DECIMAL(10,0))):decimal(11,0)> +-- !query 11 output +2 + + +-- !query 12 +SELECT cast(1 as int) + cast(1 as decimal(20, 0)) FROM t +-- !query 12 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 12 output +2 + + +-- !query 13 +SELECT cast(1 as bigint) + cast(1 as decimal(3, 0)) FROM t +-- !query 13 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 13 output +2 + + +-- !query 14 +SELECT cast(1 as bigint) + cast(1 as decimal(5, 0)) FROM t +-- !query 14 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 14 output +2 + + +-- !query 15 +SELECT cast(1 as bigint) + cast(1 as decimal(10, 0)) FROM t +-- !query 15 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS 
DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 15 output +2 + + +-- !query 16 +SELECT cast(1 as bigint) + cast(1 as decimal(20, 0)) FROM t +-- !query 16 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) + CAST(1 AS DECIMAL(20,0))):decimal(21,0)> +-- !query 16 output +2 + + +-- !query 17 +SELECT cast(1 as float) + cast(1 as decimal(3, 0)) FROM t +-- !query 17 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) + CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> +-- !query 17 output +2.0 + + +-- !query 18 +SELECT cast(1 as float) + cast(1 as decimal(5, 0)) FROM t +-- !query 18 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) + CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> +-- !query 18 output +2.0 + + +-- !query 19 +SELECT cast(1 as float) + cast(1 as decimal(10, 0)) FROM t +-- !query 19 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) + CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 19 output +2.0 + + +-- !query 20 +SELECT cast(1 as float) + cast(1 as decimal(20, 0)) FROM t +-- !query 20 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) + CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> +-- !query 20 output +2.0 + + +-- !query 21 +SELECT cast(1 as double) + cast(1 as decimal(3, 0)) FROM t +-- !query 21 schema +struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> +-- !query 21 output +2.0 + + +-- !query 22 +SELECT cast(1 as double) + cast(1 as decimal(5, 0)) FROM t +-- !query 22 schema +struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> +-- !query 22 output +2.0 + + +-- !query 23 +SELECT cast(1 as double) + cast(1 as decimal(10, 0)) FROM t +-- !query 23 schema +struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 23 output +2.0 + + +-- !query 24 +SELECT cast(1 as double) + cast(1 as decimal(20, 0)) FROM t +-- !query 24 schema +struct<(CAST(1 AS DOUBLE) + CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> +-- !query 24 output +2.0 + + +-- !query 25 +SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(3, 0)) FROM t +-- !query 25 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 25 output +2 + + +-- !query 26 +SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(5, 0)) FROM t +-- !query 26 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 26 output +2 + + +-- !query 27 +SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(10, 0)) FROM t +-- !query 27 schema +struct<(CAST(1 AS DECIMAL(10,0)) + CAST(1 AS DECIMAL(10,0))):decimal(11,0)> +-- !query 27 output +2 + + +-- !query 28 +SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(20, 0)) FROM t +-- !query 28 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 28 output +2 + + +-- !query 29 +SELECT cast('1' as binary) + cast(1 as decimal(3, 0)) FROM t +-- !query 29 schema +struct<> +-- !query 29 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 30 +SELECT cast('1' as binary) + cast(1 as decimal(5, 0)) FROM t +-- !query 30 schema +struct<> +-- !query 30 output +org.apache.spark.sql.AnalysisException +cannot 
resolve '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 31 +SELECT cast('1' as binary) + cast(1 as decimal(10, 0)) FROM t +-- !query 31 schema +struct<> +-- !query 31 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 32 +SELECT cast('1' as binary) + cast(1 as decimal(20, 0)) FROM t +-- !query 32 schema +struct<> +-- !query 32 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) + CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 33 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(3, 0)) FROM t +-- !query 33 schema +struct<> +-- !query 33 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 34 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(5, 0)) FROM t +-- !query 34 schema +struct<> +-- !query 34 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 35 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(10, 0)) FROM t +-- !query 35 schema +struct<> +-- !query 35 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 36 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) + cast(1 as decimal(20, 0)) FROM t +-- !query 36 schema +struct<> +-- !query 36 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) + CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 37 +SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(3, 0)) FROM t +-- !query 37 schema +struct<> +-- !query 37 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 38 +SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(5, 0)) FROM t +-- !query 38 schema +struct<> +-- !query 38 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(5,0)))' 
(date and decimal(5,0)).; line 1 pos 7 + + +-- !query 39 +SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(10, 0)) FROM t +-- !query 39 schema +struct<> +-- !query 39 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 40 +SELECT cast('2017-12-11 09:30:00' as date) + cast(1 as decimal(20, 0)) FROM t +-- !query 40 schema +struct<> +-- !query 40 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) + CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 41 +SELECT cast(1 as decimal(3, 0)) + cast(1 as tinyint) FROM t +-- !query 41 schema +struct<(CAST(1 AS DECIMAL(3,0)) + CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):decimal(4,0)> +-- !query 41 output +2 + + +-- !query 42 +SELECT cast(1 as decimal(5, 0)) + cast(1 as tinyint) FROM t +-- !query 42 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(6,0)) + CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(6,0))):decimal(6,0)> +-- !query 42 output +2 + + +-- !query 43 +SELECT cast(1 as decimal(10, 0)) + cast(1 as tinyint) FROM t +-- !query 43 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 43 output +2 + + +-- !query 44 +SELECT cast(1 as decimal(20, 0)) + cast(1 as tinyint) FROM t +-- !query 44 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 44 output +2 + + +-- !query 45 +SELECT cast(1 as decimal(3, 0)) + cast(1 as smallint) FROM t +-- !query 45 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(6,0)) + CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(6,0))):decimal(6,0)> +-- !query 45 output +2 + + +-- !query 46 +SELECT cast(1 as decimal(5, 0)) + cast(1 as smallint) FROM t +-- !query 46 schema +struct<(CAST(1 AS DECIMAL(5,0)) + CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):decimal(6,0)> +-- !query 46 output +2 + + +-- !query 47 +SELECT cast(1 as decimal(10, 0)) + cast(1 as smallint) FROM t +-- !query 47 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) + CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 47 output +2 + + +-- !query 48 +SELECT cast(1 as decimal(20, 0)) + cast(1 as smallint) FROM t +-- !query 48 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 48 output +2 + + +-- !query 49 +SELECT cast(1 as decimal(3, 0)) + cast(1 as int) FROM t +-- !query 49 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0)) + CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 49 output +2 + + +-- !query 50 +SELECT cast(1 as decimal(5, 0)) + cast(1 as int) FROM t +-- !query 50 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0)) + CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 50 output +2 + + +-- !query 51 +SELECT cast(1 as decimal(10, 0)) + cast(1 as int) FROM t +-- !query 51 schema +struct<(CAST(1 AS DECIMAL(10,0)) + 
CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(11,0)> +-- !query 51 output +2 + + +-- !query 52 +SELECT cast(1 as decimal(20, 0)) + cast(1 as int) FROM t +-- !query 52 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 52 output +2 + + +-- !query 53 +SELECT cast(1 as decimal(3, 0)) + cast(1 as bigint) FROM t +-- !query 53 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 53 output +2 + + +-- !query 54 +SELECT cast(1 as decimal(5, 0)) + cast(1 as bigint) FROM t +-- !query 54 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 54 output +2 + + +-- !query 55 +SELECT cast(1 as decimal(10, 0)) + cast(1 as bigint) FROM t +-- !query 55 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0)) + CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 55 output +2 + + +-- !query 56 +SELECT cast(1 as decimal(20, 0)) + cast(1 as bigint) FROM t +-- !query 56 schema +struct<(CAST(1 AS DECIMAL(20,0)) + CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(21,0)> +-- !query 56 output +2 + + +-- !query 57 +SELECT cast(1 as decimal(3, 0)) + cast(1 as float) FROM t +-- !query 57 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) + CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 57 output +2.0 + + +-- !query 58 +SELECT cast(1 as decimal(5, 0)) + cast(1 as float) FROM t +-- !query 58 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) + CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 58 output +2.0 + + +-- !query 59 +SELECT cast(1 as decimal(10, 0)) + cast(1 as float) FROM t +-- !query 59 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) + CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 59 output +2.0 + + +-- !query 60 +SELECT cast(1 as decimal(20, 0)) + cast(1 as float) FROM t +-- !query 60 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) + CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 60 output +2.0 + + +-- !query 61 +SELECT cast(1 as decimal(3, 0)) + cast(1 as double) FROM t +-- !query 61 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 61 output +2.0 + + +-- !query 62 +SELECT cast(1 as decimal(5, 0)) + cast(1 as double) FROM t +-- !query 62 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 62 output +2.0 + + +-- !query 63 +SELECT cast(1 as decimal(10, 0)) + cast(1 as double) FROM t +-- !query 63 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 63 output +2.0 + + +-- !query 64 +SELECT cast(1 as decimal(20, 0)) + cast(1 as double) FROM t +-- !query 64 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) + CAST(1 AS DOUBLE)):double> +-- !query 64 output +2.0 + + +-- !query 65 +SELECT cast(1 as decimal(3, 0)) + cast(1 as decimal(10, 0)) FROM t +-- !query 65 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 65 output +2 + + +-- !query 66 +SELECT cast(1 as decimal(5, 0)) + cast(1 as decimal(10, 0)) FROM t +-- !query 66 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 66 output +2 + + +-- !query 67 
+SELECT cast(1 as decimal(10, 0)) + cast(1 as decimal(10, 0)) FROM t +-- !query 67 schema +struct<(CAST(1 AS DECIMAL(10,0)) + CAST(1 AS DECIMAL(10,0))):decimal(11,0)> +-- !query 67 output +2 + + +-- !query 68 +SELECT cast(1 as decimal(20, 0)) + cast(1 as decimal(10, 0)) FROM t +-- !query 68 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) + CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 68 output +2 + + +-- !query 69 +SELECT cast(1 as decimal(3, 0)) + cast(1 as string) FROM t +-- !query 69 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) + CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 69 output +2.0 + + +-- !query 70 +SELECT cast(1 as decimal(5, 0)) + cast(1 as string) FROM t +-- !query 70 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) + CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 70 output +2.0 + + +-- !query 71 +SELECT cast(1 as decimal(10, 0)) + cast(1 as string) FROM t +-- !query 71 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) + CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 71 output +2.0 + + +-- !query 72 +SELECT cast(1 as decimal(20, 0)) + cast(1 as string) FROM t +-- !query 72 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) + CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 72 output +2.0 + + +-- !query 73 +SELECT cast(1 as decimal(3, 0)) + cast('1' as binary) FROM t +-- !query 73 schema +struct<> +-- !query 73 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) + CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 74 +SELECT cast(1 as decimal(5, 0)) + cast('1' as binary) FROM t +-- !query 74 schema +struct<> +-- !query 74 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) + CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 75 +SELECT cast(1 as decimal(10, 0)) + cast('1' as binary) FROM t +-- !query 75 schema +struct<> +-- !query 75 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) + CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 76 +SELECT cast(1 as decimal(20, 0)) + cast('1' as binary) FROM t +-- !query 76 schema +struct<> +-- !query 76 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) + CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 77 +SELECT cast(1 as decimal(3, 0)) + cast(1 as boolean) FROM t +-- !query 77 schema +struct<> +-- !query 77 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) + CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 + + +-- !query 78 +SELECT cast(1 as decimal(5, 0)) + cast(1 as boolean) FROM t +-- !query 78 schema +struct<> +-- !query 78 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) + CAST(1 AS BOOLEAN))' (decimal(5,0) 
and boolean).; line 1 pos 7 + + +-- !query 79 +SELECT cast(1 as decimal(10, 0)) + cast(1 as boolean) FROM t +-- !query 79 schema +struct<> +-- !query 79 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) + CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 80 +SELECT cast(1 as decimal(20, 0)) + cast(1 as boolean) FROM t +-- !query 80 schema +struct<> +-- !query 80 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) + CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 + + +-- !query 81 +SELECT cast(1 as decimal(3, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 81 schema +struct<> +-- !query 81 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 82 +SELECT cast(1 as decimal(5, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 82 schema +struct<> +-- !query 82 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 83 +SELECT cast(1 as decimal(10, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 83 schema +struct<> +-- !query 83 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 84 +SELECT cast(1 as decimal(20, 0)) + cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 84 schema +struct<> +-- !query 84 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) + CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 85 +SELECT cast(1 as decimal(3, 0)) + cast('2017-12-11 09:30:00' as date) FROM t +-- !query 85 schema +struct<> +-- !query 85 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) + CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) + CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 86 +SELECT cast(1 as decimal(5, 0)) + cast('2017-12-11 09:30:00' as date) FROM t +-- !query 86 schema +struct<> +-- !query 86 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) + CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) + CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 87 +SELECT cast(1 as decimal(10, 0)) + cast('2017-12-11 09:30:00' as date) FROM t +-- !query 87 schema +struct<> +-- 
!query 87 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) + CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) + CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 88 +SELECT cast(1 as decimal(20, 0)) + cast('2017-12-11 09:30:00' as date) FROM t +-- !query 88 schema +struct<> +-- !query 88 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) + CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) + CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 89 +SELECT cast(1 as tinyint) - cast(1 as decimal(3, 0)) FROM t +-- !query 89 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) - CAST(1 AS DECIMAL(3,0))):decimal(4,0)> +-- !query 89 output +0 + + +-- !query 90 +SELECT cast(1 as tinyint) - cast(1 as decimal(5, 0)) FROM t +-- !query 90 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(6,0)) - CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(6,0))):decimal(6,0)> +-- !query 90 output +0 + + +-- !query 91 +SELECT cast(1 as tinyint) - cast(1 as decimal(10, 0)) FROM t +-- !query 91 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 91 output +0 + + +-- !query 92 +SELECT cast(1 as tinyint) - cast(1 as decimal(20, 0)) FROM t +-- !query 92 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 92 output +0 + + +-- !query 93 +SELECT cast(1 as smallint) - cast(1 as decimal(3, 0)) FROM t +-- !query 93 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(6,0)) - CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(6,0))):decimal(6,0)> +-- !query 93 output +0 + + +-- !query 94 +SELECT cast(1 as smallint) - cast(1 as decimal(5, 0)) FROM t +-- !query 94 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) - CAST(1 AS DECIMAL(5,0))):decimal(6,0)> +-- !query 94 output +0 + + +-- !query 95 +SELECT cast(1 as smallint) - cast(1 as decimal(10, 0)) FROM t +-- !query 95 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 95 output +0 + + +-- !query 96 +SELECT cast(1 as smallint) - cast(1 as decimal(20, 0)) FROM t +-- !query 96 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 96 output +0 + + +-- !query 97 +SELECT cast(1 as int) - cast(1 as decimal(3, 0)) FROM t +-- !query 97 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 97 output +0 + + +-- !query 98 +SELECT cast(1 as int) - cast(1 as decimal(5, 0)) FROM t +-- !query 98 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 98 output +0 + + +-- !query 99 +SELECT cast(1 as int) - cast(1 as decimal(10, 0)) FROM t +-- !query 99 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) - CAST(1 AS DECIMAL(10,0))):decimal(11,0)> +-- !query 99 output +0 + + +-- !query 100 +SELECT cast(1 as int) - cast(1 as decimal(20, 0)) FROM t +-- !query 100 schema 
+struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 100 output +0 + + +-- !query 101 +SELECT cast(1 as bigint) - cast(1 as decimal(3, 0)) FROM t +-- !query 101 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 101 output +0 + + +-- !query 102 +SELECT cast(1 as bigint) - cast(1 as decimal(5, 0)) FROM t +-- !query 102 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 102 output +0 + + +-- !query 103 +SELECT cast(1 as bigint) - cast(1 as decimal(10, 0)) FROM t +-- !query 103 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 103 output +0 + + +-- !query 104 +SELECT cast(1 as bigint) - cast(1 as decimal(20, 0)) FROM t +-- !query 104 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) - CAST(1 AS DECIMAL(20,0))):decimal(21,0)> +-- !query 104 output +0 + + +-- !query 105 +SELECT cast(1 as float) - cast(1 as decimal(3, 0)) FROM t +-- !query 105 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) - CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> +-- !query 105 output +0.0 + + +-- !query 106 +SELECT cast(1 as float) - cast(1 as decimal(5, 0)) FROM t +-- !query 106 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) - CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> +-- !query 106 output +0.0 + + +-- !query 107 +SELECT cast(1 as float) - cast(1 as decimal(10, 0)) FROM t +-- !query 107 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) - CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 107 output +0.0 + + +-- !query 108 +SELECT cast(1 as float) - cast(1 as decimal(20, 0)) FROM t +-- !query 108 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) - CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> +-- !query 108 output +0.0 + + +-- !query 109 +SELECT cast(1 as double) - cast(1 as decimal(3, 0)) FROM t +-- !query 109 schema +struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> +-- !query 109 output +0.0 + + +-- !query 110 +SELECT cast(1 as double) - cast(1 as decimal(5, 0)) FROM t +-- !query 110 schema +struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> +-- !query 110 output +0.0 + + +-- !query 111 +SELECT cast(1 as double) - cast(1 as decimal(10, 0)) FROM t +-- !query 111 schema +struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 111 output +0.0 + + +-- !query 112 +SELECT cast(1 as double) - cast(1 as decimal(20, 0)) FROM t +-- !query 112 schema +struct<(CAST(1 AS DOUBLE) - CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> +-- !query 112 output +0.0 + + +-- !query 113 +SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(3, 0)) FROM t +-- !query 113 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 113 output +0 + + +-- !query 114 +SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(5, 0)) FROM t +-- !query 114 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 114 output +0 + + +-- !query 115 +SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(10, 0)) FROM t +-- !query 115 schema +struct<(CAST(1 AS DECIMAL(10,0)) - CAST(1 AS 
DECIMAL(10,0))):decimal(11,0)> +-- !query 115 output +0 + + +-- !query 116 +SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(20, 0)) FROM t +-- !query 116 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 116 output +0 + + +-- !query 117 +SELECT cast('1' as binary) - cast(1 as decimal(3, 0)) FROM t +-- !query 117 schema +struct<> +-- !query 117 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 118 +SELECT cast('1' as binary) - cast(1 as decimal(5, 0)) FROM t +-- !query 118 schema +struct<> +-- !query 118 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 119 +SELECT cast('1' as binary) - cast(1 as decimal(10, 0)) FROM t +-- !query 119 schema +struct<> +-- !query 119 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 120 +SELECT cast('1' as binary) - cast(1 as decimal(20, 0)) FROM t +-- !query 120 schema +struct<> +-- !query 120 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 121 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(3, 0)) FROM t +-- !query 121 schema +struct<> +-- !query 121 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 122 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(5, 0)) FROM t +-- !query 122 schema +struct<> +-- !query 122 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 123 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(10, 0)) FROM t +-- !query 123 schema +struct<> +-- !query 123 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 124 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) - cast(1 as decimal(20, 0)) FROM t +-- !query 124 schema +struct<> +-- !query 124 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) - CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' 
AS TIMESTAMP) - CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 125 +SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(3, 0)) FROM t +-- !query 125 schema +struct<> +-- !query 125 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 126 +SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(5, 0)) FROM t +-- !query 126 schema +struct<> +-- !query 126 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 127 +SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(10, 0)) FROM t +-- !query 127 schema +struct<> +-- !query 127 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 128 +SELECT cast('2017-12-11 09:30:00' as date) - cast(1 as decimal(20, 0)) FROM t +-- !query 128 schema +struct<> +-- !query 128 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) - CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 129 +SELECT cast(1 as decimal(3, 0)) - cast(1 as tinyint) FROM t +-- !query 129 schema +struct<(CAST(1 AS DECIMAL(3,0)) - CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):decimal(4,0)> +-- !query 129 output +0 + + +-- !query 130 +SELECT cast(1 as decimal(5, 0)) - cast(1 as tinyint) FROM t +-- !query 130 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(6,0)) - CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(6,0))):decimal(6,0)> +-- !query 130 output +0 + + +-- !query 131 +SELECT cast(1 as decimal(10, 0)) - cast(1 as tinyint) FROM t +-- !query 131 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 131 output +0 + + +-- !query 132 +SELECT cast(1 as decimal(20, 0)) - cast(1 as tinyint) FROM t +-- !query 132 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 132 output +0 + + +-- !query 133 +SELECT cast(1 as decimal(3, 0)) - cast(1 as smallint) FROM t +-- !query 133 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(6,0)) - CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(6,0))):decimal(6,0)> +-- !query 133 output +0 + + +-- !query 134 +SELECT cast(1 as decimal(5, 0)) - cast(1 as smallint) FROM t +-- !query 134 schema +struct<(CAST(1 AS DECIMAL(5,0)) - CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):decimal(6,0)> +-- !query 134 output +0 + + +-- !query 135 +SELECT cast(1 as decimal(10, 0)) - cast(1 as smallint) FROM t +-- !query 135 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0)) - CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 135 
output +0 + + +-- !query 136 +SELECT cast(1 as decimal(20, 0)) - cast(1 as smallint) FROM t +-- !query 136 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 136 output +0 + + +-- !query 137 +SELECT cast(1 as decimal(3, 0)) - cast(1 as int) FROM t +-- !query 137 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0)) - CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 137 output +0 + + +-- !query 138 +SELECT cast(1 as decimal(5, 0)) - cast(1 as int) FROM t +-- !query 138 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0)) - CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 138 output +0 + + +-- !query 139 +SELECT cast(1 as decimal(10, 0)) - cast(1 as int) FROM t +-- !query 139 schema +struct<(CAST(1 AS DECIMAL(10,0)) - CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(11,0)> +-- !query 139 output +0 + + +-- !query 140 +SELECT cast(1 as decimal(20, 0)) - cast(1 as int) FROM t +-- !query 140 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 140 output +0 + + +-- !query 141 +SELECT cast(1 as decimal(3, 0)) - cast(1 as bigint) FROM t +-- !query 141 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 141 output +0 + + +-- !query 142 +SELECT cast(1 as decimal(5, 0)) - cast(1 as bigint) FROM t +-- !query 142 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 142 output +0 + + +-- !query 143 +SELECT cast(1 as decimal(10, 0)) - cast(1 as bigint) FROM t +-- !query 143 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0)) - CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 143 output +0 + + +-- !query 144 +SELECT cast(1 as decimal(20, 0)) - cast(1 as bigint) FROM t +-- !query 144 schema +struct<(CAST(1 AS DECIMAL(20,0)) - CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(21,0)> +-- !query 144 output +0 + + +-- !query 145 +SELECT cast(1 as decimal(3, 0)) - cast(1 as float) FROM t +-- !query 145 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) - CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 145 output +0.0 + + +-- !query 146 +SELECT cast(1 as decimal(5, 0)) - cast(1 as float) FROM t +-- !query 146 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) - CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 146 output +0.0 + + +-- !query 147 +SELECT cast(1 as decimal(10, 0)) - cast(1 as float) FROM t +-- !query 147 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) - CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 147 output +0.0 + + +-- !query 148 +SELECT cast(1 as decimal(20, 0)) - cast(1 as float) FROM t +-- !query 148 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) - CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 148 output +0.0 + + +-- !query 149 +SELECT cast(1 as decimal(3, 0)) - cast(1 as double) FROM t +-- !query 149 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 149 output +0.0 + + +-- !query 150 +SELECT cast(1 as decimal(5, 0)) - cast(1 as double) FROM t +-- !query 150 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) - CAST(1 AS DOUBLE)):double> 
+-- !query 150 output +0.0 + + +-- !query 151 +SELECT cast(1 as decimal(10, 0)) - cast(1 as double) FROM t +-- !query 151 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 151 output +0.0 + + +-- !query 152 +SELECT cast(1 as decimal(20, 0)) - cast(1 as double) FROM t +-- !query 152 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) - CAST(1 AS DOUBLE)):double> +-- !query 152 output +0.0 + + +-- !query 153 +SELECT cast(1 as decimal(3, 0)) - cast(1 as decimal(10, 0)) FROM t +-- !query 153 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 153 output +0 + + +-- !query 154 +SELECT cast(1 as decimal(5, 0)) - cast(1 as decimal(10, 0)) FROM t +-- !query 154 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(11,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(11,0))):decimal(11,0)> +-- !query 154 output +0 + + +-- !query 155 +SELECT cast(1 as decimal(10, 0)) - cast(1 as decimal(10, 0)) FROM t +-- !query 155 schema +struct<(CAST(1 AS DECIMAL(10,0)) - CAST(1 AS DECIMAL(10,0))):decimal(11,0)> +-- !query 155 output +0 + + +-- !query 156 +SELECT cast(1 as decimal(20, 0)) - cast(1 as decimal(10, 0)) FROM t +-- !query 156 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(21,0)) - CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(21,0))):decimal(21,0)> +-- !query 156 output +0 + + +-- !query 157 +SELECT cast(1 as decimal(3, 0)) - cast(1 as string) FROM t +-- !query 157 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) - CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 157 output +0.0 + + +-- !query 158 +SELECT cast(1 as decimal(5, 0)) - cast(1 as string) FROM t +-- !query 158 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) - CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 158 output +0.0 + + +-- !query 159 +SELECT cast(1 as decimal(10, 0)) - cast(1 as string) FROM t +-- !query 159 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) - CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 159 output +0.0 + + +-- !query 160 +SELECT cast(1 as decimal(20, 0)) - cast(1 as string) FROM t +-- !query 160 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) - CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 160 output +0.0 + + +-- !query 161 +SELECT cast(1 as decimal(3, 0)) - cast('1' as binary) FROM t +-- !query 161 schema +struct<> +-- !query 161 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) - CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) - CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 162 +SELECT cast(1 as decimal(5, 0)) - cast('1' as binary) FROM t +-- !query 162 schema +struct<> +-- !query 162 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) - CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) - CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 163 +SELECT cast(1 as decimal(10, 0)) - cast('1' as binary) FROM t +-- !query 163 schema +struct<> +-- !query 163 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) - CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) - CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 164 +SELECT cast(1 as decimal(20, 0)) - cast('1' as binary) FROM t +-- !query 164 
schema +struct<> +-- !query 164 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) - CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) - CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 165 +SELECT cast(1 as decimal(3, 0)) - cast(1 as boolean) FROM t +-- !query 165 schema +struct<> +-- !query 165 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) - CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) - CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 + + +-- !query 166 +SELECT cast(1 as decimal(5, 0)) - cast(1 as boolean) FROM t +-- !query 166 schema +struct<> +-- !query 166 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) - CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) - CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 + + +-- !query 167 +SELECT cast(1 as decimal(10, 0)) - cast(1 as boolean) FROM t +-- !query 167 schema +struct<> +-- !query 167 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) - CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) - CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 168 +SELECT cast(1 as decimal(20, 0)) - cast(1 as boolean) FROM t +-- !query 168 schema +struct<> +-- !query 168 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) - CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) - CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 + + +-- !query 169 +SELECT cast(1 as decimal(3, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 169 schema +struct<> +-- !query 169 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 170 +SELECT cast(1 as decimal(5, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 170 schema +struct<> +-- !query 170 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 171 +SELECT cast(1 as decimal(10, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 171 schema +struct<> +-- !query 171 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 172 +SELECT cast(1 as decimal(20, 0)) - cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 172 schema +struct<> +-- !query 172 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) - CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) - CAST('2017-12-11 09:30:00.0' AS 
TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 173 +SELECT cast(1 as decimal(3, 0)) - cast('2017-12-11 09:30:00' as date) FROM t +-- !query 173 schema +struct<> +-- !query 173 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) - CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) - CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 174 +SELECT cast(1 as decimal(5, 0)) - cast('2017-12-11 09:30:00' as date) FROM t +-- !query 174 schema +struct<> +-- !query 174 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) - CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) - CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 175 +SELECT cast(1 as decimal(10, 0)) - cast('2017-12-11 09:30:00' as date) FROM t +-- !query 175 schema +struct<> +-- !query 175 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) - CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) - CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 176 +SELECT cast(1 as decimal(20, 0)) - cast('2017-12-11 09:30:00' as date) FROM t +-- !query 176 schema +struct<> +-- !query 176 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) - CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) - CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 177 +SELECT cast(1 as tinyint) * cast(1 as decimal(3, 0)) FROM t +-- !query 177 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) * CAST(1 AS DECIMAL(3,0))):decimal(7,0)> +-- !query 177 output +1 + + +-- !query 178 +SELECT cast(1 as tinyint) * cast(1 as decimal(5, 0)) FROM t +-- !query 178 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) * CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(9,0)> +-- !query 178 output +1 + + +-- !query 179 +SELECT cast(1 as tinyint) * cast(1 as decimal(10, 0)) FROM t +-- !query 179 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,0)> +-- !query 179 output +1 + + +-- !query 180 +SELECT cast(1 as tinyint) * cast(1 as decimal(20, 0)) FROM t +-- !query 180 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(24,0)> +-- !query 180 output +1 + + +-- !query 181 +SELECT cast(1 as smallint) * cast(1 as decimal(3, 0)) FROM t +-- !query 181 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) * CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(9,0)> +-- !query 181 output +1 + + +-- !query 182 +SELECT cast(1 as smallint) * cast(1 as decimal(5, 0)) FROM t +-- !query 182 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) * CAST(1 AS DECIMAL(5,0))):decimal(11,0)> +-- !query 182 output +1 + + +-- !query 183 +SELECT cast(1 as smallint) * cast(1 as decimal(10, 0)) FROM t +-- !query 183 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,0)> +-- !query 183 output +1 + + +-- !query 184 
+SELECT cast(1 as smallint) * cast(1 as decimal(20, 0)) FROM t +-- !query 184 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(26,0)> +-- !query 184 output +1 + + +-- !query 185 +SELECT cast(1 as int) * cast(1 as decimal(3, 0)) FROM t +-- !query 185 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(14,0)> +-- !query 185 output +1 + + +-- !query 186 +SELECT cast(1 as int) * cast(1 as decimal(5, 0)) FROM t +-- !query 186 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,0)> +-- !query 186 output +1 + + +-- !query 187 +SELECT cast(1 as int) * cast(1 as decimal(10, 0)) FROM t +-- !query 187 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) * CAST(1 AS DECIMAL(10,0))):decimal(21,0)> +-- !query 187 output +1 + + +-- !query 188 +SELECT cast(1 as int) * cast(1 as decimal(20, 0)) FROM t +-- !query 188 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,0)> +-- !query 188 output +1 + + +-- !query 189 +SELECT cast(1 as bigint) * cast(1 as decimal(3, 0)) FROM t +-- !query 189 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(24,0)> +-- !query 189 output +1 + + +-- !query 190 +SELECT cast(1 as bigint) * cast(1 as decimal(5, 0)) FROM t +-- !query 190 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(26,0)> +-- !query 190 output +1 + + +-- !query 191 +SELECT cast(1 as bigint) * cast(1 as decimal(10, 0)) FROM t +-- !query 191 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,0)> +-- !query 191 output +1 + + +-- !query 192 +SELECT cast(1 as bigint) * cast(1 as decimal(20, 0)) FROM t +-- !query 192 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) * CAST(1 AS DECIMAL(20,0))):decimal(38,0)> +-- !query 192 output +1 + + +-- !query 193 +SELECT cast(1 as float) * cast(1 as decimal(3, 0)) FROM t +-- !query 193 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) * CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> +-- !query 193 output +1.0 + + +-- !query 194 +SELECT cast(1 as float) * cast(1 as decimal(5, 0)) FROM t +-- !query 194 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) * CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> +-- !query 194 output +1.0 + + +-- !query 195 +SELECT cast(1 as float) * cast(1 as decimal(10, 0)) FROM t +-- !query 195 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) * CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 195 output +1.0 + + +-- !query 196 +SELECT cast(1 as float) * cast(1 as decimal(20, 0)) FROM t +-- !query 196 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) * CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> +-- !query 196 output +1.0 + + +-- !query 197 +SELECT cast(1 as double) * cast(1 as decimal(3, 0)) FROM t +-- !query 197 schema +struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> +-- !query 197 output +1.0 + + +-- !query 198 +SELECT cast(1 as double) * cast(1 as decimal(5, 0)) FROM t +-- !query 198 schema +struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> +-- !query 198 output +1.0 + + 
+-- !query 199 +SELECT cast(1 as double) * cast(1 as decimal(10, 0)) FROM t +-- !query 199 schema +struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 199 output +1.0 + + +-- !query 200 +SELECT cast(1 as double) * cast(1 as decimal(20, 0)) FROM t +-- !query 200 schema +struct<(CAST(1 AS DOUBLE) * CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> +-- !query 200 output +1.0 + + +-- !query 201 +SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(3, 0)) FROM t +-- !query 201 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(14,0)> +-- !query 201 output +1 + + +-- !query 202 +SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(5, 0)) FROM t +-- !query 202 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,0)> +-- !query 202 output +1 + + +-- !query 203 +SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(10, 0)) FROM t +-- !query 203 schema +struct<(CAST(1 AS DECIMAL(10,0)) * CAST(1 AS DECIMAL(10,0))):decimal(21,0)> +-- !query 203 output +1 + + +-- !query 204 +SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(20, 0)) FROM t +-- !query 204 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,0)> +-- !query 204 output +1 + + +-- !query 205 +SELECT cast('1' as binary) * cast(1 as decimal(3, 0)) FROM t +-- !query 205 schema +struct<> +-- !query 205 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 206 +SELECT cast('1' as binary) * cast(1 as decimal(5, 0)) FROM t +-- !query 206 schema +struct<> +-- !query 206 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 207 +SELECT cast('1' as binary) * cast(1 as decimal(10, 0)) FROM t +-- !query 207 schema +struct<> +-- !query 207 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 208 +SELECT cast('1' as binary) * cast(1 as decimal(20, 0)) FROM t +-- !query 208 schema +struct<> +-- !query 208 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) * CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 209 +SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(3, 0)) FROM t +-- !query 209 schema +struct<> +-- !query 209 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 210 +SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(5, 0)) FROM t +-- !query 210 schema +struct<> +-- !query 210 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 211 +SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(10, 0)) FROM t +-- !query 211 schema +struct<> +-- !query 211 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 212 +SELECT cast('2017*12*11 09:30:00.0' as timestamp) * cast(1 as decimal(20, 0)) FROM t +-- !query 212 schema +struct<> +-- !query 212 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00.0' AS TIMESTAMP) * CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 213 +SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as decimal(3, 0)) FROM t +-- !query 213 schema +struct<> +-- !query 213 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 214 +SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as decimal(5, 0)) FROM t +-- !query 214 schema +struct<> +-- !query 214 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 215 +SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as decimal(10, 0)) FROM t +-- !query 215 schema +struct<> +-- !query 215 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 216 +SELECT cast('2017*12*11 09:30:00' as date) * cast(1 as decimal(20, 0)) FROM t +-- !query 216 schema +struct<> +-- !query 216 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017*12*11 09:30:00' AS DATE) * CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 217 +SELECT cast(1 as decimal(3, 0)) * cast(1 as tinyint) FROM t +-- !query 217 schema +struct<(CAST(1 AS DECIMAL(3,0)) * CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):decimal(7,0)> +-- !query 217 output +1 + + +-- !query 218 +SELECT cast(1 as decimal(5, 0)) * cast(1 as tinyint) FROM t +-- !query 218 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) * CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(9,0)> +-- !query 218 output +1 + + +-- !query 219 +SELECT cast(1 as decimal(10, 0)) * cast(1 as tinyint) FROM t +-- !query 219 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) * 
CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(14,0)> +-- !query 219 output +1 + + +-- !query 220 +SELECT cast(1 as decimal(20, 0)) * cast(1 as tinyint) FROM t +-- !query 220 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(24,0)> +-- !query 220 output +1 + + +-- !query 221 +SELECT cast(1 as decimal(3, 0)) * cast(1 as smallint) FROM t +-- !query 221 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) * CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(9,0)> +-- !query 221 output +1 + + +-- !query 222 +SELECT cast(1 as decimal(5, 0)) * cast(1 as smallint) FROM t +-- !query 222 schema +struct<(CAST(1 AS DECIMAL(5,0)) * CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):decimal(11,0)> +-- !query 222 output +1 + + +-- !query 223 +SELECT cast(1 as decimal(10, 0)) * cast(1 as smallint) FROM t +-- !query 223 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) * CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,0)> +-- !query 223 output +1 + + +-- !query 224 +SELECT cast(1 as decimal(20, 0)) * cast(1 as smallint) FROM t +-- !query 224 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(26,0)> +-- !query 224 output +1 + + +-- !query 225 +SELECT cast(1 as decimal(3, 0)) * cast(1 as int) FROM t +-- !query 225 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) * CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,0)> +-- !query 225 output +1 + + +-- !query 226 +SELECT cast(1 as decimal(5, 0)) * cast(1 as int) FROM t +-- !query 226 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) * CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,0)> +-- !query 226 output +1 + + +-- !query 227 +SELECT cast(1 as decimal(10, 0)) * cast(1 as int) FROM t +-- !query 227 schema +struct<(CAST(1 AS DECIMAL(10,0)) * CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(21,0)> +-- !query 227 output +1 + + +-- !query 228 +SELECT cast(1 as decimal(20, 0)) * cast(1 as int) FROM t +-- !query 228 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,0)> +-- !query 228 output +1 + + +-- !query 229 +SELECT cast(1 as decimal(3, 0)) * cast(1 as bigint) FROM t +-- !query 229 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(24,0)> +-- !query 229 output +1 + + +-- !query 230 +SELECT cast(1 as decimal(5, 0)) * cast(1 as bigint) FROM t +-- !query 230 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(26,0)> +-- !query 230 output +1 + + +-- !query 231 +SELECT cast(1 as decimal(10, 0)) * cast(1 as bigint) FROM t +-- !query 231 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) * CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,0)> +-- !query 231 output +1 + + +-- !query 232 +SELECT cast(1 as decimal(20, 0)) * cast(1 as bigint) FROM t +-- !query 232 schema +struct<(CAST(1 AS DECIMAL(20,0)) * CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(38,0)> +-- !query 232 output +1 + + +-- !query 233 +SELECT cast(1 as decimal(3, 0)) * cast(1 as float) FROM t +-- !query 233 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) * 
CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 233 output +1.0 + + +-- !query 234 +SELECT cast(1 as decimal(5, 0)) * cast(1 as float) FROM t +-- !query 234 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) * CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 234 output +1.0 + + +-- !query 235 +SELECT cast(1 as decimal(10, 0)) * cast(1 as float) FROM t +-- !query 235 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) * CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 235 output +1.0 + + +-- !query 236 +SELECT cast(1 as decimal(20, 0)) * cast(1 as float) FROM t +-- !query 236 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) * CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 236 output +1.0 + + +-- !query 237 +SELECT cast(1 as decimal(3, 0)) * cast(1 as double) FROM t +-- !query 237 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 237 output +1.0 + + +-- !query 238 +SELECT cast(1 as decimal(5, 0)) * cast(1 as double) FROM t +-- !query 238 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 238 output +1.0 + + +-- !query 239 +SELECT cast(1 as decimal(10, 0)) * cast(1 as double) FROM t +-- !query 239 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 239 output +1.0 + + +-- !query 240 +SELECT cast(1 as decimal(20, 0)) * cast(1 as double) FROM t +-- !query 240 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) * CAST(1 AS DOUBLE)):double> +-- !query 240 output +1.0 + + +-- !query 241 +SELECT cast(1 as decimal(3, 0)) * cast(1 as decimal(10, 0)) FROM t +-- !query 241 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,0)> +-- !query 241 output +1 + + +-- !query 242 +SELECT cast(1 as decimal(5, 0)) * cast(1 as decimal(10, 0)) FROM t +-- !query 242 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,0)> +-- !query 242 output +1 + + +-- !query 243 +SELECT cast(1 as decimal(10, 0)) * cast(1 as decimal(10, 0)) FROM t +-- !query 243 schema +struct<(CAST(1 AS DECIMAL(10,0)) * CAST(1 AS DECIMAL(10,0))):decimal(21,0)> +-- !query 243 output +1 + + +-- !query 244 +SELECT cast(1 as decimal(20, 0)) * cast(1 as decimal(10, 0)) FROM t +-- !query 244 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) * CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,0)> +-- !query 244 output +1 + + +-- !query 245 +SELECT cast(1 as decimal(3, 0)) * cast(1 as string) FROM t +-- !query 245 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) * CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 245 output +1.0 + + +-- !query 246 +SELECT cast(1 as decimal(5, 0)) * cast(1 as string) FROM t +-- !query 246 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) * CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 246 output +1.0 + + +-- !query 247 +SELECT cast(1 as decimal(10, 0)) * cast(1 as string) FROM t +-- !query 247 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) * CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 247 output +1.0 + + +-- !query 248 +SELECT cast(1 as decimal(20, 0)) * cast(1 as string) FROM t +-- !query 248 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) * CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 248 output +1.0 + + +-- !query 249 +SELECT cast(1 as decimal(3, 0)) * cast('1' as binary) FROM t +-- !query 249 schema +struct<> +-- 
!query 249 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) * CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) * CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 250 +SELECT cast(1 as decimal(5, 0)) * cast('1' as binary) FROM t +-- !query 250 schema +struct<> +-- !query 250 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) * CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) * CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 251 +SELECT cast(1 as decimal(10, 0)) * cast('1' as binary) FROM t +-- !query 251 schema +struct<> +-- !query 251 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) * CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) * CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 252 +SELECT cast(1 as decimal(20, 0)) * cast('1' as binary) FROM t +-- !query 252 schema +struct<> +-- !query 252 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) * CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) * CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 253 +SELECT cast(1 as decimal(3, 0)) * cast(1 as boolean) FROM t +-- !query 253 schema +struct<> +-- !query 253 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) * CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) * CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 + + +-- !query 254 +SELECT cast(1 as decimal(5, 0)) * cast(1 as boolean) FROM t +-- !query 254 schema +struct<> +-- !query 254 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) * CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) * CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 + + +-- !query 255 +SELECT cast(1 as decimal(10, 0)) * cast(1 as boolean) FROM t +-- !query 255 schema +struct<> +-- !query 255 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) * CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) * CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 256 +SELECT cast(1 as decimal(20, 0)) * cast(1 as boolean) FROM t +-- !query 256 schema +struct<> +-- !query 256 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) * CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) * CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 + + +-- !query 257 +SELECT cast(1 as decimal(3, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t +-- !query 257 schema +struct<> +-- !query 257 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 258 +SELECT cast(1 as decimal(5, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t +-- !query 258 schema +struct<> +-- !query 258 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 259 +SELECT cast(1 as decimal(10, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t +-- !query 259 schema +struct<> +-- !query 259 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 260 +SELECT cast(1 as decimal(20, 0)) * cast('2017*12*11 09:30:00.0' as timestamp) FROM t +-- !query 260 schema +struct<> +-- !query 260 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) * CAST('2017*12*11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 261 +SELECT cast(1 as decimal(3, 0)) * cast('2017*12*11 09:30:00' as date) FROM t +-- !query 261 schema +struct<> +-- !query 261 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) * CAST('2017*12*11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) * CAST('2017*12*11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 262 +SELECT cast(1 as decimal(5, 0)) * cast('2017*12*11 09:30:00' as date) FROM t +-- !query 262 schema +struct<> +-- !query 262 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) * CAST('2017*12*11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) * CAST('2017*12*11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 263 +SELECT cast(1 as decimal(10, 0)) * cast('2017*12*11 09:30:00' as date) FROM t +-- !query 263 schema +struct<> +-- !query 263 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) * CAST('2017*12*11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) * CAST('2017*12*11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 264 +SELECT cast(1 as decimal(20, 0)) * cast('2017*12*11 09:30:00' as date) FROM t +-- !query 264 schema +struct<> +-- !query 264 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) * CAST('2017*12*11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) * CAST('2017*12*11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 265 +SELECT cast(1 as tinyint) / cast(1 as decimal(3, 0)) FROM t +-- !query 265 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) / CAST(1 AS DECIMAL(3,0))):decimal(9,6)> +-- !query 265 output +1 + + +-- !query 266 +SELECT cast(1 as tinyint) / cast(1 as decimal(5, 0)) FROM t +-- !query 266 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) / CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(9,6)> +-- !query 266 output +1 + + +-- !query 267 +SELECT cast(1 as tinyint) / cast(1 as decimal(10, 0)) FROM t +-- !query 267 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS 
DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,11)> +-- !query 267 output +1 + + +-- !query 268 +SELECT cast(1 as tinyint) / cast(1 as decimal(20, 0)) FROM t +-- !query 268 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(24,21)> +-- !query 268 output +1 + + +-- !query 269 +SELECT cast(1 as smallint) / cast(1 as decimal(3, 0)) FROM t +-- !query 269 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) / CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(11,6)> +-- !query 269 output +1 + + +-- !query 270 +SELECT cast(1 as smallint) / cast(1 as decimal(5, 0)) FROM t +-- !query 270 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) / CAST(1 AS DECIMAL(5,0))):decimal(11,6)> +-- !query 270 output +1 + + +-- !query 271 +SELECT cast(1 as smallint) / cast(1 as decimal(10, 0)) FROM t +-- !query 271 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,11)> +-- !query 271 output +1 + + +-- !query 272 +SELECT cast(1 as smallint) / cast(1 as decimal(20, 0)) FROM t +-- !query 272 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(26,21)> +-- !query 272 output +1 + + +-- !query 273 +SELECT cast(1 as int) / cast(1 as decimal(3, 0)) FROM t +-- !query 273 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(16,6)> +-- !query 273 output +1 + + +-- !query 274 +SELECT cast(1 as int) / cast(1 as decimal(5, 0)) FROM t +-- !query 274 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,6)> +-- !query 274 output +1 + + +-- !query 275 +SELECT cast(1 as int) / cast(1 as decimal(10, 0)) FROM t +-- !query 275 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) / CAST(1 AS DECIMAL(10,0))):decimal(21,11)> +-- !query 275 output +1 + + +-- !query 276 +SELECT cast(1 as int) / cast(1 as decimal(20, 0)) FROM t +-- !query 276 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,21)> +-- !query 276 output +1 + + +-- !query 277 +SELECT cast(1 as bigint) / cast(1 as decimal(3, 0)) FROM t +-- !query 277 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(26,6)> +-- !query 277 output +1 + + +-- !query 278 +SELECT cast(1 as bigint) / cast(1 as decimal(5, 0)) FROM t +-- !query 278 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(26,6)> +-- !query 278 output +1 + + +-- !query 279 +SELECT cast(1 as bigint) / cast(1 as decimal(10, 0)) FROM t +-- !query 279 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,11)> +-- !query 279 output +1 + + +-- !query 280 +SELECT cast(1 as bigint) / cast(1 as decimal(20, 0)) FROM t +-- !query 280 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) / CAST(1 AS DECIMAL(20,0))):decimal(38,19)> +-- !query 280 output +1 + + +-- !query 281 +SELECT cast(1 as float) / cast(1 as decimal(3, 0)) FROM t +-- !query 281 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / 
CAST(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) AS DOUBLE)):double> +-- !query 281 output +1.0 + + +-- !query 282 +SELECT cast(1 as float) / cast(1 as decimal(5, 0)) FROM t +-- !query 282 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) AS DOUBLE)):double> +-- !query 282 output +1.0 + + +-- !query 283 +SELECT cast(1 as float) / cast(1 as decimal(10, 0)) FROM t +-- !query 283 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) AS DOUBLE)):double> +-- !query 283 output +1.0 + + +-- !query 284 +SELECT cast(1 as float) / cast(1 as decimal(20, 0)) FROM t +-- !query 284 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) / CAST(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) AS DOUBLE)):double> +-- !query 284 output +1.0 + + +-- !query 285 +SELECT cast(1 as double) / cast(1 as decimal(3, 0)) FROM t +-- !query 285 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> +-- !query 285 output +1.0 + + +-- !query 286 +SELECT cast(1 as double) / cast(1 as decimal(5, 0)) FROM t +-- !query 286 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> +-- !query 286 output +1.0 + + +-- !query 287 +SELECT cast(1 as double) / cast(1 as decimal(10, 0)) FROM t +-- !query 287 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 287 output +1.0 + + +-- !query 288 +SELECT cast(1 as double) / cast(1 as decimal(20, 0)) FROM t +-- !query 288 schema +struct<(CAST(1 AS DOUBLE) / CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> +-- !query 288 output +1.0 + + +-- !query 289 +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(3, 0)) FROM t +-- !query 289 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(16,6)> +-- !query 289 output +1 + + +-- !query 290 +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(5, 0)) FROM t +-- !query 290 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,6)> +-- !query 290 output +1 + + +-- !query 291 +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(10, 0)) FROM t +-- !query 291 schema +struct<(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS DECIMAL(10,0))):decimal(21,11)> +-- !query 291 output +1 + + +-- !query 292 +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(20, 0)) FROM t +-- !query 292 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,21)> +-- !query 292 output +1 + + +-- !query 293 +SELECT cast('1' as binary) / cast(1 as decimal(3, 0)) FROM t +-- !query 293 schema +struct<> +-- !query 293 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 294 +SELECT cast('1' as binary) / cast(1 as decimal(5, 0)) FROM t +-- !query 294 schema +struct<> +-- !query 294 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 295 +SELECT cast('1' as binary) / cast(1 as decimal(10, 0)) FROM t +-- !query 295 schema +struct<> +-- !query 295 output +org.apache.spark.sql.AnalysisException 
+cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 296 +SELECT cast('1' as binary) / cast(1 as decimal(20, 0)) FROM t +-- !query 296 schema +struct<> +-- !query 296 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) / CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 297 +SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(3, 0)) FROM t +-- !query 297 schema +struct<> +-- !query 297 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 298 +SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(5, 0)) FROM t +-- !query 298 schema +struct<> +-- !query 298 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 299 +SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(10, 0)) FROM t +-- !query 299 schema +struct<> +-- !query 299 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 300 +SELECT cast('2017/12/11 09:30:00.0' as timestamp) / cast(1 as decimal(20, 0)) FROM t +-- !query 300 schema +struct<> +-- !query 300 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00.0' AS TIMESTAMP) / CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 301 +SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(3, 0)) FROM t +-- !query 301 schema +struct<> +-- !query 301 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 302 +SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(5, 0)) FROM t +-- !query 302 schema +struct<> +-- !query 302 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 303 +SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(10, 0)) FROM t +-- !query 303 schema +struct<> +-- !query 303 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: 
differing types in '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 304 +SELECT cast('2017/12/11 09:30:00' as date) / cast(1 as decimal(20, 0)) FROM t +-- !query 304 schema +struct<> +-- !query 304 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017/12/11 09:30:00' AS DATE) / CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 305 +SELECT cast(1 as decimal(3, 0)) / cast(1 as tinyint) FROM t +-- !query 305 schema +struct<(CAST(1 AS DECIMAL(3,0)) / CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):decimal(9,6)> +-- !query 305 output +1 + + +-- !query 306 +SELECT cast(1 as decimal(5, 0)) / cast(1 as tinyint) FROM t +-- !query 306 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) / CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(11,6)> +-- !query 306 output +1 + + +-- !query 307 +SELECT cast(1 as decimal(10, 0)) / cast(1 as tinyint) FROM t +-- !query 307 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(16,6)> +-- !query 307 output +1 + + +-- !query 308 +SELECT cast(1 as decimal(20, 0)) / cast(1 as tinyint) FROM t +-- !query 308 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(26,6)> +-- !query 308 output +1 + + +-- !query 309 +SELECT cast(1 as decimal(3, 0)) / cast(1 as smallint) FROM t +-- !query 309 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) / CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(9,6)> +-- !query 309 output +1 + + +-- !query 310 +SELECT cast(1 as decimal(5, 0)) / cast(1 as smallint) FROM t +-- !query 310 schema +struct<(CAST(1 AS DECIMAL(5,0)) / CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):decimal(11,6)> +-- !query 310 output +1 + + +-- !query 311 +SELECT cast(1 as decimal(10, 0)) / cast(1 as smallint) FROM t +-- !query 311 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(16,6)> +-- !query 311 output +1 + + +-- !query 312 +SELECT cast(1 as decimal(20, 0)) / cast(1 as smallint) FROM t +-- !query 312 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(26,6)> +-- !query 312 output +1 + + +-- !query 313 +SELECT cast(1 as decimal(3, 0)) / cast(1 as int) FROM t +-- !query 313 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,11)> +-- !query 313 output +1 + + +-- !query 314 +SELECT cast(1 as decimal(5, 0)) / cast(1 as int) FROM t +-- !query 314 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) / CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,11)> +-- !query 314 output +1 + + +-- !query 315 +SELECT cast(1 as decimal(10, 0)) / cast(1 as int) FROM t +-- !query 315 schema +struct<(CAST(1 AS DECIMAL(10,0)) / CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(21,11)> +-- !query 315 output +1 + + +-- !query 316 +SELECT cast(1 as decimal(20, 0)) / cast(1 as int) FROM t +-- !query 316 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS 
DECIMAL(20,0))):decimal(31,11)> +-- !query 316 output +1 + + +-- !query 317 +SELECT cast(1 as decimal(3, 0)) / cast(1 as bigint) FROM t +-- !query 317 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(24,21)> +-- !query 317 output +1 + + +-- !query 318 +SELECT cast(1 as decimal(5, 0)) / cast(1 as bigint) FROM t +-- !query 318 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(26,21)> +-- !query 318 output +1 + + +-- !query 319 +SELECT cast(1 as decimal(10, 0)) / cast(1 as bigint) FROM t +-- !query 319 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) / CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(31,21)> +-- !query 319 output +1 + + +-- !query 320 +SELECT cast(1 as decimal(20, 0)) / cast(1 as bigint) FROM t +-- !query 320 schema +struct<(CAST(1 AS DECIMAL(20,0)) / CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(38,19)> +-- !query 320 output +1 + + +-- !query 321 +SELECT cast(1 as decimal(3, 0)) / cast(1 as float) FROM t +-- !query 321 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 321 output +1.0 + + +-- !query 322 +SELECT cast(1 as decimal(5, 0)) / cast(1 as float) FROM t +-- !query 322 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 322 output +1.0 + + +-- !query 323 +SELECT cast(1 as decimal(10, 0)) / cast(1 as float) FROM t +-- !query 323 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 323 output +1.0 + + +-- !query 324 +SELECT cast(1 as decimal(20, 0)) / cast(1 as float) FROM t +-- !query 324 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) / CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 324 output +1.0 + + +-- !query 325 +SELECT cast(1 as decimal(3, 0)) / cast(1 as double) FROM t +-- !query 325 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 325 output +1.0 + + +-- !query 326 +SELECT cast(1 as decimal(5, 0)) / cast(1 as double) FROM t +-- !query 326 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 326 output +1.0 + + +-- !query 327 +SELECT cast(1 as decimal(10, 0)) / cast(1 as double) FROM t +-- !query 327 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 327 output +1.0 + + +-- !query 328 +SELECT cast(1 as decimal(20, 0)) / cast(1 as double) FROM t +-- !query 328 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) / CAST(1 AS DOUBLE)):double> +-- !query 328 output +1.0 + + +-- !query 329 +SELECT cast(1 as decimal(3, 0)) / cast(1 as decimal(10, 0)) FROM t +-- !query 329 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(14,11)> +-- !query 329 output +1 + + +-- !query 330 +SELECT cast(1 as decimal(5, 0)) / cast(1 as decimal(10, 0)) FROM t +-- !query 330 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(16,11)> +-- !query 330 output +1 + + +-- !query 331 +SELECT cast(1 as decimal(10, 0)) / cast(1 as decimal(10, 0)) FROM t +-- !query 331 schema +struct<(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS DECIMAL(10,0))):decimal(21,11)> +-- !query 331 output +1 + + +-- !query 332 +SELECT cast(1 as 
decimal(20, 0)) / cast(1 as decimal(10, 0)) FROM t +-- !query 332 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) / CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(31,11)> +-- !query 332 output +1 + + +-- !query 333 +SELECT cast(1 as decimal(3, 0)) / cast(1 as string) FROM t +-- !query 333 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 333 output +1.0 + + +-- !query 334 +SELECT cast(1 as decimal(5, 0)) / cast(1 as string) FROM t +-- !query 334 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 334 output +1.0 + + +-- !query 335 +SELECT cast(1 as decimal(10, 0)) / cast(1 as string) FROM t +-- !query 335 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 335 output +1.0 + + +-- !query 336 +SELECT cast(1 as decimal(20, 0)) / cast(1 as string) FROM t +-- !query 336 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) / CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 336 output +1.0 + + +-- !query 337 +SELECT cast(1 as decimal(3, 0)) / cast('1' as binary) FROM t +-- !query 337 schema +struct<> +-- !query 337 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) / CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 338 +SELECT cast(1 as decimal(5, 0)) / cast('1' as binary) FROM t +-- !query 338 schema +struct<> +-- !query 338 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) / CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 339 +SELECT cast(1 as decimal(10, 0)) / cast('1' as binary) FROM t +-- !query 339 schema +struct<> +-- !query 339 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 340 +SELECT cast(1 as decimal(20, 0)) / cast('1' as binary) FROM t +-- !query 340 schema +struct<> +-- !query 340 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) / CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) / CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 341 +SELECT cast(1 as decimal(3, 0)) / cast(1 as boolean) FROM t +-- !query 341 schema +struct<> +-- !query 341 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) / CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 + + +-- !query 342 +SELECT cast(1 as decimal(5, 0)) / cast(1 as boolean) FROM t +-- !query 342 schema +struct<> +-- !query 342 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) / CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 + + +-- !query 343 +SELECT cast(1 as decimal(10, 0)) / cast(1 as boolean) FROM t +-- !query 343 schema +struct<> +-- !query 343 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 344 +SELECT cast(1 as decimal(20, 0)) / cast(1 as boolean) FROM t +-- !query 344 schema +struct<> +-- !query 344 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) / CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) / CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 + + +-- !query 345 +SELECT cast(1 as decimal(3, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t +-- !query 345 schema +struct<> +-- !query 345 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 346 +SELECT cast(1 as decimal(5, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t +-- !query 346 schema +struct<> +-- !query 346 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 347 +SELECT cast(1 as decimal(10, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t +-- !query 347 schema +struct<> +-- !query 347 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 348 +SELECT cast(1 as decimal(20, 0)) / cast('2017/12/11 09:30:00.0' as timestamp) FROM t +-- !query 348 schema +struct<> +-- !query 348 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) / CAST('2017/12/11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 349 +SELECT cast(1 as decimal(3, 0)) / cast('2017/12/11 09:30:00' as date) FROM t +-- !query 349 schema +struct<> +-- !query 349 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) / CAST('2017/12/11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) / CAST('2017/12/11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 350 +SELECT cast(1 as decimal(5, 0)) / cast('2017/12/11 09:30:00' as date) FROM t +-- !query 350 schema +struct<> +-- !query 350 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) / CAST('2017/12/11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) / CAST('2017/12/11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 351 +SELECT cast(1 as decimal(10, 0)) / cast('2017/12/11 09:30:00' as date) FROM t +-- !query 351 schema +struct<> +-- !query 351 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) / CAST('2017/12/11 09:30:00' AS 
DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) / CAST('2017/12/11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 352 +SELECT cast(1 as decimal(20, 0)) / cast('2017/12/11 09:30:00' as date) FROM t +-- !query 352 schema +struct<> +-- !query 352 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) / CAST('2017/12/11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) / CAST('2017/12/11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 353 +SELECT cast(1 as tinyint) % cast(1 as decimal(3, 0)) FROM t +-- !query 353 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) % CAST(1 AS DECIMAL(3,0))):decimal(3,0)> +-- !query 353 output +0 + + +-- !query 354 +SELECT cast(1 as tinyint) % cast(1 as decimal(5, 0)) FROM t +-- !query 354 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) % CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(3,0)> +-- !query 354 output +0 + + +-- !query 355 +SELECT cast(1 as tinyint) % cast(1 as decimal(10, 0)) FROM t +-- !query 355 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(3,0)> +-- !query 355 output +0 + + +-- !query 356 +SELECT cast(1 as tinyint) % cast(1 as decimal(20, 0)) FROM t +-- !query 356 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(3,0)> +-- !query 356 output +0 + + +-- !query 357 +SELECT cast(1 as smallint) % cast(1 as decimal(3, 0)) FROM t +-- !query 357 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) % CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(3,0)> +-- !query 357 output +0 + + +-- !query 358 +SELECT cast(1 as smallint) % cast(1 as decimal(5, 0)) FROM t +-- !query 358 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) % CAST(1 AS DECIMAL(5,0))):decimal(5,0)> +-- !query 358 output +0 + + +-- !query 359 +SELECT cast(1 as smallint) % cast(1 as decimal(10, 0)) FROM t +-- !query 359 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(5,0)> +-- !query 359 output +0 + + +-- !query 360 +SELECT cast(1 as smallint) % cast(1 as decimal(20, 0)) FROM t +-- !query 360 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(5,0)> +-- !query 360 output +0 + + +-- !query 361 +SELECT cast(1 as int) % cast(1 as decimal(3, 0)) FROM t +-- !query 361 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(3,0)> +-- !query 361 output +0 + + +-- !query 362 +SELECT cast(1 as int) % cast(1 as decimal(5, 0)) FROM t +-- !query 362 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(5,0)> +-- !query 362 output +0 + + +-- !query 363 +SELECT cast(1 as int) % cast(1 as decimal(10, 0)) FROM t +-- !query 363 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) % CAST(1 AS DECIMAL(10,0))):decimal(10,0)> +-- !query 363 output +0 + + +-- !query 364 +SELECT cast(1 as int) % cast(1 as decimal(20, 0)) FROM t +-- !query 364 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(20,0)) AS 
DECIMAL(20,0))):decimal(10,0)> +-- !query 364 output +0 + + +-- !query 365 +SELECT cast(1 as bigint) % cast(1 as decimal(3, 0)) FROM t +-- !query 365 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(3,0)> +-- !query 365 output +0 + + +-- !query 366 +SELECT cast(1 as bigint) % cast(1 as decimal(5, 0)) FROM t +-- !query 366 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(5,0)> +-- !query 366 output +0 + + +-- !query 367 +SELECT cast(1 as bigint) % cast(1 as decimal(10, 0)) FROM t +-- !query 367 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(10,0)> +-- !query 367 output +0 + + +-- !query 368 +SELECT cast(1 as bigint) % cast(1 as decimal(20, 0)) FROM t +-- !query 368 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) % CAST(1 AS DECIMAL(20,0))):decimal(20,0)> +-- !query 368 output +0 + + +-- !query 369 +SELECT cast(1 as float) % cast(1 as decimal(3, 0)) FROM t +-- !query 369 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) % CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> +-- !query 369 output +0.0 + + +-- !query 370 +SELECT cast(1 as float) % cast(1 as decimal(5, 0)) FROM t +-- !query 370 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) % CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> +-- !query 370 output +0.0 + + +-- !query 371 +SELECT cast(1 as float) % cast(1 as decimal(10, 0)) FROM t +-- !query 371 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) % CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 371 output +0.0 + + +-- !query 372 +SELECT cast(1 as float) % cast(1 as decimal(20, 0)) FROM t +-- !query 372 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) % CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> +-- !query 372 output +0.0 + + +-- !query 373 +SELECT cast(1 as double) % cast(1 as decimal(3, 0)) FROM t +-- !query 373 schema +struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):double> +-- !query 373 output +0.0 + + +-- !query 374 +SELECT cast(1 as double) % cast(1 as decimal(5, 0)) FROM t +-- !query 374 schema +struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):double> +-- !query 374 output +0.0 + + +-- !query 375 +SELECT cast(1 as double) % cast(1 as decimal(10, 0)) FROM t +-- !query 375 schema +struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):double> +-- !query 375 output +0.0 + + +-- !query 376 +SELECT cast(1 as double) % cast(1 as decimal(20, 0)) FROM t +-- !query 376 schema +struct<(CAST(1 AS DOUBLE) % CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):double> +-- !query 376 output +0.0 + + +-- !query 377 +SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(3, 0)) FROM t +-- !query 377 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(3,0)> +-- !query 377 output +0 + + +-- !query 378 +SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(5, 0)) FROM t +-- !query 378 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(5,0)> +-- !query 378 output +0 + + +-- !query 379 +SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(10, 0)) FROM t +-- !query 379 schema +struct<(CAST(1 AS DECIMAL(10,0)) % CAST(1 AS DECIMAL(10,0))):decimal(10,0)> +-- !query 379 output +0 + + +-- !query 380 +SELECT cast(1 as decimal(10, 0)) 
% cast(1 as decimal(20, 0)) FROM t +-- !query 380 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(10,0)> +-- !query 380 output +0 + + +-- !query 381 +SELECT cast('1' as binary) % cast(1 as decimal(3, 0)) FROM t +-- !query 381 schema +struct<> +-- !query 381 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 382 +SELECT cast('1' as binary) % cast(1 as decimal(5, 0)) FROM t +-- !query 382 schema +struct<> +-- !query 382 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 383 +SELECT cast('1' as binary) % cast(1 as decimal(10, 0)) FROM t +-- !query 383 schema +struct<> +-- !query 383 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 384 +SELECT cast('1' as binary) % cast(1 as decimal(20, 0)) FROM t +-- !query 384 schema +struct<> +-- !query 384 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) % CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 385 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(3, 0)) FROM t +-- !query 385 schema +struct<> +-- !query 385 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 386 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(5, 0)) FROM t +-- !query 386 schema +struct<> +-- !query 386 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 387 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(10, 0)) FROM t +-- !query 387 schema +struct<> +-- !query 387 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 388 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) % cast(1 as decimal(20, 0)) FROM t +-- !query 388 schema +struct<> +-- !query 388 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) % CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 389 
+SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(3, 0)) FROM t +-- !query 389 schema +struct<> +-- !query 389 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 390 +SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(5, 0)) FROM t +-- !query 390 schema +struct<> +-- !query 390 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 391 +SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(10, 0)) FROM t +-- !query 391 schema +struct<> +-- !query 391 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 392 +SELECT cast('2017-12-11 09:30:00' as date) % cast(1 as decimal(20, 0)) FROM t +-- !query 392 schema +struct<> +-- !query 392 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) % CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 393 +SELECT cast(1 as decimal(3, 0)) % cast(1 as tinyint) FROM t +-- !query 393 schema +struct<(CAST(1 AS DECIMAL(3,0)) % CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):decimal(3,0)> +-- !query 393 output +0 + + +-- !query 394 +SELECT cast(1 as decimal(5, 0)) % cast(1 as tinyint) FROM t +-- !query 394 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) % CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):decimal(3,0)> +-- !query 394 output +0 + + +-- !query 395 +SELECT cast(1 as decimal(10, 0)) % cast(1 as tinyint) FROM t +-- !query 395 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):decimal(3,0)> +-- !query 395 output +0 + + +-- !query 396 +SELECT cast(1 as decimal(20, 0)) % cast(1 as tinyint) FROM t +-- !query 396 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):decimal(3,0)> +-- !query 396 output +0 + + +-- !query 397 +SELECT cast(1 as decimal(3, 0)) % cast(1 as smallint) FROM t +-- !query 397 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) % CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):decimal(3,0)> +-- !query 397 output +0 + + +-- !query 398 +SELECT cast(1 as decimal(5, 0)) % cast(1 as smallint) FROM t +-- !query 398 schema +struct<(CAST(1 AS DECIMAL(5,0)) % CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):decimal(5,0)> +-- !query 398 output +0 + + +-- !query 399 +SELECT cast(1 as decimal(10, 0)) % cast(1 as smallint) FROM t +-- !query 399 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) % CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):decimal(5,0)> +-- !query 399 output +0 + + +-- !query 400 +SELECT cast(1 as decimal(20, 0)) % cast(1 as smallint) FROM t +-- !query 400 schema 
+struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):decimal(5,0)> +-- !query 400 output +0 + + +-- !query 401 +SELECT cast(1 as decimal(3, 0)) % cast(1 as int) FROM t +-- !query 401 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) % CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(3,0)> +-- !query 401 output +0 + + +-- !query 402 +SELECT cast(1 as decimal(5, 0)) % cast(1 as int) FROM t +-- !query 402 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) % CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(5,0)> +-- !query 402 output +0 + + +-- !query 403 +SELECT cast(1 as decimal(10, 0)) % cast(1 as int) FROM t +-- !query 403 schema +struct<(CAST(1 AS DECIMAL(10,0)) % CAST(CAST(1 AS INT) AS DECIMAL(10,0))):decimal(10,0)> +-- !query 403 output +0 + + +-- !query 404 +SELECT cast(1 as decimal(20, 0)) % cast(1 as int) FROM t +-- !query 404 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(10,0)> +-- !query 404 output +0 + + +-- !query 405 +SELECT cast(1 as decimal(3, 0)) % cast(1 as bigint) FROM t +-- !query 405 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(3,0)> +-- !query 405 output +0 + + +-- !query 406 +SELECT cast(1 as decimal(5, 0)) % cast(1 as bigint) FROM t +-- !query 406 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(5,0)> +-- !query 406 output +0 + + +-- !query 407 +SELECT cast(1 as decimal(10, 0)) % cast(1 as bigint) FROM t +-- !query 407 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) % CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):decimal(10,0)> +-- !query 407 output +0 + + +-- !query 408 +SELECT cast(1 as decimal(20, 0)) % cast(1 as bigint) FROM t +-- !query 408 schema +struct<(CAST(1 AS DECIMAL(20,0)) % CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):decimal(20,0)> +-- !query 408 output +0 + + +-- !query 409 +SELECT cast(1 as decimal(3, 0)) % cast(1 as float) FROM t +-- !query 409 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) % CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 409 output +0.0 + + +-- !query 410 +SELECT cast(1 as decimal(5, 0)) % cast(1 as float) FROM t +-- !query 410 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) % CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 410 output +0.0 + + +-- !query 411 +SELECT cast(1 as decimal(10, 0)) % cast(1 as float) FROM t +-- !query 411 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) % CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 411 output +0.0 + + +-- !query 412 +SELECT cast(1 as decimal(20, 0)) % cast(1 as float) FROM t +-- !query 412 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) % CAST(CAST(1 AS FLOAT) AS DOUBLE)):double> +-- !query 412 output +0.0 + + +-- !query 413 +SELECT cast(1 as decimal(3, 0)) % cast(1 as double) FROM t +-- !query 413 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 413 output +0.0 + + +-- !query 414 +SELECT cast(1 as decimal(5, 0)) % cast(1 as double) FROM t +-- !query 414 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 414 output +0.0 + + +-- !query 415 +SELECT cast(1 as decimal(10, 0)) % cast(1 as double) FROM t +-- !query 415 
schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 415 output +0.0 + + +-- !query 416 +SELECT cast(1 as decimal(20, 0)) % cast(1 as double) FROM t +-- !query 416 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) % CAST(1 AS DOUBLE)):double> +-- !query 416 output +0.0 + + +-- !query 417 +SELECT cast(1 as decimal(3, 0)) % cast(1 as decimal(10, 0)) FROM t +-- !query 417 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(3,0)> +-- !query 417 output +0 + + +-- !query 418 +SELECT cast(1 as decimal(5, 0)) % cast(1 as decimal(10, 0)) FROM t +-- !query 418 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):decimal(5,0)> +-- !query 418 output +0 + + +-- !query 419 +SELECT cast(1 as decimal(10, 0)) % cast(1 as decimal(10, 0)) FROM t +-- !query 419 schema +struct<(CAST(1 AS DECIMAL(10,0)) % CAST(1 AS DECIMAL(10,0))):decimal(10,0)> +-- !query 419 output +0 + + +-- !query 420 +SELECT cast(1 as decimal(20, 0)) % cast(1 as decimal(10, 0)) FROM t +-- !query 420 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) % CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):decimal(10,0)> +-- !query 420 output +0 + + +-- !query 421 +SELECT cast(1 as decimal(3, 0)) % cast(1 as string) FROM t +-- !query 421 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) % CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 421 output +0.0 + + +-- !query 422 +SELECT cast(1 as decimal(5, 0)) % cast(1 as string) FROM t +-- !query 422 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) % CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 422 output +0.0 + + +-- !query 423 +SELECT cast(1 as decimal(10, 0)) % cast(1 as string) FROM t +-- !query 423 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) % CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 423 output +0.0 + + +-- !query 424 +SELECT cast(1 as decimal(20, 0)) % cast(1 as string) FROM t +-- !query 424 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) % CAST(CAST(1 AS STRING) AS DOUBLE)):double> +-- !query 424 output +0.0 + + +-- !query 425 +SELECT cast(1 as decimal(3, 0)) % cast('1' as binary) FROM t +-- !query 425 schema +struct<> +-- !query 425 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) % CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) % CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 426 +SELECT cast(1 as decimal(5, 0)) % cast('1' as binary) FROM t +-- !query 426 schema +struct<> +-- !query 426 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) % CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) % CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 427 +SELECT cast(1 as decimal(10, 0)) % cast('1' as binary) FROM t +-- !query 427 schema +struct<> +-- !query 427 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) % CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) % CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 428 +SELECT cast(1 as decimal(20, 0)) % cast('1' as binary) FROM t +-- !query 428 schema +struct<> +-- !query 428 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) % 
CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) % CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 429 +SELECT cast(1 as decimal(3, 0)) % cast(1 as boolean) FROM t +-- !query 429 schema +struct<> +-- !query 429 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) % CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) % CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 + + +-- !query 430 +SELECT cast(1 as decimal(5, 0)) % cast(1 as boolean) FROM t +-- !query 430 schema +struct<> +-- !query 430 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) % CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) % CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 + + +-- !query 431 +SELECT cast(1 as decimal(10, 0)) % cast(1 as boolean) FROM t +-- !query 431 schema +struct<> +-- !query 431 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) % CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) % CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 432 +SELECT cast(1 as decimal(20, 0)) % cast(1 as boolean) FROM t +-- !query 432 schema +struct<> +-- !query 432 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) % CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) % CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 + + +-- !query 433 +SELECT cast(1 as decimal(3, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 433 schema +struct<> +-- !query 433 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 434 +SELECT cast(1 as decimal(5, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 434 schema +struct<> +-- !query 434 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 435 +SELECT cast(1 as decimal(10, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 435 schema +struct<> +-- !query 435 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 436 +SELECT cast(1 as decimal(20, 0)) % cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 436 schema +struct<> +-- !query 436 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) % CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 437 +SELECT cast(1 as decimal(3, 0)) % cast('2017-12-11 
09:30:00' as date) FROM t +-- !query 437 schema +struct<> +-- !query 437 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) % CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) % CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 438 +SELECT cast(1 as decimal(5, 0)) % cast('2017-12-11 09:30:00' as date) FROM t +-- !query 438 schema +struct<> +-- !query 438 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) % CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) % CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 439 +SELECT cast(1 as decimal(10, 0)) % cast('2017-12-11 09:30:00' as date) FROM t +-- !query 439 schema +struct<> +-- !query 439 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) % CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) % CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 440 +SELECT cast(1 as decimal(20, 0)) % cast('2017-12-11 09:30:00' as date) FROM t +-- !query 440 schema +struct<> +-- !query 440 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) % CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) % CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 441 +SELECT pmod(cast(1 as tinyint), cast(1 as decimal(3, 0))) FROM t +-- !query 441 schema +struct +-- !query 441 output +0 + + +-- !query 442 +SELECT pmod(cast(1 as tinyint), cast(1 as decimal(5, 0))) FROM t +-- !query 442 schema +struct +-- !query 442 output +0 + + +-- !query 443 +SELECT pmod(cast(1 as tinyint), cast(1 as decimal(10, 0))) FROM t +-- !query 443 schema +struct +-- !query 443 output +0 + + +-- !query 444 +SELECT pmod(cast(1 as tinyint), cast(1 as decimal(20, 0))) FROM t +-- !query 444 schema +struct +-- !query 444 output +0 + + +-- !query 445 +SELECT pmod(cast(1 as smallint), cast(1 as decimal(3, 0))) FROM t +-- !query 445 schema +struct +-- !query 445 output +0 + + +-- !query 446 +SELECT pmod(cast(1 as smallint), cast(1 as decimal(5, 0))) FROM t +-- !query 446 schema +struct +-- !query 446 output +0 + + +-- !query 447 +SELECT pmod(cast(1 as smallint), cast(1 as decimal(10, 0))) FROM t +-- !query 447 schema +struct +-- !query 447 output +0 + + +-- !query 448 +SELECT pmod(cast(1 as smallint), cast(1 as decimal(20, 0))) FROM t +-- !query 448 schema +struct +-- !query 448 output +0 + + +-- !query 449 +SELECT pmod(cast(1 as int), cast(1 as decimal(3, 0))) FROM t +-- !query 449 schema +struct +-- !query 449 output +0 + + +-- !query 450 +SELECT pmod(cast(1 as int), cast(1 as decimal(5, 0))) FROM t +-- !query 450 schema +struct +-- !query 450 output +0 + + +-- !query 451 +SELECT pmod(cast(1 as int), cast(1 as decimal(10, 0))) FROM t +-- !query 451 schema +struct +-- !query 451 output +0 + + +-- !query 452 +SELECT pmod(cast(1 as int), cast(1 as decimal(20, 0))) FROM t +-- !query 452 schema +struct +-- !query 452 output +0 + + +-- !query 453 +SELECT pmod(cast(1 as bigint), cast(1 as decimal(3, 0))) FROM t +-- !query 453 schema +struct +-- !query 453 output +0 + + +-- !query 454 +SELECT pmod(cast(1 as bigint), cast(1 as decimal(5, 0))) FROM t +-- !query 454 schema 
+struct +-- !query 454 output +0 + + +-- !query 455 +SELECT pmod(cast(1 as bigint), cast(1 as decimal(10, 0))) FROM t +-- !query 455 schema +struct +-- !query 455 output +0 + + +-- !query 456 +SELECT pmod(cast(1 as bigint), cast(1 as decimal(20, 0))) FROM t +-- !query 456 schema +struct +-- !query 456 output +0 + + +-- !query 457 +SELECT pmod(cast(1 as float), cast(1 as decimal(3, 0))) FROM t +-- !query 457 schema +struct +-- !query 457 output +0.0 + + +-- !query 458 +SELECT pmod(cast(1 as float), cast(1 as decimal(5, 0))) FROM t +-- !query 458 schema +struct +-- !query 458 output +0.0 + + +-- !query 459 +SELECT pmod(cast(1 as float), cast(1 as decimal(10, 0))) FROM t +-- !query 459 schema +struct +-- !query 459 output +0.0 + + +-- !query 460 +SELECT pmod(cast(1 as float), cast(1 as decimal(20, 0))) FROM t +-- !query 460 schema +struct +-- !query 460 output +0.0 + + +-- !query 461 +SELECT pmod(cast(1 as double), cast(1 as decimal(3, 0))) FROM t +-- !query 461 schema +struct +-- !query 461 output +0.0 + + +-- !query 462 +SELECT pmod(cast(1 as double), cast(1 as decimal(5, 0))) FROM t +-- !query 462 schema +struct +-- !query 462 output +0.0 + + +-- !query 463 +SELECT pmod(cast(1 as double), cast(1 as decimal(10, 0))) FROM t +-- !query 463 schema +struct +-- !query 463 output +0.0 + + +-- !query 464 +SELECT pmod(cast(1 as double), cast(1 as decimal(20, 0))) FROM t +-- !query 464 schema +struct +-- !query 464 output +0.0 + + +-- !query 465 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(3, 0))) FROM t +-- !query 465 schema +struct +-- !query 465 output +0 + + +-- !query 466 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(5, 0))) FROM t +-- !query 466 schema +struct +-- !query 466 output +0 + + +-- !query 467 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(10, 0))) FROM t +-- !query 467 schema +struct +-- !query 467 output +0 + + +-- !query 468 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(20, 0))) FROM t +-- !query 468 schema +struct +-- !query 468 output +0 + + +-- !query 469 +SELECT pmod(cast('1' as binary), cast(1 as decimal(3, 0))) FROM t +-- !query 469 schema +struct<> +-- !query 469 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 470 +SELECT pmod(cast('1' as binary), cast(1 as decimal(5, 0))) FROM t +-- !query 470 schema +struct<> +-- !query 470 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 471 +SELECT pmod(cast('1' as binary), cast(1 as decimal(10, 0))) FROM t +-- !query 471 schema +struct<> +-- !query 471 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 472 +SELECT pmod(cast('1' as binary), cast(1 as decimal(20, 0))) FROM t +-- !query 472 schema +struct<> +-- !query 472 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in 'pmod(CAST('1' AS BINARY), CAST(1 AS DECIMAL(20,0)))' 
(binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 473 +SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(3, 0))) FROM t +-- !query 473 schema +struct<> +-- !query 473 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 474 +SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(5, 0))) FROM t +-- !query 474 schema +struct<> +-- !query 474 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 475 +SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(10, 0))) FROM t +-- !query 475 schema +struct<> +-- !query 475 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 476 +SELECT pmod(cast('2017-12-11 09:30:00.0' as timestamp), cast(1 as decimal(20, 0))) FROM t +-- !query 476 schema +struct<> +-- !query 476 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 477 +SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(3, 0))) FROM t +-- !query 477 schema +struct<> +-- !query 477 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 478 +SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(5, 0))) FROM t +-- !query 478 schema +struct<> +-- !query 478 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 479 +SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(10, 0))) FROM t +-- !query 479 schema +struct<> +-- !query 479 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 480 +SELECT pmod(cast('2017-12-11 09:30:00' as date), cast(1 as decimal(20, 0))) FROM t +-- !query 480 schema +struct<> +-- !query 480 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST('2017-12-11 09:30:00' AS DATE), CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in 'pmod(CAST('2017-12-11 
09:30:00' AS DATE), CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 481 +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as tinyint)) FROM t +-- !query 481 schema +struct +-- !query 481 output +0 + + +-- !query 482 +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as tinyint)) FROM t +-- !query 482 schema +struct +-- !query 482 output +0 + + +-- !query 483 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as tinyint)) FROM t +-- !query 483 schema +struct +-- !query 483 output +0 + + +-- !query 484 +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as tinyint)) FROM t +-- !query 484 schema +struct +-- !query 484 output +0 + + +-- !query 485 +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as smallint)) FROM t +-- !query 485 schema +struct +-- !query 485 output +0 + + +-- !query 486 +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as smallint)) FROM t +-- !query 486 schema +struct +-- !query 486 output +0 + + +-- !query 487 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as smallint)) FROM t +-- !query 487 schema +struct +-- !query 487 output +0 + + +-- !query 488 +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as smallint)) FROM t +-- !query 488 schema +struct +-- !query 488 output +0 + + +-- !query 489 +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as int)) FROM t +-- !query 489 schema +struct +-- !query 489 output +0 + + +-- !query 490 +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as int)) FROM t +-- !query 490 schema +struct +-- !query 490 output +0 + + +-- !query 491 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as int)) FROM t +-- !query 491 schema +struct +-- !query 491 output +0 + + +-- !query 492 +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as int)) FROM t +-- !query 492 schema +struct +-- !query 492 output +0 + + +-- !query 493 +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as bigint)) FROM t +-- !query 493 schema +struct +-- !query 493 output +0 + + +-- !query 494 +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as bigint)) FROM t +-- !query 494 schema +struct +-- !query 494 output +0 + + +-- !query 495 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as bigint)) FROM t +-- !query 495 schema +struct +-- !query 495 output +0 + + +-- !query 496 +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as bigint)) FROM t +-- !query 496 schema +struct +-- !query 496 output +0 + + +-- !query 497 +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as float)) FROM t +-- !query 497 schema +struct +-- !query 497 output +0.0 + + +-- !query 498 +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as float)) FROM t +-- !query 498 schema +struct +-- !query 498 output +0.0 + + +-- !query 499 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as float)) FROM t +-- !query 499 schema +struct +-- !query 499 output +0.0 + + +-- !query 500 +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as float)) FROM t +-- !query 500 schema +struct +-- !query 500 output +0.0 + + +-- !query 501 +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as double)) FROM t +-- !query 501 schema +struct +-- !query 501 output +0.0 + + +-- !query 502 +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as double)) FROM t +-- !query 502 schema +struct +-- !query 502 output +0.0 + + +-- !query 503 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as double)) FROM t +-- !query 503 schema +struct +-- !query 503 output +0.0 + + +-- !query 504 +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as double)) FROM t +-- !query 504 schema +struct +-- !query 504 output +0.0 + + +-- !query 505 +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as decimal(10, 0))) FROM t 
+-- !query 505 schema +struct +-- !query 505 output +0 + + +-- !query 506 +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as decimal(10, 0))) FROM t +-- !query 506 schema +struct +-- !query 506 output +0 + + +-- !query 507 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as decimal(10, 0))) FROM t +-- !query 507 schema +struct +-- !query 507 output +0 + + +-- !query 508 +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as decimal(10, 0))) FROM t +-- !query 508 schema +struct +-- !query 508 output +0 + + +-- !query 509 +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as string)) FROM t +-- !query 509 schema +struct +-- !query 509 output +0.0 + + +-- !query 510 +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as string)) FROM t +-- !query 510 schema +struct +-- !query 510 output +0.0 + + +-- !query 511 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as string)) FROM t +-- !query 511 schema +struct +-- !query 511 output +0.0 + + +-- !query 512 +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as string)) FROM t +-- !query 512 schema +struct +-- !query 512 output +0.0 + + +-- !query 513 +SELECT pmod(cast(1 as decimal(3, 0)) , cast('1' as binary)) FROM t +-- !query 513 schema +struct<> +-- !query 513 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(3,0)), CAST('1' AS BINARY))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(3,0)), CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 514 +SELECT pmod(cast(1 as decimal(5, 0)) , cast('1' as binary)) FROM t +-- !query 514 schema +struct<> +-- !query 514 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('1' AS BINARY))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 515 +SELECT pmod(cast(1 as decimal(10, 0)), cast('1' as binary)) FROM t +-- !query 515 schema +struct<> +-- !query 515 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('1' AS BINARY))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 516 +SELECT pmod(cast(1 as decimal(20, 0)), cast('1' as binary)) FROM t +-- !query 516 schema +struct<> +-- !query 516 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('1' AS BINARY))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 517 +SELECT pmod(cast(1 as decimal(3, 0)) , cast(1 as boolean)) FROM t +-- !query 517 schema +struct<> +-- !query 517 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(3,0)), CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(3,0)), CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 + + +-- !query 518 +SELECT pmod(cast(1 as decimal(5, 0)) , cast(1 as boolean)) FROM t +-- !query 518 schema +struct<> +-- !query 518 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(5,0)), CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(5,0)), CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 + + +-- !query 519 +SELECT pmod(cast(1 as decimal(10, 0)), cast(1 as boolean)) FROM t +-- !query 519 schema +struct<> +-- !query 519 output 
+org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(10,0)), CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(10,0)), CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 520 +SELECT pmod(cast(1 as decimal(20, 0)), cast(1 as boolean)) FROM t +-- !query 520 schema +struct<> +-- !query 520 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(20,0)), CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(20,0)), CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 + + +-- !query 521 +SELECT pmod(cast(1 as decimal(3, 0)) , cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 521 schema +struct<> +-- !query 521 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(3,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(3,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 522 +SELECT pmod(cast(1 as decimal(5, 0)) , cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 522 schema +struct<> +-- !query 522 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 523 +SELECT pmod(cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 523 schema +struct<> +-- !query 523 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 524 +SELECT pmod(cast(1 as decimal(20, 0)), cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 524 schema +struct<> +-- !query 524 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 525 +SELECT pmod(cast(1 as decimal(3, 0)) , cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 525 schema +struct<> +-- !query 525 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(3,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(3,0)), CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 526 +SELECT pmod(cast(1 as decimal(5, 0)) , cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 526 schema +struct<> +-- !query 526 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(5,0)), CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 527 +SELECT pmod(cast(1 as decimal(10, 0)), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 527 schema +struct<> +-- !query 527 output 
+org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 528 +SELECT pmod(cast(1 as decimal(20, 0)), cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 528 schema +struct<> +-- !query 528 output +org.apache.spark.sql.AnalysisException +cannot resolve 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in 'pmod(CAST(1 AS DECIMAL(20,0)), CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 529 +SELECT cast(1 as tinyint) = cast(1 as decimal(3, 0)) FROM t +-- !query 529 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) = CAST(1 AS DECIMAL(3,0))):boolean> +-- !query 529 output +true + + +-- !query 530 +SELECT cast(1 as tinyint) = cast(1 as decimal(5, 0)) FROM t +-- !query 530 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 530 output +true + + +-- !query 531 +SELECT cast(1 as tinyint) = cast(1 as decimal(10, 0)) FROM t +-- !query 531 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 531 output +true + + +-- !query 532 +SELECT cast(1 as tinyint) = cast(1 as decimal(20, 0)) FROM t +-- !query 532 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 532 output +true + + +-- !query 533 +SELECT cast(1 as smallint) = cast(1 as decimal(3, 0)) FROM t +-- !query 533 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 533 output +true + + +-- !query 534 +SELECT cast(1 as smallint) = cast(1 as decimal(5, 0)) FROM t +-- !query 534 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) = CAST(1 AS DECIMAL(5,0))):boolean> +-- !query 534 output +true + + +-- !query 535 +SELECT cast(1 as smallint) = cast(1 as decimal(10, 0)) FROM t +-- !query 535 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 535 output +true + + +-- !query 536 +SELECT cast(1 as smallint) = cast(1 as decimal(20, 0)) FROM t +-- !query 536 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 536 output +true + + +-- !query 537 +SELECT cast(1 as int) = cast(1 as decimal(3, 0)) FROM t +-- !query 537 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 537 output +true + + +-- !query 538 +SELECT cast(1 as int) = cast(1 as decimal(5, 0)) FROM t +-- !query 538 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 538 output +true + + +-- !query 539 +SELECT cast(1 as int) = cast(1 as decimal(10, 0)) FROM t +-- !query 539 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 539 output +true + + +-- !query 540 +SELECT cast(1 as int) = cast(1 as decimal(20, 0)) FROM t +-- !query 540 
schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 540 output +true + + +-- !query 541 +SELECT cast(1 as bigint) = cast(1 as decimal(3, 0)) FROM t +-- !query 541 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 541 output +true + + +-- !query 542 +SELECT cast(1 as bigint) = cast(1 as decimal(5, 0)) FROM t +-- !query 542 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 542 output +true + + +-- !query 543 +SELECT cast(1 as bigint) = cast(1 as decimal(10, 0)) FROM t +-- !query 543 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 543 output +true + + +-- !query 544 +SELECT cast(1 as bigint) = cast(1 as decimal(20, 0)) FROM t +-- !query 544 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) = CAST(1 AS DECIMAL(20,0))):boolean> +-- !query 544 output +true + + +-- !query 545 +SELECT cast(1 as float) = cast(1 as decimal(3, 0)) FROM t +-- !query 545 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 545 output +true + + +-- !query 546 +SELECT cast(1 as float) = cast(1 as decimal(5, 0)) FROM t +-- !query 546 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 546 output +true + + +-- !query 547 +SELECT cast(1 as float) = cast(1 as decimal(10, 0)) FROM t +-- !query 547 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 547 output +true + + +-- !query 548 +SELECT cast(1 as float) = cast(1 as decimal(20, 0)) FROM t +-- !query 548 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 548 output +true + + +-- !query 549 +SELECT cast(1 as double) = cast(1 as decimal(3, 0)) FROM t +-- !query 549 schema +struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 549 output +true + + +-- !query 550 +SELECT cast(1 as double) = cast(1 as decimal(5, 0)) FROM t +-- !query 550 schema +struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 550 output +true + + +-- !query 551 +SELECT cast(1 as double) = cast(1 as decimal(10, 0)) FROM t +-- !query 551 schema +struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 551 output +true + + +-- !query 552 +SELECT cast(1 as double) = cast(1 as decimal(20, 0)) FROM t +-- !query 552 schema +struct<(CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 552 output +true + + +-- !query 553 +SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(3, 0)) FROM t +-- !query 553 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 553 output +true + + +-- !query 554 +SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(5, 0)) FROM t +-- !query 554 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 554 output +true + + +-- !query 555 +SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(10, 0)) FROM t +-- !query 555 schema +struct<(CAST(1 AS DECIMAL(10,0)) = CAST(1 
AS DECIMAL(10,0))):boolean> +-- !query 555 output +true + + +-- !query 556 +SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(20, 0)) FROM t +-- !query 556 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 556 output +true + + +-- !query 557 +SELECT cast('1' as binary) = cast(1 as decimal(3, 0)) FROM t +-- !query 557 schema +struct<> +-- !query 557 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 558 +SELECT cast('1' as binary) = cast(1 as decimal(5, 0)) FROM t +-- !query 558 schema +struct<> +-- !query 558 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 559 +SELECT cast('1' as binary) = cast(1 as decimal(10, 0)) FROM t +-- !query 559 schema +struct<> +-- !query 559 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 560 +SELECT cast('1' as binary) = cast(1 as decimal(20, 0)) FROM t +-- !query 560 schema +struct<> +-- !query 560 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 561 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(3, 0)) FROM t +-- !query 561 schema +struct<> +-- !query 561 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 562 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(5, 0)) FROM t +-- !query 562 schema +struct<> +-- !query 562 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 563 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(10, 0)) FROM t +-- !query 563 schema +struct<> +-- !query 563 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 564 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) = cast(1 as decimal(20, 0)) FROM t +-- !query 564 schema +struct<> +-- !query 564 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS 
TIMESTAMP) = CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 565 +SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(3, 0)) FROM t +-- !query 565 schema +struct<> +-- !query 565 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 566 +SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(5, 0)) FROM t +-- !query 566 schema +struct<> +-- !query 566 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 567 +SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(10, 0)) FROM t +-- !query 567 schema +struct<> +-- !query 567 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 568 +SELECT cast('2017-12-11 09:30:00' as date) = cast(1 as decimal(20, 0)) FROM t +-- !query 568 schema +struct<> +-- !query 568 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 569 +SELECT cast(1 as decimal(3, 0)) = cast(1 as tinyint) FROM t +-- !query 569 schema +struct<(CAST(1 AS DECIMAL(3,0)) = CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> +-- !query 569 output +true + + +-- !query 570 +SELECT cast(1 as decimal(5, 0)) = cast(1 as tinyint) FROM t +-- !query 570 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 570 output +true + + +-- !query 571 +SELECT cast(1 as decimal(10, 0)) = cast(1 as tinyint) FROM t +-- !query 571 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 571 output +true + + +-- !query 572 +SELECT cast(1 as decimal(20, 0)) = cast(1 as tinyint) FROM t +-- !query 572 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 572 output +true + + +-- !query 573 +SELECT cast(1 as decimal(3, 0)) = cast(1 as smallint) FROM t +-- !query 573 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 573 output +true + + +-- !query 574 +SELECT cast(1 as decimal(5, 0)) = cast(1 as smallint) FROM t +-- !query 574 schema +struct<(CAST(1 AS DECIMAL(5,0)) = CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> +-- !query 574 output +true + + +-- !query 575 +SELECT cast(1 as decimal(10, 0)) = cast(1 as smallint) FROM t +-- !query 575 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 575 output +true + + +-- !query 
576 +SELECT cast(1 as decimal(20, 0)) = cast(1 as smallint) FROM t +-- !query 576 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 576 output +true + + +-- !query 577 +SELECT cast(1 as decimal(3, 0)) = cast(1 as int) FROM t +-- !query 577 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 577 output +true + + +-- !query 578 +SELECT cast(1 as decimal(5, 0)) = cast(1 as int) FROM t +-- !query 578 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 578 output +true + + +-- !query 579 +SELECT cast(1 as decimal(10, 0)) = cast(1 as int) FROM t +-- !query 579 schema +struct<(CAST(1 AS DECIMAL(10,0)) = CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> +-- !query 579 output +true + + +-- !query 580 +SELECT cast(1 as decimal(20, 0)) = cast(1 as int) FROM t +-- !query 580 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 580 output +true + + +-- !query 581 +SELECT cast(1 as decimal(3, 0)) = cast(1 as bigint) FROM t +-- !query 581 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 581 output +true + + +-- !query 582 +SELECT cast(1 as decimal(5, 0)) = cast(1 as bigint) FROM t +-- !query 582 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 582 output +true + + +-- !query 583 +SELECT cast(1 as decimal(10, 0)) = cast(1 as bigint) FROM t +-- !query 583 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 583 output +true + + +-- !query 584 +SELECT cast(1 as decimal(20, 0)) = cast(1 as bigint) FROM t +-- !query 584 schema +struct<(CAST(1 AS DECIMAL(20,0)) = CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> +-- !query 584 output +true + + +-- !query 585 +SELECT cast(1 as decimal(3, 0)) = cast(1 as float) FROM t +-- !query 585 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 585 output +true + + +-- !query 586 +SELECT cast(1 as decimal(5, 0)) = cast(1 as float) FROM t +-- !query 586 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 586 output +true + + +-- !query 587 +SELECT cast(1 as decimal(10, 0)) = cast(1 as float) FROM t +-- !query 587 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 587 output +true + + +-- !query 588 +SELECT cast(1 as decimal(20, 0)) = cast(1 as float) FROM t +-- !query 588 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 588 output +true + + +-- !query 589 +SELECT cast(1 as decimal(3, 0)) = cast(1 as double) FROM t +-- !query 589 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 589 output +true + + +-- !query 590 +SELECT cast(1 as decimal(5, 0)) = cast(1 as double) FROM t +-- !query 590 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 590 output +true + + +-- !query 
591 +SELECT cast(1 as decimal(10, 0)) = cast(1 as double) FROM t +-- !query 591 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 591 output +true + + +-- !query 592 +SELECT cast(1 as decimal(20, 0)) = cast(1 as double) FROM t +-- !query 592 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(1 AS DOUBLE)):boolean> +-- !query 592 output +true + + +-- !query 593 +SELECT cast(1 as decimal(3, 0)) = cast(1 as decimal(10, 0)) FROM t +-- !query 593 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 593 output +true + + +-- !query 594 +SELECT cast(1 as decimal(5, 0)) = cast(1 as decimal(10, 0)) FROM t +-- !query 594 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 594 output +true + + +-- !query 595 +SELECT cast(1 as decimal(10, 0)) = cast(1 as decimal(10, 0)) FROM t +-- !query 595 schema +struct<(CAST(1 AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 595 output +true + + +-- !query 596 +SELECT cast(1 as decimal(20, 0)) = cast(1 as decimal(10, 0)) FROM t +-- !query 596 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 596 output +true + + +-- !query 597 +SELECT cast(1 as decimal(3, 0)) = cast(1 as string) FROM t +-- !query 597 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 597 output +true + + +-- !query 598 +SELECT cast(1 as decimal(5, 0)) = cast(1 as string) FROM t +-- !query 598 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 598 output +true + + +-- !query 599 +SELECT cast(1 as decimal(10, 0)) = cast(1 as string) FROM t +-- !query 599 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 599 output +true + + +-- !query 600 +SELECT cast(1 as decimal(20, 0)) = cast(1 as string) FROM t +-- !query 600 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 600 output +true + + +-- !query 601 +SELECT cast(1 as decimal(3, 0)) = cast('1' as binary) FROM t +-- !query 601 schema +struct<> +-- !query 601 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 602 +SELECT cast(1 as decimal(5, 0)) = cast('1' as binary) FROM t +-- !query 602 schema +struct<> +-- !query 602 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 603 +SELECT cast(1 as decimal(10, 0)) = cast('1' as binary) FROM t +-- !query 603 schema +struct<> +-- !query 603 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 604 +SELECT cast(1 as decimal(20, 0)) = cast('1' as binary) FROM t +-- !query 604 schema +struct<> +-- !query 604 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 605 +SELECT cast(1 as decimal(3, 0)) = cast(1 as boolean) FROM t +-- !query 605 schema +struct<(CAST(1 AS DECIMAL(3,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(3,0))):boolean> +-- !query 605 output +true + + +-- !query 606 +SELECT cast(1 as decimal(5, 0)) = cast(1 as boolean) FROM t +-- !query 606 schema +struct<(CAST(1 AS DECIMAL(5,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(5,0))):boolean> +-- !query 606 output +true + + +-- !query 607 +SELECT cast(1 as decimal(10, 0)) = cast(1 as boolean) FROM t +-- !query 607 schema +struct<(CAST(1 AS DECIMAL(10,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(10,0))):boolean> +-- !query 607 output +true + + +-- !query 608 +SELECT cast(1 as decimal(20, 0)) = cast(1 as boolean) FROM t +-- !query 608 schema +struct<(CAST(1 AS DECIMAL(20,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(20,0))):boolean> +-- !query 608 output +true + + +-- !query 609 +SELECT cast(1 as decimal(3, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 609 schema +struct<> +-- !query 609 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 610 +SELECT cast(1 as decimal(5, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 610 schema +struct<> +-- !query 610 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 611 +SELECT cast(1 as decimal(10, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 611 schema +struct<> +-- !query 611 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 612 +SELECT cast(1 as decimal(20, 0)) = cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 612 schema +struct<> +-- !query 612 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 613 +SELECT cast(1 as decimal(3, 0)) = cast('2017-12-11 09:30:00' as date) FROM t +-- !query 613 schema +struct<> +-- !query 613 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 614 +SELECT cast(1 as decimal(5, 0)) = cast('2017-12-11 09:30:00' as date) FROM t +-- !query 614 schema +struct<> +-- !query 614 output +org.apache.spark.sql.AnalysisException +cannot resolve 
'(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 615 +SELECT cast(1 as decimal(10, 0)) = cast('2017-12-11 09:30:00' as date) FROM t +-- !query 615 schema +struct<> +-- !query 615 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 616 +SELECT cast(1 as decimal(20, 0)) = cast('2017-12-11 09:30:00' as date) FROM t +-- !query 616 schema +struct<> +-- !query 616 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 617 +SELECT cast(1 as tinyint) <=> cast(1 as decimal(3, 0)) FROM t +-- !query 617 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) <=> CAST(1 AS DECIMAL(3,0))):boolean> +-- !query 617 output +true + + +-- !query 618 +SELECT cast(1 as tinyint) <=> cast(1 as decimal(5, 0)) FROM t +-- !query 618 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 618 output +true + + +-- !query 619 +SELECT cast(1 as tinyint) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 619 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 619 output +true + + +-- !query 620 +SELECT cast(1 as tinyint) <=> cast(1 as decimal(20, 0)) FROM t +-- !query 620 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 620 output +true + + +-- !query 621 +SELECT cast(1 as smallint) <=> cast(1 as decimal(3, 0)) FROM t +-- !query 621 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 621 output +true + + +-- !query 622 +SELECT cast(1 as smallint) <=> cast(1 as decimal(5, 0)) FROM t +-- !query 622 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) <=> CAST(1 AS DECIMAL(5,0))):boolean> +-- !query 622 output +true + + +-- !query 623 +SELECT cast(1 as smallint) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 623 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 623 output +true + + +-- !query 624 +SELECT cast(1 as smallint) <=> cast(1 as decimal(20, 0)) FROM t +-- !query 624 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 624 output +true + + +-- !query 625 +SELECT cast(1 as int) <=> cast(1 as decimal(3, 0)) FROM t +-- !query 625 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 625 output +true + + +-- !query 626 +SELECT cast(1 as int) <=> cast(1 as decimal(5, 0)) FROM t +-- !query 626 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> 
CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 626 output +true + + +-- !query 627 +SELECT cast(1 as int) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 627 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) <=> CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 627 output +true + + +-- !query 628 +SELECT cast(1 as int) <=> cast(1 as decimal(20, 0)) FROM t +-- !query 628 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 628 output +true + + +-- !query 629 +SELECT cast(1 as bigint) <=> cast(1 as decimal(3, 0)) FROM t +-- !query 629 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 629 output +true + + +-- !query 630 +SELECT cast(1 as bigint) <=> cast(1 as decimal(5, 0)) FROM t +-- !query 630 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 630 output +true + + +-- !query 631 +SELECT cast(1 as bigint) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 631 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 631 output +true + + +-- !query 632 +SELECT cast(1 as bigint) <=> cast(1 as decimal(20, 0)) FROM t +-- !query 632 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) <=> CAST(1 AS DECIMAL(20,0))):boolean> +-- !query 632 output +true + + +-- !query 633 +SELECT cast(1 as float) <=> cast(1 as decimal(3, 0)) FROM t +-- !query 633 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 633 output +true + + +-- !query 634 +SELECT cast(1 as float) <=> cast(1 as decimal(5, 0)) FROM t +-- !query 634 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 634 output +true + + +-- !query 635 +SELECT cast(1 as float) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 635 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 635 output +true + + +-- !query 636 +SELECT cast(1 as float) <=> cast(1 as decimal(20, 0)) FROM t +-- !query 636 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 636 output +true + + +-- !query 637 +SELECT cast(1 as double) <=> cast(1 as decimal(3, 0)) FROM t +-- !query 637 schema +struct<(CAST(1 AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 637 output +true + + +-- !query 638 +SELECT cast(1 as double) <=> cast(1 as decimal(5, 0)) FROM t +-- !query 638 schema +struct<(CAST(1 AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 638 output +true + + +-- !query 639 +SELECT cast(1 as double) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 639 schema +struct<(CAST(1 AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 639 output +true + + +-- !query 640 +SELECT cast(1 as double) <=> cast(1 as decimal(20, 0)) FROM t +-- !query 640 schema +struct<(CAST(1 AS DOUBLE) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 640 output +true + + +-- !query 641 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(3, 0)) FROM t +-- !query 641 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(3,0)) AS 
DECIMAL(10,0))):boolean> +-- !query 641 output +true + + +-- !query 642 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(5, 0)) FROM t +-- !query 642 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 642 output +true + + +-- !query 643 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 643 schema +struct<(CAST(1 AS DECIMAL(10,0)) <=> CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 643 output +true + + +-- !query 644 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(20, 0)) FROM t +-- !query 644 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 644 output +true + + +-- !query 645 +SELECT cast('1' as binary) <=> cast(1 as decimal(3, 0)) FROM t +-- !query 645 schema +struct<> +-- !query 645 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 646 +SELECT cast('1' as binary) <=> cast(1 as decimal(5, 0)) FROM t +-- !query 646 schema +struct<> +-- !query 646 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 647 +SELECT cast('1' as binary) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 647 schema +struct<> +-- !query 647 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 648 +SELECT cast('1' as binary) <=> cast(1 as decimal(20, 0)) FROM t +-- !query 648 schema +struct<> +-- !query 648 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <=> CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 649 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(3, 0)) FROM t +-- !query 649 schema +struct<> +-- !query 649 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 650 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(5, 0)) FROM t +-- !query 650 schema +struct<> +-- !query 650 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 651 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 651 schema +struct<> +-- !query 651 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(10,0)))' due to data type 
mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 652 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <=> cast(1 as decimal(20, 0)) FROM t +-- !query 652 schema +struct<> +-- !query 652 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <=> CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 653 +SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(3, 0)) FROM t +-- !query 653 schema +struct<> +-- !query 653 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 654 +SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(5, 0)) FROM t +-- !query 654 schema +struct<> +-- !query 654 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 655 +SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 655 schema +struct<> +-- !query 655 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 656 +SELECT cast('2017-12-11 09:30:00' as date) <=> cast(1 as decimal(20, 0)) FROM t +-- !query 656 schema +struct<> +-- !query 656 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <=> CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 657 +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as tinyint) FROM t +-- !query 657 schema +struct<(CAST(1 AS DECIMAL(3,0)) <=> CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> +-- !query 657 output +true + + +-- !query 658 +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as tinyint) FROM t +-- !query 658 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) <=> CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 658 output +true + + +-- !query 659 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as tinyint) FROM t +-- !query 659 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 659 output +true + + +-- !query 660 +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as tinyint) FROM t +-- !query 660 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 660 output +true + + +-- !query 661 +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as smallint) FROM t +-- !query 661 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) <=> CAST(CAST(CAST(1 AS SMALLINT) AS 
DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 661 output +true + + +-- !query 662 +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as smallint) FROM t +-- !query 662 schema +struct<(CAST(1 AS DECIMAL(5,0)) <=> CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> +-- !query 662 output +true + + +-- !query 663 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as smallint) FROM t +-- !query 663 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <=> CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 663 output +true + + +-- !query 664 +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as smallint) FROM t +-- !query 664 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 664 output +true + + +-- !query 665 +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as int) FROM t +-- !query 665 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) <=> CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 665 output +true + + +-- !query 666 +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as int) FROM t +-- !query 666 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) <=> CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 666 output +true + + +-- !query 667 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as int) FROM t +-- !query 667 schema +struct<(CAST(1 AS DECIMAL(10,0)) <=> CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> +-- !query 667 output +true + + +-- !query 668 +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as int) FROM t +-- !query 668 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 668 output +true + + +-- !query 669 +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as bigint) FROM t +-- !query 669 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 669 output +true + + +-- !query 670 +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as bigint) FROM t +-- !query 670 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 670 output +true + + +-- !query 671 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as bigint) FROM t +-- !query 671 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) <=> CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 671 output +true + + +-- !query 672 +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as bigint) FROM t +-- !query 672 schema +struct<(CAST(1 AS DECIMAL(20,0)) <=> CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> +-- !query 672 output +true + + +-- !query 673 +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as float) FROM t +-- !query 673 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <=> CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 673 output +true + + +-- !query 674 +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as float) FROM t +-- !query 674 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <=> CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 674 output +true + + +-- !query 675 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as float) FROM t +-- !query 675 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <=> CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 675 output +true + + +-- !query 676 +SELECT 
cast(1 as decimal(20, 0)) <=> cast(1 as float) FROM t +-- !query 676 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <=> CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 676 output +true + + +-- !query 677 +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as double) FROM t +-- !query 677 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> +-- !query 677 output +true + + +-- !query 678 +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as double) FROM t +-- !query 678 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> +-- !query 678 output +true + + +-- !query 679 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as double) FROM t +-- !query 679 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> +-- !query 679 output +true + + +-- !query 680 +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as double) FROM t +-- !query 680 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <=> CAST(1 AS DOUBLE)):boolean> +-- !query 680 output +true + + +-- !query 681 +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 681 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 681 output +true + + +-- !query 682 +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 682 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 682 output +true + + +-- !query 683 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 683 schema +struct<(CAST(1 AS DECIMAL(10,0)) <=> CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 683 output +true + + +-- !query 684 +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as decimal(10, 0)) FROM t +-- !query 684 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <=> CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 684 output +true + + +-- !query 685 +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as string) FROM t +-- !query 685 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <=> CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 685 output +true + + +-- !query 686 +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as string) FROM t +-- !query 686 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <=> CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 686 output +true + + +-- !query 687 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as string) FROM t +-- !query 687 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <=> CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 687 output +true + + +-- !query 688 +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as string) FROM t +-- !query 688 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <=> CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 688 output +true + + +-- !query 689 +SELECT cast(1 as decimal(3, 0)) <=> cast('1' as binary) FROM t +-- !query 689 schema +struct<> +-- !query 689 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) <=> CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <=> CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 690 +SELECT cast(1 as decimal(5, 0)) <=> cast('1' as binary) FROM t +-- !query 690 schema +struct<> +-- !query 690 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS 
DECIMAL(5,0)) <=> CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <=> CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 691 +SELECT cast(1 as decimal(10, 0)) <=> cast('1' as binary) FROM t +-- !query 691 schema +struct<> +-- !query 691 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) <=> CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <=> CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 692 +SELECT cast(1 as decimal(20, 0)) <=> cast('1' as binary) FROM t +-- !query 692 schema +struct<> +-- !query 692 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) <=> CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <=> CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 693 +SELECT cast(1 as decimal(3, 0)) <=> cast(1 as boolean) FROM t +-- !query 693 schema +struct<(CAST(1 AS DECIMAL(3,0)) <=> CAST(CAST(1 AS BOOLEAN) AS DECIMAL(3,0))):boolean> +-- !query 693 output +true + + +-- !query 694 +SELECT cast(1 as decimal(5, 0)) <=> cast(1 as boolean) FROM t +-- !query 694 schema +struct<(CAST(1 AS DECIMAL(5,0)) <=> CAST(CAST(1 AS BOOLEAN) AS DECIMAL(5,0))):boolean> +-- !query 694 output +true + + +-- !query 695 +SELECT cast(1 as decimal(10, 0)) <=> cast(1 as boolean) FROM t +-- !query 695 schema +struct<(CAST(1 AS DECIMAL(10,0)) <=> CAST(CAST(1 AS BOOLEAN) AS DECIMAL(10,0))):boolean> +-- !query 695 output +true + + +-- !query 696 +SELECT cast(1 as decimal(20, 0)) <=> cast(1 as boolean) FROM t +-- !query 696 schema +struct<(CAST(1 AS DECIMAL(20,0)) <=> CAST(CAST(1 AS BOOLEAN) AS DECIMAL(20,0))):boolean> +-- !query 696 output +true + + +-- !query 697 +SELECT cast(1 as decimal(3, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 697 schema +struct<> +-- !query 697 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 698 +SELECT cast(1 as decimal(5, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 698 schema +struct<> +-- !query 698 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 699 +SELECT cast(1 as decimal(10, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 699 schema +struct<> +-- !query 699 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 700 +SELECT cast(1 as decimal(20, 0)) <=> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 700 schema +struct<> +-- !query 700 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in 
'(CAST(1 AS DECIMAL(20,0)) <=> CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 701 +SELECT cast(1 as decimal(3, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 701 schema +struct<> +-- !query 701 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 702 +SELECT cast(1 as decimal(5, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 702 schema +struct<> +-- !query 702 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 703 +SELECT cast(1 as decimal(10, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 703 schema +struct<> +-- !query 703 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 704 +SELECT cast(1 as decimal(20, 0)) <=> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 704 schema +struct<> +-- !query 704 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <=> CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 705 +SELECT cast(1 as tinyint) < cast(1 as decimal(3, 0)) FROM t +-- !query 705 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) < CAST(1 AS DECIMAL(3,0))):boolean> +-- !query 705 output +false + + +-- !query 706 +SELECT cast(1 as tinyint) < cast(1 as decimal(5, 0)) FROM t +-- !query 706 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) < CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 706 output +false + + +-- !query 707 +SELECT cast(1 as tinyint) < cast(1 as decimal(10, 0)) FROM t +-- !query 707 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 707 output +false + + +-- !query 708 +SELECT cast(1 as tinyint) < cast(1 as decimal(20, 0)) FROM t +-- !query 708 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 708 output +false + + +-- !query 709 +SELECT cast(1 as smallint) < cast(1 as decimal(3, 0)) FROM t +-- !query 709 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) < CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 709 output +false + + +-- !query 710 +SELECT cast(1 as smallint) < cast(1 as decimal(5, 0)) FROM t +-- !query 710 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) < CAST(1 AS DECIMAL(5,0))):boolean> +-- !query 710 output +false + + +-- !query 711 +SELECT cast(1 as smallint) < cast(1 as decimal(10, 0)) FROM t +-- !query 711 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS 
DECIMAL(10,0))):boolean> +-- !query 711 output +false + + +-- !query 712 +SELECT cast(1 as smallint) < cast(1 as decimal(20, 0)) FROM t +-- !query 712 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 712 output +false + + +-- !query 713 +SELECT cast(1 as int) < cast(1 as decimal(3, 0)) FROM t +-- !query 713 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 713 output +false + + +-- !query 714 +SELECT cast(1 as int) < cast(1 as decimal(5, 0)) FROM t +-- !query 714 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 714 output +false + + +-- !query 715 +SELECT cast(1 as int) < cast(1 as decimal(10, 0)) FROM t +-- !query 715 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) < CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 715 output +false + + +-- !query 716 +SELECT cast(1 as int) < cast(1 as decimal(20, 0)) FROM t +-- !query 716 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 716 output +false + + +-- !query 717 +SELECT cast(1 as bigint) < cast(1 as decimal(3, 0)) FROM t +-- !query 717 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 717 output +false + + +-- !query 718 +SELECT cast(1 as bigint) < cast(1 as decimal(5, 0)) FROM t +-- !query 718 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 718 output +false + + +-- !query 719 +SELECT cast(1 as bigint) < cast(1 as decimal(10, 0)) FROM t +-- !query 719 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 719 output +false + + +-- !query 720 +SELECT cast(1 as bigint) < cast(1 as decimal(20, 0)) FROM t +-- !query 720 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) < CAST(1 AS DECIMAL(20,0))):boolean> +-- !query 720 output +false + + +-- !query 721 +SELECT cast(1 as float) < cast(1 as decimal(3, 0)) FROM t +-- !query 721 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) < CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 721 output +false + + +-- !query 722 +SELECT cast(1 as float) < cast(1 as decimal(5, 0)) FROM t +-- !query 722 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) < CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 722 output +false + + +-- !query 723 +SELECT cast(1 as float) < cast(1 as decimal(10, 0)) FROM t +-- !query 723 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) < CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 723 output +false + + +-- !query 724 +SELECT cast(1 as float) < cast(1 as decimal(20, 0)) FROM t +-- !query 724 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) < CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 724 output +false + + +-- !query 725 +SELECT cast(1 as double) < cast(1 as decimal(3, 0)) FROM t +-- !query 725 schema +struct<(CAST(1 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 725 output +false + + +-- !query 726 +SELECT cast(1 as double) < cast(1 as decimal(5, 0)) FROM t +-- !query 726 schema +struct<(CAST(1 AS DOUBLE) < 
CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 726 output +false + + +-- !query 727 +SELECT cast(1 as double) < cast(1 as decimal(10, 0)) FROM t +-- !query 727 schema +struct<(CAST(1 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 727 output +false + + +-- !query 728 +SELECT cast(1 as double) < cast(1 as decimal(20, 0)) FROM t +-- !query 728 schema +struct<(CAST(1 AS DOUBLE) < CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 728 output +false + + +-- !query 729 +SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(3, 0)) FROM t +-- !query 729 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 729 output +false + + +-- !query 730 +SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(5, 0)) FROM t +-- !query 730 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 730 output +false + + +-- !query 731 +SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(10, 0)) FROM t +-- !query 731 schema +struct<(CAST(1 AS DECIMAL(10,0)) < CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 731 output +false + + +-- !query 732 +SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(20, 0)) FROM t +-- !query 732 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 732 output +false + + +-- !query 733 +SELECT cast('1' as binary) < cast(1 as decimal(3, 0)) FROM t +-- !query 733 schema +struct<> +-- !query 733 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 734 +SELECT cast('1' as binary) < cast(1 as decimal(5, 0)) FROM t +-- !query 734 schema +struct<> +-- !query 734 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 735 +SELECT cast('1' as binary) < cast(1 as decimal(10, 0)) FROM t +-- !query 735 schema +struct<> +-- !query 735 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 736 +SELECT cast('1' as binary) < cast(1 as decimal(20, 0)) FROM t +-- !query 736 schema +struct<> +-- !query 736 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) < CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 737 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(3, 0)) FROM t +-- !query 737 schema +struct<> +-- !query 737 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 738 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(5, 0)) FROM t +-- 
!query 738 schema +struct<> +-- !query 738 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 739 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(10, 0)) FROM t +-- !query 739 schema +struct<> +-- !query 739 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 740 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) < cast(1 as decimal(20, 0)) FROM t +-- !query 740 schema +struct<> +-- !query 740 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) < CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 741 +SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(3, 0)) FROM t +-- !query 741 schema +struct<> +-- !query 741 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 742 +SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(5, 0)) FROM t +-- !query 742 schema +struct<> +-- !query 742 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 743 +SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(10, 0)) FROM t +-- !query 743 schema +struct<> +-- !query 743 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 744 +SELECT cast('2017-12-11 09:30:00' as date) < cast(1 as decimal(20, 0)) FROM t +-- !query 744 schema +struct<> +-- !query 744 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) < CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 745 +SELECT cast(1 as decimal(3, 0)) < cast(1 as tinyint) FROM t +-- !query 745 schema +struct<(CAST(1 AS DECIMAL(3,0)) < CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> +-- !query 745 output +false + + +-- !query 746 +SELECT cast(1 as decimal(5, 0)) < cast(1 as tinyint) FROM t +-- !query 746 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) < CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 746 output +false + + +-- !query 747 +SELECT cast(1 as decimal(10, 0)) < cast(1 as tinyint) FROM t +-- !query 747 schema 
+struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 747 output +false + + +-- !query 748 +SELECT cast(1 as decimal(20, 0)) < cast(1 as tinyint) FROM t +-- !query 748 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 748 output +false + + +-- !query 749 +SELECT cast(1 as decimal(3, 0)) < cast(1 as smallint) FROM t +-- !query 749 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) < CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 749 output +false + + +-- !query 750 +SELECT cast(1 as decimal(5, 0)) < cast(1 as smallint) FROM t +-- !query 750 schema +struct<(CAST(1 AS DECIMAL(5,0)) < CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> +-- !query 750 output +false + + +-- !query 751 +SELECT cast(1 as decimal(10, 0)) < cast(1 as smallint) FROM t +-- !query 751 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) < CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 751 output +false + + +-- !query 752 +SELECT cast(1 as decimal(20, 0)) < cast(1 as smallint) FROM t +-- !query 752 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 752 output +false + + +-- !query 753 +SELECT cast(1 as decimal(3, 0)) < cast(1 as int) FROM t +-- !query 753 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) < CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 753 output +false + + +-- !query 754 +SELECT cast(1 as decimal(5, 0)) < cast(1 as int) FROM t +-- !query 754 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) < CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 754 output +false + + +-- !query 755 +SELECT cast(1 as decimal(10, 0)) < cast(1 as int) FROM t +-- !query 755 schema +struct<(CAST(1 AS DECIMAL(10,0)) < CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> +-- !query 755 output +false + + +-- !query 756 +SELECT cast(1 as decimal(20, 0)) < cast(1 as int) FROM t +-- !query 756 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 756 output +false + + +-- !query 757 +SELECT cast(1 as decimal(3, 0)) < cast(1 as bigint) FROM t +-- !query 757 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 757 output +false + + +-- !query 758 +SELECT cast(1 as decimal(5, 0)) < cast(1 as bigint) FROM t +-- !query 758 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 758 output +false + + +-- !query 759 +SELECT cast(1 as decimal(10, 0)) < cast(1 as bigint) FROM t +-- !query 759 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) < CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 759 output +false + + +-- !query 760 +SELECT cast(1 as decimal(20, 0)) < cast(1 as bigint) FROM t +-- !query 760 schema +struct<(CAST(1 AS DECIMAL(20,0)) < CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> +-- !query 760 output +false + + +-- !query 761 +SELECT cast(1 as decimal(3, 0)) < cast(1 as float) FROM t +-- !query 761 schema +struct<(CAST(CAST(1 AS 
DECIMAL(3,0)) AS DOUBLE) < CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 761 output +false + + +-- !query 762 +SELECT cast(1 as decimal(5, 0)) < cast(1 as float) FROM t +-- !query 762 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) < CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 762 output +false + + +-- !query 763 +SELECT cast(1 as decimal(10, 0)) < cast(1 as float) FROM t +-- !query 763 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 763 output +false + + +-- !query 764 +SELECT cast(1 as decimal(20, 0)) < cast(1 as float) FROM t +-- !query 764 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) < CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 764 output +false + + +-- !query 765 +SELECT cast(1 as decimal(3, 0)) < cast(1 as double) FROM t +-- !query 765 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> +-- !query 765 output +false + + +-- !query 766 +SELECT cast(1 as decimal(5, 0)) < cast(1 as double) FROM t +-- !query 766 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> +-- !query 766 output +false + + +-- !query 767 +SELECT cast(1 as decimal(10, 0)) < cast(1 as double) FROM t +-- !query 767 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> +-- !query 767 output +false + + +-- !query 768 +SELECT cast(1 as decimal(20, 0)) < cast(1 as double) FROM t +-- !query 768 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) < CAST(1 AS DOUBLE)):boolean> +-- !query 768 output +false + + +-- !query 769 +SELECT cast(1 as decimal(3, 0)) < cast(1 as decimal(10, 0)) FROM t +-- !query 769 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 769 output +false + + +-- !query 770 +SELECT cast(1 as decimal(5, 0)) < cast(1 as decimal(10, 0)) FROM t +-- !query 770 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 770 output +false + + +-- !query 771 +SELECT cast(1 as decimal(10, 0)) < cast(1 as decimal(10, 0)) FROM t +-- !query 771 schema +struct<(CAST(1 AS DECIMAL(10,0)) < CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 771 output +false + + +-- !query 772 +SELECT cast(1 as decimal(20, 0)) < cast(1 as decimal(10, 0)) FROM t +-- !query 772 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) < CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 772 output +false + + +-- !query 773 +SELECT cast(1 as decimal(3, 0)) < cast(1 as string) FROM t +-- !query 773 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) < CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 773 output +false + + +-- !query 774 +SELECT cast(1 as decimal(5, 0)) < cast(1 as string) FROM t +-- !query 774 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) < CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 774 output +false + + +-- !query 775 +SELECT cast(1 as decimal(10, 0)) < cast(1 as string) FROM t +-- !query 775 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) < CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 775 output +false + + +-- !query 776 +SELECT cast(1 as decimal(20, 0)) < cast(1 as string) FROM t +-- !query 776 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) < CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 776 output +false + + +-- !query 777 +SELECT cast(1 as decimal(3, 0)) < cast('1' 
as binary) FROM t +-- !query 777 schema +struct<> +-- !query 777 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) < CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) < CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 778 +SELECT cast(1 as decimal(5, 0)) < cast('1' as binary) FROM t +-- !query 778 schema +struct<> +-- !query 778 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) < CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) < CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 779 +SELECT cast(1 as decimal(10, 0)) < cast('1' as binary) FROM t +-- !query 779 schema +struct<> +-- !query 779 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) < CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) < CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 780 +SELECT cast(1 as decimal(20, 0)) < cast('1' as binary) FROM t +-- !query 780 schema +struct<> +-- !query 780 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) < CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) < CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 781 +SELECT cast(1 as decimal(3, 0)) < cast(1 as boolean) FROM t +-- !query 781 schema +struct<> +-- !query 781 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) < CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) < CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 + + +-- !query 782 +SELECT cast(1 as decimal(5, 0)) < cast(1 as boolean) FROM t +-- !query 782 schema +struct<> +-- !query 782 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) < CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) < CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 + + +-- !query 783 +SELECT cast(1 as decimal(10, 0)) < cast(1 as boolean) FROM t +-- !query 783 schema +struct<> +-- !query 783 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) < CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) < CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 784 +SELECT cast(1 as decimal(20, 0)) < cast(1 as boolean) FROM t +-- !query 784 schema +struct<> +-- !query 784 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) < CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) < CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 + + +-- !query 785 +SELECT cast(1 as decimal(3, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 785 schema +struct<> +-- !query 785 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 786 +SELECT cast(1 as decimal(5, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 786 schema +struct<> 
+-- !query 786 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 787 +SELECT cast(1 as decimal(10, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 787 schema +struct<> +-- !query 787 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 788 +SELECT cast(1 as decimal(20, 0)) < cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 788 schema +struct<> +-- !query 788 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) < CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 789 +SELECT cast(1 as decimal(3, 0)) < cast('2017-12-11 09:30:00' as date) FROM t +-- !query 789 schema +struct<> +-- !query 789 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) < CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) < CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 790 +SELECT cast(1 as decimal(5, 0)) < cast('2017-12-11 09:30:00' as date) FROM t +-- !query 790 schema +struct<> +-- !query 790 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) < CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) < CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 791 +SELECT cast(1 as decimal(10, 0)) < cast('2017-12-11 09:30:00' as date) FROM t +-- !query 791 schema +struct<> +-- !query 791 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) < CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) < CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 792 +SELECT cast(1 as decimal(20, 0)) < cast('2017-12-11 09:30:00' as date) FROM t +-- !query 792 schema +struct<> +-- !query 792 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) < CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) < CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 793 +SELECT cast(1 as tinyint) <= cast(1 as decimal(3, 0)) FROM t +-- !query 793 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) <= CAST(1 AS DECIMAL(3,0))):boolean> +-- !query 793 output +true + + +-- !query 794 +SELECT cast(1 as tinyint) <= cast(1 as decimal(5, 0)) FROM t +-- !query 794 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 794 output +true + + +-- !query 795 +SELECT cast(1 as tinyint) <= cast(1 as decimal(10, 0)) FROM t +-- !query 795 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS 
DECIMAL(3,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 795 output +true + + +-- !query 796 +SELECT cast(1 as tinyint) <= cast(1 as decimal(20, 0)) FROM t +-- !query 796 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 796 output +true + + +-- !query 797 +SELECT cast(1 as smallint) <= cast(1 as decimal(3, 0)) FROM t +-- !query 797 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 797 output +true + + +-- !query 798 +SELECT cast(1 as smallint) <= cast(1 as decimal(5, 0)) FROM t +-- !query 798 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) <= CAST(1 AS DECIMAL(5,0))):boolean> +-- !query 798 output +true + + +-- !query 799 +SELECT cast(1 as smallint) <= cast(1 as decimal(10, 0)) FROM t +-- !query 799 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 799 output +true + + +-- !query 800 +SELECT cast(1 as smallint) <= cast(1 as decimal(20, 0)) FROM t +-- !query 800 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 800 output +true + + +-- !query 801 +SELECT cast(1 as int) <= cast(1 as decimal(3, 0)) FROM t +-- !query 801 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 801 output +true + + +-- !query 802 +SELECT cast(1 as int) <= cast(1 as decimal(5, 0)) FROM t +-- !query 802 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 802 output +true + + +-- !query 803 +SELECT cast(1 as int) <= cast(1 as decimal(10, 0)) FROM t +-- !query 803 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) <= CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 803 output +true + + +-- !query 804 +SELECT cast(1 as int) <= cast(1 as decimal(20, 0)) FROM t +-- !query 804 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 804 output +true + + +-- !query 805 +SELECT cast(1 as bigint) <= cast(1 as decimal(3, 0)) FROM t +-- !query 805 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 805 output +true + + +-- !query 806 +SELECT cast(1 as bigint) <= cast(1 as decimal(5, 0)) FROM t +-- !query 806 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 806 output +true + + +-- !query 807 +SELECT cast(1 as bigint) <= cast(1 as decimal(10, 0)) FROM t +-- !query 807 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 807 output +true + + +-- !query 808 +SELECT cast(1 as bigint) <= cast(1 as decimal(20, 0)) FROM t +-- !query 808 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) <= CAST(1 AS DECIMAL(20,0))):boolean> +-- !query 808 output +true + + +-- !query 809 +SELECT cast(1 as float) <= cast(1 as decimal(3, 0)) FROM t +-- !query 809 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <= 
CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 809 output +true + + +-- !query 810 +SELECT cast(1 as float) <= cast(1 as decimal(5, 0)) FROM t +-- !query 810 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 810 output +true + + +-- !query 811 +SELECT cast(1 as float) <= cast(1 as decimal(10, 0)) FROM t +-- !query 811 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 811 output +true + + +-- !query 812 +SELECT cast(1 as float) <= cast(1 as decimal(20, 0)) FROM t +-- !query 812 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 812 output +true + + +-- !query 813 +SELECT cast(1 as double) <= cast(1 as decimal(3, 0)) FROM t +-- !query 813 schema +struct<(CAST(1 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 813 output +true + + +-- !query 814 +SELECT cast(1 as double) <= cast(1 as decimal(5, 0)) FROM t +-- !query 814 schema +struct<(CAST(1 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 814 output +true + + +-- !query 815 +SELECT cast(1 as double) <= cast(1 as decimal(10, 0)) FROM t +-- !query 815 schema +struct<(CAST(1 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 815 output +true + + +-- !query 816 +SELECT cast(1 as double) <= cast(1 as decimal(20, 0)) FROM t +-- !query 816 schema +struct<(CAST(1 AS DOUBLE) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 816 output +true + + +-- !query 817 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(3, 0)) FROM t +-- !query 817 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 817 output +true + + +-- !query 818 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(5, 0)) FROM t +-- !query 818 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 818 output +true + + +-- !query 819 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(10, 0)) FROM t +-- !query 819 schema +struct<(CAST(1 AS DECIMAL(10,0)) <= CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 819 output +true + + +-- !query 820 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(20, 0)) FROM t +-- !query 820 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 820 output +true + + +-- !query 821 +SELECT cast('1' as binary) <= cast(1 as decimal(3, 0)) FROM t +-- !query 821 schema +struct<> +-- !query 821 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 822 +SELECT cast('1' as binary) <= cast(1 as decimal(5, 0)) FROM t +-- !query 822 schema +struct<> +-- !query 822 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 823 +SELECT cast('1' as binary) <= cast(1 as decimal(10, 0)) FROM t +-- !query 823 schema +struct<> +-- !query 823 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) 
<= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 824 +SELECT cast('1' as binary) <= cast(1 as decimal(20, 0)) FROM t +-- !query 824 schema +struct<> +-- !query 824 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) <= CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 825 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(3, 0)) FROM t +-- !query 825 schema +struct<> +-- !query 825 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 826 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(5, 0)) FROM t +-- !query 826 schema +struct<> +-- !query 826 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 827 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(10, 0)) FROM t +-- !query 827 schema +struct<> +-- !query 827 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 828 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <= cast(1 as decimal(20, 0)) FROM t +-- !query 828 schema +struct<> +-- !query 828 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) <= CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 829 +SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(3, 0)) FROM t +-- !query 829 schema +struct<> +-- !query 829 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 830 +SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(5, 0)) FROM t +-- !query 830 schema +struct<> +-- !query 830 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 831 +SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(10, 0)) FROM t +-- !query 831 schema +struct<> +-- !query 831 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in 
'(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 832 +SELECT cast('2017-12-11 09:30:00' as date) <= cast(1 as decimal(20, 0)) FROM t +-- !query 832 schema +struct<> +-- !query 832 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) <= CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 833 +SELECT cast(1 as decimal(3, 0)) <= cast(1 as tinyint) FROM t +-- !query 833 schema +struct<(CAST(1 AS DECIMAL(3,0)) <= CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> +-- !query 833 output +true + + +-- !query 834 +SELECT cast(1 as decimal(5, 0)) <= cast(1 as tinyint) FROM t +-- !query 834 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) <= CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 834 output +true + + +-- !query 835 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as tinyint) FROM t +-- !query 835 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 835 output +true + + +-- !query 836 +SELECT cast(1 as decimal(20, 0)) <= cast(1 as tinyint) FROM t +-- !query 836 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 836 output +true + + +-- !query 837 +SELECT cast(1 as decimal(3, 0)) <= cast(1 as smallint) FROM t +-- !query 837 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) <= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 837 output +true + + +-- !query 838 +SELECT cast(1 as decimal(5, 0)) <= cast(1 as smallint) FROM t +-- !query 838 schema +struct<(CAST(1 AS DECIMAL(5,0)) <= CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> +-- !query 838 output +true + + +-- !query 839 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as smallint) FROM t +-- !query 839 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) <= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 839 output +true + + +-- !query 840 +SELECT cast(1 as decimal(20, 0)) <= cast(1 as smallint) FROM t +-- !query 840 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 840 output +true + + +-- !query 841 +SELECT cast(1 as decimal(3, 0)) <= cast(1 as int) FROM t +-- !query 841 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) <= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 841 output +true + + +-- !query 842 +SELECT cast(1 as decimal(5, 0)) <= cast(1 as int) FROM t +-- !query 842 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) <= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 842 output +true + + +-- !query 843 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as int) FROM t +-- !query 843 schema +struct<(CAST(1 AS DECIMAL(10,0)) <= CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> +-- !query 843 output +true + + +-- !query 844 +SELECT cast(1 as decimal(20, 0)) <= cast(1 as int) FROM t +-- !query 844 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 844 output 
+true + + +-- !query 845 +SELECT cast(1 as decimal(3, 0)) <= cast(1 as bigint) FROM t +-- !query 845 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 845 output +true + + +-- !query 846 +SELECT cast(1 as decimal(5, 0)) <= cast(1 as bigint) FROM t +-- !query 846 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 846 output +true + + +-- !query 847 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as bigint) FROM t +-- !query 847 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) <= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 847 output +true + + +-- !query 848 +SELECT cast(1 as decimal(20, 0)) <= cast(1 as bigint) FROM t +-- !query 848 schema +struct<(CAST(1 AS DECIMAL(20,0)) <= CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> +-- !query 848 output +true + + +-- !query 849 +SELECT cast(1 as decimal(3, 0)) <= cast(1 as float) FROM t +-- !query 849 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 849 output +true + + +-- !query 850 +SELECT cast(1 as decimal(5, 0)) <= cast(1 as float) FROM t +-- !query 850 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 850 output +true + + +-- !query 851 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as float) FROM t +-- !query 851 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 851 output +true + + +-- !query 852 +SELECT cast(1 as decimal(20, 0)) <= cast(1 as float) FROM t +-- !query 852 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 852 output +true + + +-- !query 853 +SELECT cast(1 as decimal(3, 0)) <= cast(1 as double) FROM t +-- !query 853 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> +-- !query 853 output +true + + +-- !query 854 +SELECT cast(1 as decimal(5, 0)) <= cast(1 as double) FROM t +-- !query 854 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> +-- !query 854 output +true + + +-- !query 855 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as double) FROM t +-- !query 855 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> +-- !query 855 output +true + + +-- !query 856 +SELECT cast(1 as decimal(20, 0)) <= cast(1 as double) FROM t +-- !query 856 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <= CAST(1 AS DOUBLE)):boolean> +-- !query 856 output +true + + +-- !query 857 +SELECT cast(1 as decimal(3, 0)) <= cast(1 as decimal(10, 0)) FROM t +-- !query 857 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 857 output +true + + +-- !query 858 +SELECT cast(1 as decimal(5, 0)) <= cast(1 as decimal(10, 0)) FROM t +-- !query 858 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 858 output +true + + +-- !query 859 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as decimal(10, 0)) FROM t +-- !query 859 schema +struct<(CAST(1 AS DECIMAL(10,0)) <= CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 859 output +true + + +-- !query 860 +SELECT cast(1 as decimal(20, 0)) <= cast(1 as 
decimal(10, 0)) FROM t +-- !query 860 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) <= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 860 output +true + + +-- !query 861 +SELECT cast(1 as decimal(3, 0)) <= cast(1 as string) FROM t +-- !query 861 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) <= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 861 output +true + + +-- !query 862 +SELECT cast(1 as decimal(5, 0)) <= cast(1 as string) FROM t +-- !query 862 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) <= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 862 output +true + + +-- !query 863 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as string) FROM t +-- !query 863 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) <= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 863 output +true + + +-- !query 864 +SELECT cast(1 as decimal(20, 0)) <= cast(1 as string) FROM t +-- !query 864 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) <= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 864 output +true + + +-- !query 865 +SELECT cast(1 as decimal(3, 0)) <= cast('1' as binary) FROM t +-- !query 865 schema +struct<> +-- !query 865 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) <= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <= CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 866 +SELECT cast(1 as decimal(5, 0)) <= cast('1' as binary) FROM t +-- !query 866 schema +struct<> +-- !query 866 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) <= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <= CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 867 +SELECT cast(1 as decimal(10, 0)) <= cast('1' as binary) FROM t +-- !query 867 schema +struct<> +-- !query 867 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) <= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <= CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 868 +SELECT cast(1 as decimal(20, 0)) <= cast('1' as binary) FROM t +-- !query 868 schema +struct<> +-- !query 868 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) <= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <= CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 869 +SELECT cast(1 as decimal(3, 0)) <= cast(1 as boolean) FROM t +-- !query 869 schema +struct<> +-- !query 869 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) <= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <= CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 + + +-- !query 870 +SELECT cast(1 as decimal(5, 0)) <= cast(1 as boolean) FROM t +-- !query 870 schema +struct<> +-- !query 870 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) <= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <= CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 + + +-- !query 871 +SELECT cast(1 as decimal(10, 0)) <= cast(1 as boolean) FROM t +-- !query 871 schema +struct<> +-- !query 871 output 
+org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) <= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <= CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 872 +SELECT cast(1 as decimal(20, 0)) <= cast(1 as boolean) FROM t +-- !query 872 schema +struct<> +-- !query 872 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) <= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <= CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 + + +-- !query 873 +SELECT cast(1 as decimal(3, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 873 schema +struct<> +-- !query 873 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 874 +SELECT cast(1 as decimal(5, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 874 schema +struct<> +-- !query 874 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 875 +SELECT cast(1 as decimal(10, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 875 schema +struct<> +-- !query 875 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 876 +SELECT cast(1 as decimal(20, 0)) <= cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 876 schema +struct<> +-- !query 876 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 877 +SELECT cast(1 as decimal(3, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t +-- !query 877 schema +struct<> +-- !query 877 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 878 +SELECT cast(1 as decimal(5, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t +-- !query 878 schema +struct<> +-- !query 878 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 879 +SELECT cast(1 as decimal(10, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t +-- !query 879 schema +struct<> +-- !query 879 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) <= 
CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 880 +SELECT cast(1 as decimal(20, 0)) <= cast('2017-12-11 09:30:00' as date) FROM t +-- !query 880 schema +struct<> +-- !query 880 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) <= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 881 +SELECT cast(1 as tinyint) > cast(1 as decimal(3, 0)) FROM t +-- !query 881 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) > CAST(1 AS DECIMAL(3,0))):boolean> +-- !query 881 output +false + + +-- !query 882 +SELECT cast(1 as tinyint) > cast(1 as decimal(5, 0)) FROM t +-- !query 882 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) > CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 882 output +false + + +-- !query 883 +SELECT cast(1 as tinyint) > cast(1 as decimal(10, 0)) FROM t +-- !query 883 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 883 output +false + + +-- !query 884 +SELECT cast(1 as tinyint) > cast(1 as decimal(20, 0)) FROM t +-- !query 884 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 884 output +false + + +-- !query 885 +SELECT cast(1 as smallint) > cast(1 as decimal(3, 0)) FROM t +-- !query 885 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) > CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 885 output +false + + +-- !query 886 +SELECT cast(1 as smallint) > cast(1 as decimal(5, 0)) FROM t +-- !query 886 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) > CAST(1 AS DECIMAL(5,0))):boolean> +-- !query 886 output +false + + +-- !query 887 +SELECT cast(1 as smallint) > cast(1 as decimal(10, 0)) FROM t +-- !query 887 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 887 output +false + + +-- !query 888 +SELECT cast(1 as smallint) > cast(1 as decimal(20, 0)) FROM t +-- !query 888 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 888 output +false + + +-- !query 889 +SELECT cast(1 as int) > cast(1 as decimal(3, 0)) FROM t +-- !query 889 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 889 output +false + + +-- !query 890 +SELECT cast(1 as int) > cast(1 as decimal(5, 0)) FROM t +-- !query 890 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 890 output +false + + +-- !query 891 +SELECT cast(1 as int) > cast(1 as decimal(10, 0)) FROM t +-- !query 891 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) > CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 891 output +false + + +-- !query 892 +SELECT cast(1 as int) > cast(1 as decimal(20, 0)) FROM t +-- !query 892 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) > CAST(CAST(1 
AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 892 output +false + + +-- !query 893 +SELECT cast(1 as bigint) > cast(1 as decimal(3, 0)) FROM t +-- !query 893 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 893 output +false + + +-- !query 894 +SELECT cast(1 as bigint) > cast(1 as decimal(5, 0)) FROM t +-- !query 894 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 894 output +false + + +-- !query 895 +SELECT cast(1 as bigint) > cast(1 as decimal(10, 0)) FROM t +-- !query 895 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 895 output +false + + +-- !query 896 +SELECT cast(1 as bigint) > cast(1 as decimal(20, 0)) FROM t +-- !query 896 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) > CAST(1 AS DECIMAL(20,0))):boolean> +-- !query 896 output +false + + +-- !query 897 +SELECT cast(1 as float) > cast(1 as decimal(3, 0)) FROM t +-- !query 897 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) > CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 897 output +false + + +-- !query 898 +SELECT cast(1 as float) > cast(1 as decimal(5, 0)) FROM t +-- !query 898 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) > CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 898 output +false + + +-- !query 899 +SELECT cast(1 as float) > cast(1 as decimal(10, 0)) FROM t +-- !query 899 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) > CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 899 output +false + + +-- !query 900 +SELECT cast(1 as float) > cast(1 as decimal(20, 0)) FROM t +-- !query 900 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) > CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 900 output +false + + +-- !query 901 +SELECT cast(1 as double) > cast(1 as decimal(3, 0)) FROM t +-- !query 901 schema +struct<(CAST(1 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 901 output +false + + +-- !query 902 +SELECT cast(1 as double) > cast(1 as decimal(5, 0)) FROM t +-- !query 902 schema +struct<(CAST(1 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 902 output +false + + +-- !query 903 +SELECT cast(1 as double) > cast(1 as decimal(10, 0)) FROM t +-- !query 903 schema +struct<(CAST(1 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 903 output +false + + +-- !query 904 +SELECT cast(1 as double) > cast(1 as decimal(20, 0)) FROM t +-- !query 904 schema +struct<(CAST(1 AS DOUBLE) > CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 904 output +false + + +-- !query 905 +SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(3, 0)) FROM t +-- !query 905 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 905 output +false + + +-- !query 906 +SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(5, 0)) FROM t +-- !query 906 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 906 output +false + + +-- !query 907 +SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(10, 0)) FROM t +-- !query 907 schema +struct<(CAST(1 AS DECIMAL(10,0)) > CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 907 output +false + + +-- !query 908 
+SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(20, 0)) FROM t +-- !query 908 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 908 output +false + + +-- !query 909 +SELECT cast('1' as binary) > cast(1 as decimal(3, 0)) FROM t +-- !query 909 schema +struct<> +-- !query 909 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 910 +SELECT cast('1' as binary) > cast(1 as decimal(5, 0)) FROM t +-- !query 910 schema +struct<> +-- !query 910 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 911 +SELECT cast('1' as binary) > cast(1 as decimal(10, 0)) FROM t +-- !query 911 schema +struct<> +-- !query 911 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 912 +SELECT cast('1' as binary) > cast(1 as decimal(20, 0)) FROM t +-- !query 912 schema +struct<> +-- !query 912 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) > CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 913 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(3, 0)) FROM t +-- !query 913 schema +struct<> +-- !query 913 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 914 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(5, 0)) FROM t +-- !query 914 schema +struct<> +-- !query 914 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 915 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(10, 0)) FROM t +-- !query 915 schema +struct<> +-- !query 915 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 916 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) > cast(1 as decimal(20, 0)) FROM t +-- !query 916 schema +struct<> +-- !query 916 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) > CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; 
line 1 pos 7 + + +-- !query 917 +SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(3, 0)) FROM t +-- !query 917 schema +struct<> +-- !query 917 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 918 +SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(5, 0)) FROM t +-- !query 918 schema +struct<> +-- !query 918 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 919 +SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(10, 0)) FROM t +-- !query 919 schema +struct<> +-- !query 919 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 920 +SELECT cast('2017-12-11 09:30:00' as date) > cast(1 as decimal(20, 0)) FROM t +-- !query 920 schema +struct<> +-- !query 920 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) > CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 921 +SELECT cast(1 as decimal(3, 0)) > cast(1 as tinyint) FROM t +-- !query 921 schema +struct<(CAST(1 AS DECIMAL(3,0)) > CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> +-- !query 921 output +false + + +-- !query 922 +SELECT cast(1 as decimal(5, 0)) > cast(1 as tinyint) FROM t +-- !query 922 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) > CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 922 output +false + + +-- !query 923 +SELECT cast(1 as decimal(10, 0)) > cast(1 as tinyint) FROM t +-- !query 923 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 923 output +false + + +-- !query 924 +SELECT cast(1 as decimal(20, 0)) > cast(1 as tinyint) FROM t +-- !query 924 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 924 output +false + + +-- !query 925 +SELECT cast(1 as decimal(3, 0)) > cast(1 as smallint) FROM t +-- !query 925 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) > CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 925 output +false + + +-- !query 926 +SELECT cast(1 as decimal(5, 0)) > cast(1 as smallint) FROM t +-- !query 926 schema +struct<(CAST(1 AS DECIMAL(5,0)) > CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> +-- !query 926 output +false + + +-- !query 927 +SELECT cast(1 as decimal(10, 0)) > cast(1 as smallint) FROM t +-- !query 927 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) > CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 927 output +false + + +-- !query 928 +SELECT cast(1 as decimal(20, 0)) > cast(1 as smallint) FROM 
t +-- !query 928 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 928 output +false + + +-- !query 929 +SELECT cast(1 as decimal(3, 0)) > cast(1 as int) FROM t +-- !query 929 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) > CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 929 output +false + + +-- !query 930 +SELECT cast(1 as decimal(5, 0)) > cast(1 as int) FROM t +-- !query 930 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) > CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 930 output +false + + +-- !query 931 +SELECT cast(1 as decimal(10, 0)) > cast(1 as int) FROM t +-- !query 931 schema +struct<(CAST(1 AS DECIMAL(10,0)) > CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> +-- !query 931 output +false + + +-- !query 932 +SELECT cast(1 as decimal(20, 0)) > cast(1 as int) FROM t +-- !query 932 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 932 output +false + + +-- !query 933 +SELECT cast(1 as decimal(3, 0)) > cast(1 as bigint) FROM t +-- !query 933 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 933 output +false + + +-- !query 934 +SELECT cast(1 as decimal(5, 0)) > cast(1 as bigint) FROM t +-- !query 934 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 934 output +false + + +-- !query 935 +SELECT cast(1 as decimal(10, 0)) > cast(1 as bigint) FROM t +-- !query 935 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) > CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 935 output +false + + +-- !query 936 +SELECT cast(1 as decimal(20, 0)) > cast(1 as bigint) FROM t +-- !query 936 schema +struct<(CAST(1 AS DECIMAL(20,0)) > CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> +-- !query 936 output +false + + +-- !query 937 +SELECT cast(1 as decimal(3, 0)) > cast(1 as float) FROM t +-- !query 937 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) > CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 937 output +false + + +-- !query 938 +SELECT cast(1 as decimal(5, 0)) > cast(1 as float) FROM t +-- !query 938 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) > CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 938 output +false + + +-- !query 939 +SELECT cast(1 as decimal(10, 0)) > cast(1 as float) FROM t +-- !query 939 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 939 output +false + + +-- !query 940 +SELECT cast(1 as decimal(20, 0)) > cast(1 as float) FROM t +-- !query 940 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) > CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 940 output +false + + +-- !query 941 +SELECT cast(1 as decimal(3, 0)) > cast(1 as double) FROM t +-- !query 941 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> +-- !query 941 output +false + + +-- !query 942 +SELECT cast(1 as decimal(5, 0)) > cast(1 as double) FROM t +-- !query 942 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> +-- !query 942 output +false + + +-- !query 943 +SELECT cast(1 as decimal(10, 0)) > cast(1 as 
double) FROM t +-- !query 943 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> +-- !query 943 output +false + + +-- !query 944 +SELECT cast(1 as decimal(20, 0)) > cast(1 as double) FROM t +-- !query 944 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) > CAST(1 AS DOUBLE)):boolean> +-- !query 944 output +false + + +-- !query 945 +SELECT cast(1 as decimal(3, 0)) > cast(1 as decimal(10, 0)) FROM t +-- !query 945 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 945 output +false + + +-- !query 946 +SELECT cast(1 as decimal(5, 0)) > cast(1 as decimal(10, 0)) FROM t +-- !query 946 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 946 output +false + + +-- !query 947 +SELECT cast(1 as decimal(10, 0)) > cast(1 as decimal(10, 0)) FROM t +-- !query 947 schema +struct<(CAST(1 AS DECIMAL(10,0)) > CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 947 output +false + + +-- !query 948 +SELECT cast(1 as decimal(20, 0)) > cast(1 as decimal(10, 0)) FROM t +-- !query 948 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) > CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 948 output +false + + +-- !query 949 +SELECT cast(1 as decimal(3, 0)) > cast(1 as string) FROM t +-- !query 949 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) > CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 949 output +false + + +-- !query 950 +SELECT cast(1 as decimal(5, 0)) > cast(1 as string) FROM t +-- !query 950 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) > CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 950 output +false + + +-- !query 951 +SELECT cast(1 as decimal(10, 0)) > cast(1 as string) FROM t +-- !query 951 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) > CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 951 output +false + + +-- !query 952 +SELECT cast(1 as decimal(20, 0)) > cast(1 as string) FROM t +-- !query 952 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) > CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 952 output +false + + +-- !query 953 +SELECT cast(1 as decimal(3, 0)) > cast('1' as binary) FROM t +-- !query 953 schema +struct<> +-- !query 953 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) > CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) > CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 954 +SELECT cast(1 as decimal(5, 0)) > cast('1' as binary) FROM t +-- !query 954 schema +struct<> +-- !query 954 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) > CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) > CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 955 +SELECT cast(1 as decimal(10, 0)) > cast('1' as binary) FROM t +-- !query 955 schema +struct<> +-- !query 955 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) > CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) > CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 956 +SELECT cast(1 as decimal(20, 0)) > cast('1' as binary) FROM t +-- !query 956 schema +struct<> +-- !query 956 output +org.apache.spark.sql.AnalysisException 
+cannot resolve '(CAST(1 AS DECIMAL(20,0)) > CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) > CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 957 +SELECT cast(1 as decimal(3, 0)) > cast(1 as boolean) FROM t +-- !query 957 schema +struct<> +-- !query 957 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) > CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) > CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 + + +-- !query 958 +SELECT cast(1 as decimal(5, 0)) > cast(1 as boolean) FROM t +-- !query 958 schema +struct<> +-- !query 958 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) > CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) > CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 + + +-- !query 959 +SELECT cast(1 as decimal(10, 0)) > cast(1 as boolean) FROM t +-- !query 959 schema +struct<> +-- !query 959 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) > CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) > CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 960 +SELECT cast(1 as decimal(20, 0)) > cast(1 as boolean) FROM t +-- !query 960 schema +struct<> +-- !query 960 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) > CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) > CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 + + +-- !query 961 +SELECT cast(1 as decimal(3, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 961 schema +struct<> +-- !query 961 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 962 +SELECT cast(1 as decimal(5, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 962 schema +struct<> +-- !query 962 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 963 +SELECT cast(1 as decimal(10, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 963 schema +struct<> +-- !query 963 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 964 +SELECT cast(1 as decimal(20, 0)) > cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 964 schema +struct<> +-- !query 964 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) > CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 965 +SELECT 
cast(1 as decimal(3, 0)) > cast('2017-12-11 09:30:00' as date) FROM t +-- !query 965 schema +struct<> +-- !query 965 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) > CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) > CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 966 +SELECT cast(1 as decimal(5, 0)) > cast('2017-12-11 09:30:00' as date) FROM t +-- !query 966 schema +struct<> +-- !query 966 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) > CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) > CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 967 +SELECT cast(1 as decimal(10, 0)) > cast('2017-12-11 09:30:00' as date) FROM t +-- !query 967 schema +struct<> +-- !query 967 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) > CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) > CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 968 +SELECT cast(1 as decimal(20, 0)) > cast('2017-12-11 09:30:00' as date) FROM t +-- !query 968 schema +struct<> +-- !query 968 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) > CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) > CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 969 +SELECT cast(1 as tinyint) >= cast(1 as decimal(3, 0)) FROM t +-- !query 969 schema +struct<(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) >= CAST(1 AS DECIMAL(3,0))):boolean> +-- !query 969 output +true + + +-- !query 970 +SELECT cast(1 as tinyint) >= cast(1 as decimal(5, 0)) FROM t +-- !query 970 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 970 output +true + + +-- !query 971 +SELECT cast(1 as tinyint) >= cast(1 as decimal(10, 0)) FROM t +-- !query 971 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 971 output +true + + +-- !query 972 +SELECT cast(1 as tinyint) >= cast(1 as decimal(20, 0)) FROM t +-- !query 972 schema +struct<(CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 972 output +true + + +-- !query 973 +SELECT cast(1 as smallint) >= cast(1 as decimal(3, 0)) FROM t +-- !query 973 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 973 output +true + + +-- !query 974 +SELECT cast(1 as smallint) >= cast(1 as decimal(5, 0)) FROM t +-- !query 974 schema +struct<(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) >= CAST(1 AS DECIMAL(5,0))):boolean> +-- !query 974 output +true + + +-- !query 975 +SELECT cast(1 as smallint) >= cast(1 as decimal(10, 0)) FROM t +-- !query 975 schema +struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 975 output +true + + +-- !query 976 +SELECT cast(1 as smallint) >= cast(1 as decimal(20, 0)) FROM t +-- !query 976 schema 
+struct<(CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 976 output +true + + +-- !query 977 +SELECT cast(1 as int) >= cast(1 as decimal(3, 0)) FROM t +-- !query 977 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 977 output +true + + +-- !query 978 +SELECT cast(1 as int) >= cast(1 as decimal(5, 0)) FROM t +-- !query 978 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 978 output +true + + +-- !query 979 +SELECT cast(1 as int) >= cast(1 as decimal(10, 0)) FROM t +-- !query 979 schema +struct<(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) >= CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 979 output +true + + +-- !query 980 +SELECT cast(1 as int) >= cast(1 as decimal(20, 0)) FROM t +-- !query 980 schema +struct<(CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 980 output +true + + +-- !query 981 +SELECT cast(1 as bigint) >= cast(1 as decimal(3, 0)) FROM t +-- !query 981 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 981 output +true + + +-- !query 982 +SELECT cast(1 as bigint) >= cast(1 as decimal(5, 0)) FROM t +-- !query 982 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 982 output +true + + +-- !query 983 +SELECT cast(1 as bigint) >= cast(1 as decimal(10, 0)) FROM t +-- !query 983 schema +struct<(CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 983 output +true + + +-- !query 984 +SELECT cast(1 as bigint) >= cast(1 as decimal(20, 0)) FROM t +-- !query 984 schema +struct<(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) >= CAST(1 AS DECIMAL(20,0))):boolean> +-- !query 984 output +true + + +-- !query 985 +SELECT cast(1 as float) >= cast(1 as decimal(3, 0)) FROM t +-- !query 985 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 985 output +true + + +-- !query 986 +SELECT cast(1 as float) >= cast(1 as decimal(5, 0)) FROM t +-- !query 986 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 986 output +true + + +-- !query 987 +SELECT cast(1 as float) >= cast(1 as decimal(10, 0)) FROM t +-- !query 987 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 987 output +true + + +-- !query 988 +SELECT cast(1 as float) >= cast(1 as decimal(20, 0)) FROM t +-- !query 988 schema +struct<(CAST(CAST(1 AS FLOAT) AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 988 output +true + + +-- !query 989 +SELECT cast(1 as double) >= cast(1 as decimal(3, 0)) FROM t +-- !query 989 schema +struct<(CAST(1 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE)):boolean> +-- !query 989 output +true + + +-- !query 990 +SELECT cast(1 as double) >= cast(1 as decimal(5, 0)) FROM t +-- !query 990 schema +struct<(CAST(1 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE)):boolean> +-- !query 990 output +true + + +-- !query 991 +SELECT cast(1 as double) >= cast(1 as decimal(10, 0)) 
FROM t +-- !query 991 schema +struct<(CAST(1 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE)):boolean> +-- !query 991 output +true + + +-- !query 992 +SELECT cast(1 as double) >= cast(1 as decimal(20, 0)) FROM t +-- !query 992 schema +struct<(CAST(1 AS DOUBLE) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE)):boolean> +-- !query 992 output +true + + +-- !query 993 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(3, 0)) FROM t +-- !query 993 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 993 output +true + + +-- !query 994 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(5, 0)) FROM t +-- !query 994 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 994 output +true + + +-- !query 995 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(10, 0)) FROM t +-- !query 995 schema +struct<(CAST(1 AS DECIMAL(10,0)) >= CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 995 output +true + + +-- !query 996 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(20, 0)) FROM t +-- !query 996 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 996 output +true + + +-- !query 997 +SELECT cast('1' as binary) >= cast(1 as decimal(3, 0)) FROM t +-- !query 997 schema +struct<> +-- !query 997 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 998 +SELECT cast('1' as binary) >= cast(1 as decimal(5, 0)) FROM t +-- !query 998 schema +struct<> +-- !query 998 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 999 +SELECT cast('1' as binary) >= cast(1 as decimal(10, 0)) FROM t +-- !query 999 schema +struct<> +-- !query 999 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 1000 +SELECT cast('1' as binary) >= cast(1 as decimal(20, 0)) FROM t +-- !query 1000 schema +struct<> +-- !query 1000 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) >= CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 1001 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(3, 0)) FROM t +-- !query 1001 schema +struct<> +-- !query 1001 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 1002 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(5, 0)) FROM t +-- !query 1002 schema +struct<> +-- !query 1002 output +org.apache.spark.sql.AnalysisException +cannot resolve 
'(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 1003 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(10, 0)) FROM t +-- !query 1003 schema +struct<> +-- !query 1003 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 1004 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) >= cast(1 as decimal(20, 0)) FROM t +-- !query 1004 schema +struct<> +-- !query 1004 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) >= CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 1005 +SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(3, 0)) FROM t +-- !query 1005 schema +struct<> +-- !query 1005 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 1006 +SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(5, 0)) FROM t +-- !query 1006 schema +struct<> +-- !query 1006 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 1007 +SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(10, 0)) FROM t +-- !query 1007 schema +struct<> +-- !query 1007 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 1008 +SELECT cast('2017-12-11 09:30:00' as date) >= cast(1 as decimal(20, 0)) FROM t +-- !query 1008 schema +struct<> +-- !query 1008 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) >= CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 1009 +SELECT cast(1 as decimal(3, 0)) >= cast(1 as tinyint) FROM t +-- !query 1009 schema +struct<(CAST(1 AS DECIMAL(3,0)) >= CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0))):boolean> +-- !query 1009 output +true + + +-- !query 1010 +SELECT cast(1 as decimal(5, 0)) >= cast(1 as tinyint) FROM t +-- !query 1010 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) >= CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0))):boolean> +-- !query 1010 output +true + + +-- !query 1011 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as tinyint) FROM t +-- !query 1011 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(CAST(1 
AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0))):boolean> +-- !query 1011 output +true + + +-- !query 1012 +SELECT cast(1 as decimal(20, 0)) >= cast(1 as tinyint) FROM t +-- !query 1012 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0))):boolean> +-- !query 1012 output +true + + +-- !query 1013 +SELECT cast(1 as decimal(3, 0)) >= cast(1 as smallint) FROM t +-- !query 1013 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) >= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0))):boolean> +-- !query 1013 output +true + + +-- !query 1014 +SELECT cast(1 as decimal(5, 0)) >= cast(1 as smallint) FROM t +-- !query 1014 schema +struct<(CAST(1 AS DECIMAL(5,0)) >= CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0))):boolean> +-- !query 1014 output +true + + +-- !query 1015 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as smallint) FROM t +-- !query 1015 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) >= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0))):boolean> +-- !query 1015 output +true + + +-- !query 1016 +SELECT cast(1 as decimal(20, 0)) >= cast(1 as smallint) FROM t +-- !query 1016 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0))):boolean> +-- !query 1016 output +true + + +-- !query 1017 +SELECT cast(1 as decimal(3, 0)) >= cast(1 as int) FROM t +-- !query 1017 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) >= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 1017 output +true + + +-- !query 1018 +SELECT cast(1 as decimal(5, 0)) >= cast(1 as int) FROM t +-- !query 1018 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) >= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 1018 output +true + + +-- !query 1019 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as int) FROM t +-- !query 1019 schema +struct<(CAST(1 AS DECIMAL(10,0)) >= CAST(CAST(1 AS INT) AS DECIMAL(10,0))):boolean> +-- !query 1019 output +true + + +-- !query 1020 +SELECT cast(1 as decimal(20, 0)) >= cast(1 as int) FROM t +-- !query 1020 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 1020 output +true + + +-- !query 1021 +SELECT cast(1 as decimal(3, 0)) >= cast(1 as bigint) FROM t +-- !query 1021 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 1021 output +true + + +-- !query 1022 +SELECT cast(1 as decimal(5, 0)) >= cast(1 as bigint) FROM t +-- !query 1022 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 1022 output +true + + +-- !query 1023 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as bigint) FROM t +-- !query 1023 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) >= CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0))):boolean> +-- !query 1023 output +true + + +-- !query 1024 +SELECT cast(1 as decimal(20, 0)) >= cast(1 as bigint) FROM t +-- !query 1024 schema +struct<(CAST(1 AS DECIMAL(20,0)) >= CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0))):boolean> +-- !query 1024 output +true + + +-- !query 1025 +SELECT cast(1 as decimal(3, 0)) >= cast(1 as float) FROM t +-- !query 1025 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS 
DOUBLE) >= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 1025 output +true + + +-- !query 1026 +SELECT cast(1 as decimal(5, 0)) >= cast(1 as float) FROM t +-- !query 1026 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) >= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 1026 output +true + + +-- !query 1027 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as float) FROM t +-- !query 1027 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 1027 output +true + + +-- !query 1028 +SELECT cast(1 as decimal(20, 0)) >= cast(1 as float) FROM t +-- !query 1028 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) >= CAST(CAST(1 AS FLOAT) AS DOUBLE)):boolean> +-- !query 1028 output +true + + +-- !query 1029 +SELECT cast(1 as decimal(3, 0)) >= cast(1 as double) FROM t +-- !query 1029 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> +-- !query 1029 output +true + + +-- !query 1030 +SELECT cast(1 as decimal(5, 0)) >= cast(1 as double) FROM t +-- !query 1030 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> +-- !query 1030 output +true + + +-- !query 1031 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as double) FROM t +-- !query 1031 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> +-- !query 1031 output +true + + +-- !query 1032 +SELECT cast(1 as decimal(20, 0)) >= cast(1 as double) FROM t +-- !query 1032 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) >= CAST(1 AS DOUBLE)):boolean> +-- !query 1032 output +true + + +-- !query 1033 +SELECT cast(1 as decimal(3, 0)) >= cast(1 as decimal(10, 0)) FROM t +-- !query 1033 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 1033 output +true + + +-- !query 1034 +SELECT cast(1 as decimal(5, 0)) >= cast(1 as decimal(10, 0)) FROM t +-- !query 1034 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0))):boolean> +-- !query 1034 output +true + + +-- !query 1035 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as decimal(10, 0)) FROM t +-- !query 1035 schema +struct<(CAST(1 AS DECIMAL(10,0)) >= CAST(1 AS DECIMAL(10,0))):boolean> +-- !query 1035 output +true + + +-- !query 1036 +SELECT cast(1 as decimal(20, 0)) >= cast(1 as decimal(10, 0)) FROM t +-- !query 1036 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) >= CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0))):boolean> +-- !query 1036 output +true + + +-- !query 1037 +SELECT cast(1 as decimal(3, 0)) >= cast(1 as string) FROM t +-- !query 1037 schema +struct<(CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) >= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 1037 output +true + + +-- !query 1038 +SELECT cast(1 as decimal(5, 0)) >= cast(1 as string) FROM t +-- !query 1038 schema +struct<(CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) >= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 1038 output +true + + +-- !query 1039 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as string) FROM t +-- !query 1039 schema +struct<(CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) >= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 1039 output +true + + +-- !query 1040 +SELECT cast(1 as decimal(20, 0)) >= cast(1 as string) FROM t +-- !query 1040 schema +struct<(CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) >= CAST(CAST(1 AS STRING) AS DOUBLE)):boolean> +-- !query 1040 output +true + + +-- !query 1041 
+SELECT cast(1 as decimal(3, 0)) >= cast('1' as binary) FROM t +-- !query 1041 schema +struct<> +-- !query 1041 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) >= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) >= CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 1042 +SELECT cast(1 as decimal(5, 0)) >= cast('1' as binary) FROM t +-- !query 1042 schema +struct<> +-- !query 1042 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) >= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) >= CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 1043 +SELECT cast(1 as decimal(10, 0)) >= cast('1' as binary) FROM t +-- !query 1043 schema +struct<> +-- !query 1043 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) >= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) >= CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 1044 +SELECT cast(1 as decimal(20, 0)) >= cast('1' as binary) FROM t +-- !query 1044 schema +struct<> +-- !query 1044 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) >= CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) >= CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 1045 +SELECT cast(1 as decimal(3, 0)) >= cast(1 as boolean) FROM t +-- !query 1045 schema +struct<> +-- !query 1045 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) >= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) >= CAST(1 AS BOOLEAN))' (decimal(3,0) and boolean).; line 1 pos 7 + + +-- !query 1046 +SELECT cast(1 as decimal(5, 0)) >= cast(1 as boolean) FROM t +-- !query 1046 schema +struct<> +-- !query 1046 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) >= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) >= CAST(1 AS BOOLEAN))' (decimal(5,0) and boolean).; line 1 pos 7 + + +-- !query 1047 +SELECT cast(1 as decimal(10, 0)) >= cast(1 as boolean) FROM t +-- !query 1047 schema +struct<> +-- !query 1047 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) >= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) >= CAST(1 AS BOOLEAN))' (decimal(10,0) and boolean).; line 1 pos 7 + + +-- !query 1048 +SELECT cast(1 as decimal(20, 0)) >= cast(1 as boolean) FROM t +-- !query 1048 schema +struct<> +-- !query 1048 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) >= CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) >= CAST(1 AS BOOLEAN))' (decimal(20,0) and boolean).; line 1 pos 7 + + +-- !query 1049 +SELECT cast(1 as decimal(3, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 1049 schema +struct<> +-- !query 1049 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 1050 +SELECT cast(1 as 
decimal(5, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 1050 schema +struct<> +-- !query 1050 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 1051 +SELECT cast(1 as decimal(10, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 1051 schema +struct<> +-- !query 1051 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 1052 +SELECT cast(1 as decimal(20, 0)) >= cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 1052 schema +struct<> +-- !query 1052 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) >= CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 1053 +SELECT cast(1 as decimal(3, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t +-- !query 1053 schema +struct<> +-- !query 1053 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 1054 +SELECT cast(1 as decimal(5, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t +-- !query 1054 schema +struct<> +-- !query 1054 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 1055 +SELECT cast(1 as decimal(10, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t +-- !query 1055 schema +struct<> +-- !query 1055 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 1056 +SELECT cast(1 as decimal(20, 0)) >= cast('2017-12-11 09:30:00' as date) FROM t +-- !query 1056 schema +struct<> +-- !query 1056 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) >= CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 + + +-- !query 1057 +SELECT cast(1 as tinyint) <> cast(1 as decimal(3, 0)) FROM t +-- !query 1057 schema +struct<(NOT (CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) = CAST(1 AS DECIMAL(3,0)))):boolean> +-- !query 1057 output +false + + +-- !query 1058 +SELECT cast(1 as tinyint) <> cast(1 as decimal(5, 0)) FROM t +-- !query 1058 schema +struct<(NOT (CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)))):boolean> +-- !query 1058 
output +false + + +-- !query 1059 +SELECT cast(1 as tinyint) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1059 schema +struct<(NOT (CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1059 output +false + + +-- !query 1060 +SELECT cast(1 as tinyint) <> cast(1 as decimal(20, 0)) FROM t +-- !query 1060 schema +struct<(NOT (CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1060 output +false + + +-- !query 1061 +SELECT cast(1 as smallint) <> cast(1 as decimal(3, 0)) FROM t +-- !query 1061 schema +struct<(NOT (CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)))):boolean> +-- !query 1061 output +false + + +-- !query 1062 +SELECT cast(1 as smallint) <> cast(1 as decimal(5, 0)) FROM t +-- !query 1062 schema +struct<(NOT (CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) = CAST(1 AS DECIMAL(5,0)))):boolean> +-- !query 1062 output +false + + +-- !query 1063 +SELECT cast(1 as smallint) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1063 schema +struct<(NOT (CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1063 output +false + + +-- !query 1064 +SELECT cast(1 as smallint) <> cast(1 as decimal(20, 0)) FROM t +-- !query 1064 schema +struct<(NOT (CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1064 output +false + + +-- !query 1065 +SELECT cast(1 as int) <> cast(1 as decimal(3, 0)) FROM t +-- !query 1065 schema +struct<(NOT (CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1065 output +false + + +-- !query 1066 +SELECT cast(1 as int) <> cast(1 as decimal(5, 0)) FROM t +-- !query 1066 schema +struct<(NOT (CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1066 output +false + + +-- !query 1067 +SELECT cast(1 as int) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1067 schema +struct<(NOT (CAST(CAST(1 AS INT) AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0)))):boolean> +-- !query 1067 output +false + + +-- !query 1068 +SELECT cast(1 as int) <> cast(1 as decimal(20, 0)) FROM t +-- !query 1068 schema +struct<(NOT (CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1068 output +false + + +-- !query 1069 +SELECT cast(1 as bigint) <> cast(1 as decimal(3, 0)) FROM t +-- !query 1069 schema +struct<(NOT (CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1069 output +false + + +-- !query 1070 +SELECT cast(1 as bigint) <> cast(1 as decimal(5, 0)) FROM t +-- !query 1070 schema +struct<(NOT (CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1070 output +false + + +-- !query 1071 +SELECT cast(1 as bigint) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1071 schema +struct<(NOT (CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1071 output +false + + +-- !query 1072 +SELECT cast(1 as bigint) <> cast(1 as decimal(20, 0)) FROM t +-- !query 
1072 schema +struct<(NOT (CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) = CAST(1 AS DECIMAL(20,0)))):boolean> +-- !query 1072 output +false + + +-- !query 1073 +SELECT cast(1 as float) <> cast(1 as decimal(3, 0)) FROM t +-- !query 1073 schema +struct<(NOT (CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE))):boolean> +-- !query 1073 output +false + + +-- !query 1074 +SELECT cast(1 as float) <> cast(1 as decimal(5, 0)) FROM t +-- !query 1074 schema +struct<(NOT (CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE))):boolean> +-- !query 1074 output +false + + +-- !query 1075 +SELECT cast(1 as float) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1075 schema +struct<(NOT (CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> +-- !query 1075 output +false + + +-- !query 1076 +SELECT cast(1 as float) <> cast(1 as decimal(20, 0)) FROM t +-- !query 1076 schema +struct<(NOT (CAST(CAST(1 AS FLOAT) AS DOUBLE) = CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE))):boolean> +-- !query 1076 output +false + + +-- !query 1077 +SELECT cast(1 as double) <> cast(1 as decimal(3, 0)) FROM t +-- !query 1077 schema +struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE))):boolean> +-- !query 1077 output +false + + +-- !query 1078 +SELECT cast(1 as double) <> cast(1 as decimal(5, 0)) FROM t +-- !query 1078 schema +struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE))):boolean> +-- !query 1078 output +false + + +-- !query 1079 +SELECT cast(1 as double) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1079 schema +struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE))):boolean> +-- !query 1079 output +false + + +-- !query 1080 +SELECT cast(1 as double) <> cast(1 as decimal(20, 0)) FROM t +-- !query 1080 schema +struct<(NOT (CAST(1 AS DOUBLE) = CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE))):boolean> +-- !query 1080 output +false + + +-- !query 1081 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(3, 0)) FROM t +-- !query 1081 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1081 output +false + + +-- !query 1082 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(5, 0)) FROM t +-- !query 1082 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1082 output +false + + +-- !query 1083 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1083 schema +struct<(NOT (CAST(1 AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0)))):boolean> +-- !query 1083 output +false + + +-- !query 1084 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(20, 0)) FROM t +-- !query 1084 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1084 output +false + + +-- !query 1085 +SELECT cast('1' as binary) <> cast(1 as decimal(3, 0)) FROM t +-- !query 1085 schema +struct<> +-- !query 1085 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(3,0)))' (binary and decimal(3,0)).; line 1 pos 7 + + +-- !query 1086 +SELECT cast('1' as binary) <> cast(1 as decimal(5, 0)) FROM t +-- !query 1086 schema +struct<> +-- !query 1086 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' 
AS BINARY) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(5,0)))' (binary and decimal(5,0)).; line 1 pos 7 + + +-- !query 1087 +SELECT cast('1' as binary) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1087 schema +struct<> +-- !query 1087 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(10,0)))' (binary and decimal(10,0)).; line 1 pos 7 + + +-- !query 1088 +SELECT cast('1' as binary) <> cast(1 as decimal(20, 0)) FROM t +-- !query 1088 schema +struct<> +-- !query 1088 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('1' AS BINARY) = CAST(1 AS DECIMAL(20,0)))' (binary and decimal(20,0)).; line 1 pos 7 + + +-- !query 1089 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(3, 0)) FROM t +-- !query 1089 schema +struct<> +-- !query 1089 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(3,0)))' (timestamp and decimal(3,0)).; line 1 pos 7 + + +-- !query 1090 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(5, 0)) FROM t +-- !query 1090 schema +struct<> +-- !query 1090 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(5,0)))' (timestamp and decimal(5,0)).; line 1 pos 7 + + +-- !query 1091 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1091 schema +struct<> +-- !query 1091 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(10,0)))' (timestamp and decimal(10,0)).; line 1 pos 7 + + +-- !query 1092 +SELECT cast('2017-12-11 09:30:00.0' as timestamp) <> cast(1 as decimal(20, 0)) FROM t +-- !query 1092 schema +struct<> +-- !query 1092 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) = CAST(1 AS DECIMAL(20,0)))' (timestamp and decimal(20,0)).; line 1 pos 7 + + +-- !query 1093 +SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(3, 0)) FROM t +-- !query 1093 schema +struct<> +-- !query 1093 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(3,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(3,0)))' (date and decimal(3,0)).; line 1 pos 7 + + +-- !query 1094 +SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(5, 0)) FROM t +-- !query 1094 schema +struct<> +-- !query 1094 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(5,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS 
DATE) = CAST(1 AS DECIMAL(5,0)))' (date and decimal(5,0)).; line 1 pos 7 + + +-- !query 1095 +SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1095 schema +struct<> +-- !query 1095 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(10,0)))' (date and decimal(10,0)).; line 1 pos 7 + + +-- !query 1096 +SELECT cast('2017-12-11 09:30:00' as date) <> cast(1 as decimal(20, 0)) FROM t +-- !query 1096 schema +struct<> +-- !query 1096 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(20,0)))' due to data type mismatch: differing types in '(CAST('2017-12-11 09:30:00' AS DATE) = CAST(1 AS DECIMAL(20,0)))' (date and decimal(20,0)).; line 1 pos 7 + + +-- !query 1097 +SELECT cast(1 as decimal(3, 0)) <> cast(1 as tinyint) FROM t +-- !query 1097 schema +struct<(NOT (CAST(1 AS DECIMAL(3,0)) = CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)))):boolean> +-- !query 1097 output +false + + +-- !query 1098 +SELECT cast(1 as decimal(5, 0)) <> cast(1 as tinyint) FROM t +-- !query 1098 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(5,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(5,0)))):boolean> +-- !query 1098 output +false + + +-- !query 1099 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as tinyint) FROM t +-- !query 1099 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1099 output +false + + +-- !query 1100 +SELECT cast(1 as decimal(20, 0)) <> cast(1 as tinyint) FROM t +-- !query 1100 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS TINYINT) AS DECIMAL(3,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1100 output +false + + +-- !query 1101 +SELECT cast(1 as decimal(3, 0)) <> cast(1 as smallint) FROM t +-- !query 1101 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(5,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(5,0)))):boolean> +-- !query 1101 output +false + + +-- !query 1102 +SELECT cast(1 as decimal(5, 0)) <> cast(1 as smallint) FROM t +-- !query 1102 schema +struct<(NOT (CAST(1 AS DECIMAL(5,0)) = CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)))):boolean> +-- !query 1102 output +false + + +-- !query 1103 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as smallint) FROM t +-- !query 1103 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1103 output +false + + +-- !query 1104 +SELECT cast(1 as decimal(20, 0)) <> cast(1 as smallint) FROM t +-- !query 1104 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS SMALLINT) AS DECIMAL(5,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1104 output +false + + +-- !query 1105 +SELECT cast(1 as decimal(3, 0)) <> cast(1 as int) FROM t +-- !query 1105 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1105 output +false + + +-- !query 1106 +SELECT cast(1 as decimal(5, 0)) <> cast(1 as int) FROM t +-- !query 1106 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 
1106 output +false + + +-- !query 1107 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as int) FROM t +-- !query 1107 schema +struct<(NOT (CAST(1 AS DECIMAL(10,0)) = CAST(CAST(1 AS INT) AS DECIMAL(10,0)))):boolean> +-- !query 1107 output +false + + +-- !query 1108 +SELECT cast(1 as decimal(20, 0)) <> cast(1 as int) FROM t +-- !query 1108 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS INT) AS DECIMAL(10,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1108 output +false + + +-- !query 1109 +SELECT cast(1 as decimal(3, 0)) <> cast(1 as bigint) FROM t +-- !query 1109 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1109 output +false + + +-- !query 1110 +SELECT cast(1 as decimal(5, 0)) <> cast(1 as bigint) FROM t +-- !query 1110 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1110 output +false + + +-- !query 1111 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as bigint) FROM t +-- !query 1111 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)) = CAST(CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1111 output +false + + +-- !query 1112 +SELECT cast(1 as decimal(20, 0)) <> cast(1 as bigint) FROM t +-- !query 1112 schema +struct<(NOT (CAST(1 AS DECIMAL(20,0)) = CAST(CAST(1 AS BIGINT) AS DECIMAL(20,0)))):boolean> +-- !query 1112 output +false + + +-- !query 1113 +SELECT cast(1 as decimal(3, 0)) <> cast(1 as float) FROM t +-- !query 1113 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> +-- !query 1113 output +false + + +-- !query 1114 +SELECT cast(1 as decimal(5, 0)) <> cast(1 as float) FROM t +-- !query 1114 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> +-- !query 1114 output +false + + +-- !query 1115 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as float) FROM t +-- !query 1115 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> +-- !query 1115 output +false + + +-- !query 1116 +SELECT cast(1 as decimal(20, 0)) <> cast(1 as float) FROM t +-- !query 1116 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(CAST(1 AS FLOAT) AS DOUBLE))):boolean> +-- !query 1116 output +false + + +-- !query 1117 +SELECT cast(1 as decimal(3, 0)) <> cast(1 as double) FROM t +-- !query 1117 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> +-- !query 1117 output +false + + +-- !query 1118 +SELECT cast(1 as decimal(5, 0)) <> cast(1 as double) FROM t +-- !query 1118 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> +-- !query 1118 output +false + + +-- !query 1119 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as double) FROM t +-- !query 1119 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> +-- !query 1119 output +false + + +-- !query 1120 +SELECT cast(1 as decimal(20, 0)) <> cast(1 as double) FROM t +-- !query 1120 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(1 AS DOUBLE))):boolean> +-- !query 1120 output +false + + +-- !query 1121 +SELECT cast(1 as decimal(3, 0)) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1121 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DECIMAL(10,0)) = 
CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1121 output +false + + +-- !query 1122 +SELECT cast(1 as decimal(5, 0)) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1122 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DECIMAL(10,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(10,0)))):boolean> +-- !query 1122 output +false + + +-- !query 1123 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1123 schema +struct<(NOT (CAST(1 AS DECIMAL(10,0)) = CAST(1 AS DECIMAL(10,0)))):boolean> +-- !query 1123 output +false + + +-- !query 1124 +SELECT cast(1 as decimal(20, 0)) <> cast(1 as decimal(10, 0)) FROM t +-- !query 1124 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DECIMAL(20,0)) = CAST(CAST(1 AS DECIMAL(10,0)) AS DECIMAL(20,0)))):boolean> +-- !query 1124 output +false + + +-- !query 1125 +SELECT cast(1 as decimal(3, 0)) <> cast(1 as string) FROM t +-- !query 1125 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(3,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE))):boolean> +-- !query 1125 output +false + + +-- !query 1126 +SELECT cast(1 as decimal(5, 0)) <> cast(1 as string) FROM t +-- !query 1126 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(5,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE))):boolean> +-- !query 1126 output +false + + +-- !query 1127 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as string) FROM t +-- !query 1127 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(10,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE))):boolean> +-- !query 1127 output +false + + +-- !query 1128 +SELECT cast(1 as decimal(20, 0)) <> cast(1 as string) FROM t +-- !query 1128 schema +struct<(NOT (CAST(CAST(1 AS DECIMAL(20,0)) AS DOUBLE) = CAST(CAST(1 AS STRING) AS DOUBLE))):boolean> +-- !query 1128 output +false + + +-- !query 1129 +SELECT cast(1 as decimal(3, 0)) <> cast('1' as binary) FROM t +-- !query 1129 schema +struct<> +-- !query 1129 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('1' AS BINARY))' (decimal(3,0) and binary).; line 1 pos 7 + + +-- !query 1130 +SELECT cast(1 as decimal(5, 0)) <> cast('1' as binary) FROM t +-- !query 1130 schema +struct<> +-- !query 1130 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('1' AS BINARY))' (decimal(5,0) and binary).; line 1 pos 7 + + +-- !query 1131 +SELECT cast(1 as decimal(10, 0)) <> cast('1' as binary) FROM t +-- !query 1131 schema +struct<> +-- !query 1131 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('1' AS BINARY))' (decimal(10,0) and binary).; line 1 pos 7 + + +-- !query 1132 +SELECT cast(1 as decimal(20, 0)) <> cast('1' as binary) FROM t +-- !query 1132 schema +struct<> +-- !query 1132 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('1' AS BINARY))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('1' AS BINARY))' (decimal(20,0) and binary).; line 1 pos 7 + + +-- !query 1133 +SELECT cast(1 as decimal(3, 0)) <> cast(1 as boolean) FROM t +-- !query 1133 schema +struct<(NOT (CAST(1 AS DECIMAL(3,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(3,0)))):boolean> +-- !query 1133 output +false + + +-- 
!query 1134 +SELECT cast(1 as decimal(5, 0)) <> cast(1 as boolean) FROM t +-- !query 1134 schema +struct<(NOT (CAST(1 AS DECIMAL(5,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(5,0)))):boolean> +-- !query 1134 output +false + + +-- !query 1135 +SELECT cast(1 as decimal(10, 0)) <> cast(1 as boolean) FROM t +-- !query 1135 schema +struct<(NOT (CAST(1 AS DECIMAL(10,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(10,0)))):boolean> +-- !query 1135 output +false + + +-- !query 1136 +SELECT cast(1 as decimal(20, 0)) <> cast(1 as boolean) FROM t +-- !query 1136 schema +struct<(NOT (CAST(1 AS DECIMAL(20,0)) = CAST(CAST(1 AS BOOLEAN) AS DECIMAL(20,0)))):boolean> +-- !query 1136 output +false + + +-- !query 1137 +SELECT cast(1 as decimal(3, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 1137 schema +struct<> +-- !query 1137 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(3,0) and timestamp).; line 1 pos 7 + + +-- !query 1138 +SELECT cast(1 as decimal(5, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 1138 schema +struct<> +-- !query 1138 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(5,0) and timestamp).; line 1 pos 7 + + +-- !query 1139 +SELECT cast(1 as decimal(10, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 1139 schema +struct<> +-- !query 1139 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(10,0) and timestamp).; line 1 pos 7 + + +-- !query 1140 +SELECT cast(1 as decimal(20, 0)) <> cast('2017-12-11 09:30:00.0' as timestamp) FROM t +-- !query 1140 schema +struct<> +-- !query 1140 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00.0' AS TIMESTAMP))' (decimal(20,0) and timestamp).; line 1 pos 7 + + +-- !query 1141 +SELECT cast(1 as decimal(3, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 1141 schema +struct<> +-- !query 1141 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(3,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(3,0) and date).; line 1 pos 7 + + +-- !query 1142 +SELECT cast(1 as decimal(5, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 1142 schema +struct<> +-- !query 1142 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(5,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(5,0) and date).; line 1 pos 7 + + +-- !query 1143 +SELECT cast(1 as decimal(10, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 1143 schema +struct<> +-- !query 1143 output +org.apache.spark.sql.AnalysisException 
+cannot resolve '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(10,0) and date).; line 1 pos 7 + + +-- !query 1144 +SELECT cast(1 as decimal(20, 0)) <> cast('2017-12-11 09:30:00' as date) FROM t +-- !query 1144 schema +struct<> +-- !query 1144 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00' AS DATE))' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(20,0)) = CAST('2017-12-11 09:30:00' AS DATE))' (decimal(20,0) and date).; line 1 pos 7 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out new file mode 100644 index 0000000000000..5dd257ba6a0bb --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out @@ -0,0 +1,206 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 25 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as tinyint)) FROM t +-- !query 1 schema +struct +-- !query 1 output +1 + + +-- !query 2 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as smallint)) FROM t +-- !query 2 schema +struct +-- !query 2 output +1 + + +-- !query 3 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as int)) FROM t +-- !query 3 schema +struct +-- !query 3 output +1 + + +-- !query 4 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as bigint)) FROM t +-- !query 4 schema +struct +-- !query 4 output +1 + + +-- !query 5 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as float)) FROM t +-- !query 5 schema +struct +-- !query 5 output +1 + + +-- !query 6 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as double)) FROM t +-- !query 6 schema +struct +-- !query 6 output +1 + + +-- !query 7 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as decimal(10, 0))) FROM t +-- !query 7 schema +struct +-- !query 7 output +1 + + +-- !query 8 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as string)) FROM t +-- !query 8 schema +struct +-- !query 8 output +1 + + +-- !query 9 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('1' as binary)) FROM t +-- !query 9 schema +struct +-- !query 9 output +1 + + +-- !query 10 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as boolean)) FROM t +-- !query 10 schema +struct +-- !query 10 output +1 + + +-- !query 11 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as timestamp)) FROM t +-- !query 11 schema +struct +-- !query 11 output +1 + + +-- !query 12 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00' as date)) FROM t +-- !query 12 schema +struct +-- !query 12 output +1 + + +-- !query 13 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as tinyint) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 13 schema +struct +-- !query 13 output +1 + + +-- !query 14 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as smallint) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 14 schema +struct +-- !query 14 output +1 + + +-- !query 15 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as int) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 15 schema +struct +-- !query 
15 output +1 + + +-- !query 16 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as bigint) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 16 schema +struct +-- !query 16 output +1 + + +-- !query 17 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as float) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 17 schema +struct +-- !query 17 output +1 + + +-- !query 18 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as double) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 18 schema +struct +-- !query 18 output +1 + + +-- !query 19 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as decimal(10, 0)) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 19 schema +struct +-- !query 19 output +1 + + +-- !query 20 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as string) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 20 schema +struct<> +-- !query 20 output +org.apache.spark.sql.AnalysisException +cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS STRING) FOLLOWING' due to data type mismatch: The data type of the upper bound 'StringType' does not match the expected data type '(numeric or calendarinterval)'.; line 1 pos 21 + + +-- !query 21 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('1' as binary) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 21 schema +struct<> +-- !query 21 output +org.apache.spark.sql.AnalysisException +cannot resolve '(PARTITION BY 1 ORDER BY CAST('1' AS BINARY) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'BinaryType' used in the order specification does not match the data type 'IntegerType' which is used in the range frame.; line 1 pos 21 + + +-- !query 22 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as boolean) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 22 schema +struct<> +-- !query 22 output +org.apache.spark.sql.AnalysisException +cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS BOOLEAN) FOLLOWING' due to data type mismatch: The data type of the upper bound 'BooleanType' does not match the expected data type '(numeric or calendarinterval)'.; line 1 pos 21 + + +-- !query 23 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as timestamp) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 23 schema +struct<> +-- !query 23 output +org.apache.spark.sql.AnalysisException +cannot resolve '(PARTITION BY 1 ORDER BY CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'TimestampType' used in the order specification does not match the data type 'IntegerType' which is used in the range frame.; line 1 pos 21 + + +-- !query 24 +SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00' as date) DESC RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM t +-- !query 24 schema +struct +-- !query 24 output +1 diff --git a/sql/core/src/test/resources/sql-tests/results/window.sql.out b/sql/core/src/test/resources/sql-tests/results/window.sql.out index 73ad27e5bf8ce..a52e198eb9a8f 100644 --- a/sql/core/src/test/resources/sql-tests/results/window.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/window.sql.out @@ -61,7 +61,7 @@ ROWS BETWEEN CURRENT ROW AND 2147483648 FOLLOWING) FROM testData ORDER BY cate, struct<> -- !query 3 output org.apache.spark.sql.AnalysisException -cannot resolve 'ROWS BETWEEN CURRENT 
ROW AND 2147483648L FOLLOWING' due to data type mismatch: The data type of the upper bound 'LongType does not match the expected data type 'IntegerType'.; line 1 pos 41 +cannot resolve 'ROWS BETWEEN CURRENT ROW AND 2147483648L FOLLOWING' due to data type mismatch: The data type of the upper bound 'LongType' does not match the expected data type 'int'.; line 1 pos 41 -- !query 4 From fe65361b0579777c360dee1d7f633f28df0c6aeb Mon Sep 17 00:00:00 2001 From: Tejas Patil Date: Thu, 21 Dec 2017 09:22:08 -0800 Subject: [PATCH 170/356] [SPARK-22042][FOLLOW-UP][SQL] ReorderJoinPredicates can break when child's partitioning is not decided ## What changes were proposed in this pull request? This is a followup PR of https://github.com/apache/spark/pull/19257 where gatorsmile had left couple comments wrt code style. ## How was this patch tested? Doesn't change any functionality. Will depend on build to see if no checkstyle rules are violated. Author: Tejas Patil Closes #20041 from tejasapatil/followup_19257. --- .../exchange/EnsureRequirements.scala | 82 ++++++++++--------- .../spark/sql/sources/BucketedReadSuite.scala | 4 +- 2 files changed, 44 insertions(+), 42 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index 82f0b9f5cd060..c8e236be28b42 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -252,54 +252,56 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { operator.withNewChildren(children) } - /** - * When the physical operators are created for JOIN, the ordering of join keys is based on order - * in which the join keys appear in the user query. That might not match with the output - * partitioning of the join node's children (thus leading to extra sort / shuffle being - * introduced). This rule will change the ordering of the join keys to match with the - * partitioning of the join nodes' children. 
- */ - def reorderJoinPredicates(plan: SparkPlan): SparkPlan = { - def reorderJoinKeys( - leftKeys: Seq[Expression], - rightKeys: Seq[Expression], - leftPartitioning: Partitioning, - rightPartitioning: Partitioning): (Seq[Expression], Seq[Expression]) = { - - def reorder(expectedOrderOfKeys: Seq[Expression], - currentOrderOfKeys: Seq[Expression]): (Seq[Expression], Seq[Expression]) = { - val leftKeysBuffer = ArrayBuffer[Expression]() - val rightKeysBuffer = ArrayBuffer[Expression]() + private def reorder( + leftKeys: Seq[Expression], + rightKeys: Seq[Expression], + expectedOrderOfKeys: Seq[Expression], + currentOrderOfKeys: Seq[Expression]): (Seq[Expression], Seq[Expression]) = { + val leftKeysBuffer = ArrayBuffer[Expression]() + val rightKeysBuffer = ArrayBuffer[Expression]() - expectedOrderOfKeys.foreach(expression => { - val index = currentOrderOfKeys.indexWhere(e => e.semanticEquals(expression)) - leftKeysBuffer.append(leftKeys(index)) - rightKeysBuffer.append(rightKeys(index)) - }) - (leftKeysBuffer, rightKeysBuffer) - } + expectedOrderOfKeys.foreach(expression => { + val index = currentOrderOfKeys.indexWhere(e => e.semanticEquals(expression)) + leftKeysBuffer.append(leftKeys(index)) + rightKeysBuffer.append(rightKeys(index)) + }) + (leftKeysBuffer, rightKeysBuffer) + } - if (leftKeys.forall(_.deterministic) && rightKeys.forall(_.deterministic)) { - leftPartitioning match { - case HashPartitioning(leftExpressions, _) - if leftExpressions.length == leftKeys.length && - leftKeys.forall(x => leftExpressions.exists(_.semanticEquals(x))) => - reorder(leftExpressions, leftKeys) + private def reorderJoinKeys( + leftKeys: Seq[Expression], + rightKeys: Seq[Expression], + leftPartitioning: Partitioning, + rightPartitioning: Partitioning): (Seq[Expression], Seq[Expression]) = { + if (leftKeys.forall(_.deterministic) && rightKeys.forall(_.deterministic)) { + leftPartitioning match { + case HashPartitioning(leftExpressions, _) + if leftExpressions.length == leftKeys.length && + leftKeys.forall(x => leftExpressions.exists(_.semanticEquals(x))) => + reorder(leftKeys, rightKeys, leftExpressions, leftKeys) - case _ => rightPartitioning match { - case HashPartitioning(rightExpressions, _) - if rightExpressions.length == rightKeys.length && - rightKeys.forall(x => rightExpressions.exists(_.semanticEquals(x))) => - reorder(rightExpressions, rightKeys) + case _ => rightPartitioning match { + case HashPartitioning(rightExpressions, _) + if rightExpressions.length == rightKeys.length && + rightKeys.forall(x => rightExpressions.exists(_.semanticEquals(x))) => + reorder(leftKeys, rightKeys, rightExpressions, rightKeys) - case _ => (leftKeys, rightKeys) - } + case _ => (leftKeys, rightKeys) } - } else { - (leftKeys, rightKeys) } + } else { + (leftKeys, rightKeys) } + } + /** + * When the physical operators are created for JOIN, the ordering of join keys is based on order + * in which the join keys appear in the user query. That might not match with the output + * partitioning of the join node's children (thus leading to extra sort / shuffle being + * introduced). This rule will change the ordering of the join keys to match with the + * partitioning of the join nodes' children. 
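The doc comment above describes how the join keys get reordered to match the children's output partitioning. As an illustrative aside, the following is a minimal, self-contained Scala sketch of that reordering step. It is an assumption-laden simplification, not the real implementation: join keys are plain strings instead of Catalyst `Expression`s, `semanticEquals` is modelled by ordinary equality, and the object and value names are invented for the example.

```
import scala.collection.mutable.ArrayBuffer

object ReorderSketch extends App {

  // Simplified stand-in for the reorder step: keys are strings, plain equality
  // replaces semanticEquals, and no Catalyst types are involved.
  def reorder(
      leftKeys: Seq[String],
      rightKeys: Seq[String],
      expectedOrderOfKeys: Seq[String],  // e.g. the child's HashPartitioning expressions
      currentOrderOfKeys: Seq[String]    // the keys in the order the query wrote them
  ): (Seq[String], Seq[String]) = {
    val leftBuffer = ArrayBuffer[String]()
    val rightBuffer = ArrayBuffer[String]()
    expectedOrderOfKeys.foreach { e =>
      val i = currentOrderOfKeys.indexWhere(_ == e)
      leftBuffer += leftKeys(i)
      rightBuffer += rightKeys(i)
    }
    (leftBuffer.toSeq, rightBuffer.toSeq)
  }

  // The query wrote the condition as a.y = b.q AND a.x = b.p, but the left child
  // is hash-partitioned on (a.x, a.y); reordering the keys lets the planner reuse
  // that partitioning instead of adding an extra shuffle/sort.
  val (l, r) = reorder(
    leftKeys            = Seq("a.y", "a.x"),
    rightKeys           = Seq("b.q", "b.p"),
    expectedOrderOfKeys = Seq("a.x", "a.y"),
    currentOrderOfKeys  = Seq("a.y", "a.x"))

  assert(l == Seq("a.x", "a.y") && r == Seq("b.p", "b.q"))
}
```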
+ */ + private def reorderJoinPredicates(plan: SparkPlan): SparkPlan = { plan.transformUp { case BroadcastHashJoinExec(leftKeys, rightKeys, joinType, buildSide, condition, left, right) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala index 9025859e91066..fb61fa716b946 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala @@ -620,7 +620,7 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { |) ab |JOIN table2 c |ON ab.i = c.i - |""".stripMargin), + """.stripMargin), sql(""" |SELECT a.i, a.j, a.k, c.i, c.j, c.k |FROM bucketed_table a @@ -628,7 +628,7 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils { |ON a.i = b.i |JOIN table2 c |ON a.i = c.i - |""".stripMargin)) + """.stripMargin)) } } } From 7beb375bf4e8400f830a7fc7ff414634dd6efc78 Mon Sep 17 00:00:00 2001 From: Imran Rashid Date: Thu, 21 Dec 2017 15:37:55 -0800 Subject: [PATCH 171/356] [SPARK-22861][SQL] SQLAppStatusListener handles multi-job executions. When one execution has multiple jobs, we need to append to the set of stages, not replace them on every job. Added unit test and ran existing tests on jenkins Author: Imran Rashid Closes #20047 from squito/SPARK-22861. --- .../execution/ui/SQLAppStatusListener.scala | 2 +- .../ui/SQLAppStatusListenerSuite.scala | 43 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index aa78fa015dbef..2295b8dd5fe36 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -87,7 +87,7 @@ class SQLAppStatusListener( } exec.jobs = exec.jobs + (jobId -> JobExecutionStatus.RUNNING) - exec.stages = event.stageIds.toSet + exec.stages ++= event.stageIds.toSet update(exec) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala index 5ebbeb4a7cb40..7d84f45d36bee 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListenerSuite.scala @@ -383,6 +383,49 @@ class SQLAppStatusListenerSuite extends SparkFunSuite with SharedSQLContext with assertJobs(statusStore.execution(executionId), failed = Seq(0)) } + test("handle one execution with multiple jobs") { + val statusStore = createStatusStore() + val listener = statusStore.listener.get + + val executionId = 0 + val df = createTestDataFrame + listener.onOtherEvent(SparkListenerSQLExecutionStart( + executionId, + "test", + "test", + df.queryExecution.toString, + SparkPlanInfo.fromSparkPlan(df.queryExecution.executedPlan), + System.currentTimeMillis())) + + var stageId = 0 + def twoStageJob(jobId: Int): Unit = { + val stages = Seq(stageId, stageId + 1).map { id => createStageInfo(id, 0)} + stageId += 2 + listener.onJobStart(SparkListenerJobStart( + jobId = jobId, + time = System.currentTimeMillis(), + stageInfos = stages, + createProperties(executionId))) + stages.foreach { s => + 
listener.onStageSubmitted(SparkListenerStageSubmitted(s)) + listener.onStageCompleted(SparkListenerStageCompleted(s)) + } + listener.onJobEnd(SparkListenerJobEnd( + jobId = jobId, + time = System.currentTimeMillis(), + JobSucceeded + )) + } + // submit two jobs with the same executionId + twoStageJob(0) + twoStageJob(1) + listener.onOtherEvent(SparkListenerSQLExecutionEnd( + executionId, System.currentTimeMillis())) + + assertJobs(statusStore.execution(0), completed = 0 to 1) + assert(statusStore.execution(0).get.stages === (0 to 3).toSet) + } + test("SPARK-11126: no memory leak when running non SQL jobs") { val listener = spark.sharedState.statusStore.listener.get // At the beginning of this test case, there should be no live data in the listener. From 7ab165b7061d9acc26523227076056e94354d204 Mon Sep 17 00:00:00 2001 From: foxish Date: Thu, 21 Dec 2017 17:21:11 -0800 Subject: [PATCH 172/356] [SPARK-22648][K8S] Spark on Kubernetes - Documentation What changes were proposed in this pull request? This PR contains documentation on the usage of Kubernetes scheduler in Spark 2.3, and a shell script to make it easier to build docker images required to use the integration. The changes detailed here are covered by https://github.com/apache/spark/pull/19717 and https://github.com/apache/spark/pull/19468 which have merged already. How was this patch tested? The script has been in use for releases on our fork. Rest is documentation. cc rxin mateiz (shepherd) k8s-big-data SIG members & contributors: foxish ash211 mccheah liyinan926 erikerlandson ssuchter varunkatta kimoonkim tnachen ifilonenko reviewers: vanzin felixcheung jiangxb1987 mridulm TODO: - [x] Add dockerfiles directory to built distribution. (https://github.com/apache/spark/pull/20007) - [x] Change references to docker to instead say "container" (https://github.com/apache/spark/pull/19995) - [x] Update configuration table. - [x] Modify spark.kubernetes.allocation.batch.delay to take time instead of int (#20032) Author: foxish Closes #19946 from foxish/update-k8s-docs. --- docs/_layouts/global.html | 1 + docs/building-spark.md | 6 +- docs/cluster-overview.md | 7 +- docs/configuration.md | 2 + docs/img/k8s-cluster-mode.png | Bin 0 -> 55538 bytes docs/index.md | 3 +- docs/running-on-kubernetes.md | 578 +++++++++++++++++++++++++++++++ docs/running-on-yarn.md | 4 +- docs/submitting-applications.md | 16 + sbin/build-push-docker-images.sh | 68 ++++ 10 files changed, 677 insertions(+), 8 deletions(-) create mode 100644 docs/img/k8s-cluster-mode.png create mode 100644 docs/running-on-kubernetes.md create mode 100755 sbin/build-push-docker-images.sh diff --git a/docs/_layouts/global.html b/docs/_layouts/global.html index 67b05ecf7a858..e5af5ae4561c7 100755 --- a/docs/_layouts/global.html +++ b/docs/_layouts/global.html @@ -99,6 +99,7 @@
 <li><a href="spark-standalone.html">Spark Standalone</a></li>
 <li><a href="running-on-mesos.html">Mesos</a></li>
 <li><a href="running-on-yarn.html">YARN</a></li>
+<li><a href="running-on-kubernetes.html">Kubernetes</a></li>
  • diff --git a/docs/building-spark.md b/docs/building-spark.md index 98f7df155456f..c391255a91596 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -49,7 +49,7 @@ To create a Spark distribution like those distributed by the to be runnable, use `./dev/make-distribution.sh` in the project root directory. It can be configured with Maven profile settings and so on like the direct Maven build. Example: - ./dev/make-distribution.sh --name custom-spark --pip --r --tgz -Psparkr -Phadoop-2.7 -Phive -Phive-thriftserver -Pmesos -Pyarn + ./dev/make-distribution.sh --name custom-spark --pip --r --tgz -Psparkr -Phadoop-2.7 -Phive -Phive-thriftserver -Pmesos -Pyarn -Pkubernetes This will build Spark distribution along with Python pip and R packages. For more information on usage, run `./dev/make-distribution.sh --help` @@ -90,6 +90,10 @@ like ZooKeeper and Hadoop itself. ## Building with Mesos support ./build/mvn -Pmesos -DskipTests clean package + +## Building with Kubernetes support + + ./build/mvn -Pkubernetes -DskipTests clean package ## Building with Kafka 0.8 support diff --git a/docs/cluster-overview.md b/docs/cluster-overview.md index c42bb4bb8377e..658e67f99dd71 100644 --- a/docs/cluster-overview.md +++ b/docs/cluster-overview.md @@ -52,11 +52,8 @@ The system currently supports three cluster managers: * [Apache Mesos](running-on-mesos.html) -- a general cluster manager that can also run Hadoop MapReduce and service applications. * [Hadoop YARN](running-on-yarn.html) -- the resource manager in Hadoop 2. -* [Kubernetes (experimental)](https://github.com/apache-spark-on-k8s/spark) -- In addition to the above, -there is experimental support for Kubernetes. Kubernetes is an open-source platform -for providing container-centric infrastructure. Kubernetes support is being actively -developed in an [apache-spark-on-k8s](https://github.com/apache-spark-on-k8s/) Github organization. -For documentation, refer to that project's README. +* [Kubernetes](running-on-kubernetes.html) -- [Kubernetes](https://kubernetes.io/docs/concepts/overview/what-is-kubernetes/) +is an open-source platform that provides container-centric infrastructure. A third-party project (not supported by the Spark project) exists to add support for [Nomad](https://github.com/hashicorp/nomad-spark) as a cluster manager. 
diff --git a/docs/configuration.md b/docs/configuration.md index d70bac134808f..1189aea2aa71f 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -2376,6 +2376,8 @@ can be found on the pages for each mode: #### [Mesos](running-on-mesos.html#configuration) +#### [Kubernetes](running-on-kubernetes.html#configuration) + #### [Standalone Mode](spark-standalone.html#cluster-launch-scripts) # Environment Variables diff --git a/docs/img/k8s-cluster-mode.png b/docs/img/k8s-cluster-mode.png new file mode 100644 index 0000000000000000000000000000000000000000..12a6288ed58235b497d838350d17b482cff3b609 GIT binary patch literal 55538
[55538 bytes of base85-encoded binary image data for docs/img/k8s-cluster-mode.png omitted]
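Stepping back to the SPARK-22861 fix earlier in this series (SQLAppStatusListener), the change from `=` to `++=` is the whole point: stage IDs from every job of an execution must accumulate rather than overwrite each other. Below is a minimal sketch of that difference, assuming a bare `Set[Int]` in place of the listener's actual execution state; the names and stage IDs are invented for illustration.

```
object StageAccumulationSketch extends App {
  // Stage IDs reported by two jobs that belong to the same SQL execution.
  val job0Stages = Set(0, 1)
  val job1Stages = Set(2, 3)

  // Before the fix: each job-start event overwrote the previously recorded stages.
  var overwritten = Set.empty[Int]
  Seq(job0Stages, job1Stages).foreach { s => overwritten = s }
  assert(overwritten == Set(2, 3))        // job 0's stages are lost

  // After the fix: each job-start event appends to the accumulated set.
  var accumulated = Set.empty[Int]
  Seq(job0Stages, job1Stages).foreach { s => accumulated ++= s }
  assert(accumulated == Set(0, 1, 2, 3))  // all four stages are tracked
}
```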
z=ID~ zxLV>f$H(g}gx@sAJ2h0nI+rn@p&a-n+*q^VjKWm>fO{uMQVz^)1e)%J9aPm;jseO zf>~!>#m;~!r>s7|^PLvMg`Wf4vGcU6zZ*mYbabybWRQ&&^0@=98S>Pd$haKlf)Y41 zSBI)S$X-dap{kC2sIf<9EjM1@5>NQ@2#sseXu-{ym5bw{}k&d*)Q$h z+n$_*ac?Ei*W8rud}Q7F&b`w{kPvYD_t%OdIl0`kjABzsaQCm_R#N5mx48j!&uNH* zV|$@{Zgt0h5f}X+dGEJG`kSI6VU>>7aAH26BrrD8C~-f)5mc9RfYn*f{&OoxEVK1o zyIZ?fy+Y+i6(qYK?xzUbzM*9oLmU8@PU@xvzF9=JdHE<_g_*hoKB)T!59CO2kBHCI zJWefQ65HnQPm~GRh@lpRR_$q36(wD05D@3^JpgA02< z5azUD#B9kanL36Tk{_wN{q2vsmYz46(S==W{m)6tyHxA4`lq?VFC4Y)$!0QFx#C%& zq01XRWy8SmkSCvWKf`@oP-^jKKr=%fhC95f7#L)}GE_qS1k3+|V}HS}0C@jj>;Iy0 z{~LV!pYQ)`eaNVPZm&#EZ^MgpJ>Be$E$g52=~!Fa<)HzJK{FqU6P~{$^;IK3*=cLH ziD8oZiSx>sKfs`VzVx4N@}I8%YZrg<#C$q*rPR-l8~nVJx^{Qqn!1i!u8LLQzW|ZK BjY= 1.6 with access configured to it using +[kubectl](https://kubernetes.io/docs/user-guide/prereqs/). If you do not already have a working Kubernetes cluster, +you may setup a test cluster on your local machine using +[minikube](https://kubernetes.io/docs/getting-started-guides/minikube/). + * We recommend using the latest release of minikube with the DNS addon enabled. +* You must have appropriate permissions to list, create, edit and delete +[pods](https://kubernetes.io/docs/user-guide/pods/) in your cluster. You can verify that you can list these resources +by running `kubectl auth can-i pods`. + * The service account credentials used by the driver pods must be allowed to create pods, services and configmaps. +* You must have [Kubernetes DNS](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/) configured in your cluster. + +# How it works + +

+  [Figure: Spark cluster components]

    + +spark-submit can be directly used to submit a Spark application to a Kubernetes cluster. +The submission mechanism works as follows: + +* Spark creates a Spark driver running within a [Kubernetes pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/). +* The driver creates executors which are also running within Kubernetes pods and connects to them, and executes application code. +* When the application completes, the executor pods terminate and are cleaned up, but the driver pod persists +logs and remains in "completed" state in the Kubernetes API until it's eventually garbage collected or manually cleaned up. + +Note that in the completed state, the driver pod does *not* use any computational or memory resources. + +The driver and executor pod scheduling is handled by Kubernetes. It will be possible to affect Kubernetes scheduling +decisions for driver and executor pods using advanced primitives like +[node selectors](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector) +and [node/pod affinities](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity) +in a future release. + +# Submitting Applications to Kubernetes + +## Docker Images + +Kubernetes requires users to supply images that can be deployed into containers within pods. The images are built to +be run in a container runtime environment that Kubernetes supports. Docker is a container runtime environment that is +frequently used with Kubernetes. With Spark 2.3, there are Dockerfiles provided in the runnable distribution that can be customized +and built for your usage. + +You may build these docker images from sources. +There is a script, `sbin/build-push-docker-images.sh` that you can use to build and push +customized Spark distribution images consisting of all the above components. + +Example usage is: + + ./sbin/build-push-docker-images.sh -r -t my-tag build + ./sbin/build-push-docker-images.sh -r -t my-tag push + +Docker files are under the `kubernetes/dockerfiles/` directory and can be customized further before +building using the supplied script, or manually. + +## Cluster Mode + +To launch Spark Pi in cluster mode, + +{% highlight bash %} +$ bin/spark-submit \ + --master k8s://https://: \ + --deploy-mode cluster \ + --name spark-pi \ + --class org.apache.spark.examples.SparkPi \ + --conf spark.executor.instances=5 \ + --conf spark.kubernetes.driver.docker.image= \ + --conf spark.kubernetes.executor.docker.image= \ + local:///path/to/examples.jar +{% endhighlight %} + +The Spark master, specified either via passing the `--master` command line argument to `spark-submit` or by setting +`spark.master` in the application's configuration, must be a URL with the format `k8s://`. Prefixing the +master string with `k8s://` will cause the Spark application to launch on the Kubernetes cluster, with the API server +being contacted at `api_server_url`. If no HTTP protocol is specified in the URL, it defaults to `https`. For example, +setting the master to `k8s://example.com:443` is equivalent to setting it to `k8s://https://example.com:443`, but to +connect without TLS on a different port, the master would be set to `k8s://http://example.com:8080`. + +In Kubernetes mode, the Spark application name that is specified by `spark.app.name` or the `--name` argument to +`spark-submit` is used by default to name the Kubernetes resources created like drivers and executors. 
So, application names +must consist of lower case alphanumeric characters, `-`, and `.` and must start and end with an alphanumeric character. + +If you have a Kubernetes cluster setup, one way to discover the apiserver URL is by executing `kubectl cluster-info`. + +```bash +kubectl cluster-info +Kubernetes master is running at http://127.0.0.1:6443 +``` + +In the above example, the specific Kubernetes cluster can be used with spark-submit by specifying +`--master k8s://http://127.0.0.1:6443` as an argument to spark-submit. Additionally, it is also possible to use the +authenticating proxy, `kubectl proxy` to communicate to the Kubernetes API. + +The local proxy can be started by: + +```bash +kubectl proxy +``` + +If the local proxy is running at localhost:8001, `--master k8s://http://127.0.0.1:8001` can be used as the argument to +spark-submit. Finally, notice that in the above example we specify a jar with a specific URI with a scheme of `local://`. +This URI is the location of the example jar that is already in the Docker image. + +## Dependency Management + +If your application's dependencies are all hosted in remote locations like HDFS or HTTP servers, they may be referred to +by their appropriate remote URIs. Also, application dependencies can be pre-mounted into custom-built Docker images. +Those dependencies can be added to the classpath by referencing them with `local://` URIs and/or setting the +`SPARK_EXTRA_CLASSPATH` environment variable in your Dockerfiles. + +## Introspection and Debugging + +These are the different ways in which you can investigate a running/completed Spark application, monitor progress, and +take actions. + +### Accessing Logs + +Logs can be accessed using the Kubernetes API and the `kubectl` CLI. When a Spark application is running, it's possible +to stream logs from the application using: + +```bash +kubectl -n= logs -f +``` + +The same logs can also be accessed through the +[Kubernetes dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) if installed on +the cluster. + +### Accessing Driver UI + +The UI associated with any application can be accessed locally using +[`kubectl port-forward`](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/#forward-a-local-port-to-a-port-on-the-pod). + +```bash +kubectl port-forward 4040:4040 +``` + +Then, the Spark driver UI can be accessed on `http://localhost:4040`. + +### Debugging + +There may be several kinds of failures. If the Kubernetes API server rejects the request made from spark-submit, or the +connection is refused for a different reason, the submission logic should indicate the error encountered. However, if there +are errors during the running of the application, often, the best way to investigate may be through the Kubernetes CLI. + +To get some basic information about the scheduling decisions made around the driver pod, you can run: + +```bash +kubectl describe pod +``` + +If the pod has encountered a runtime error, the status can be probed further using: + +```bash +kubectl logs +``` + +Status and logs of failed executor pods can be checked in similar ways. Finally, deleting the driver pod will clean up the entire spark +application, includling all executors, associated service, etc. The driver pod can be thought of as the Kubernetes representation of +the Spark application. 
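As a concrete sketch of that cleanup step (the pod name below is a hypothetical example; substitute the driver pod name reported by `kubectl get pods` in your namespace):

```bash
# List pods to find the driver pod of the finished application, then delete it.
# Deleting the driver pod also removes the executor pods and the associated service.
kubectl get pods
kubectl delete pod spark-pi-driver
```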
+ +## Kubernetes Features + +### Namespaces + +Kubernetes has the concept of [namespaces](https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/). +Namespaces are ways to divide cluster resources between multiple users (via resource quota). Spark on Kubernetes can +use namespaces to launch Spark applications. This can be made use of through the `spark.kubernetes.namespace` configuration. + +Kubernetes allows using [ResourceQuota](https://kubernetes.io/docs/concepts/policy/resource-quotas/) to set limits on +resources, number of objects, etc on individual namespaces. Namespaces and ResourceQuota can be used in combination by +administrator to control sharing and resource allocation in a Kubernetes cluster running Spark applications. + +### RBAC + +In Kubernetes clusters with [RBAC](https://kubernetes.io/docs/admin/authorization/rbac/) enabled, users can configure +Kubernetes RBAC roles and service accounts used by the various Spark on Kubernetes components to access the Kubernetes +API server. + +The Spark driver pod uses a Kubernetes service account to access the Kubernetes API server to create and watch executor +pods. The service account used by the driver pod must have the appropriate permission for the driver to be able to do +its work. Specifically, at minimum, the service account must be granted a +[`Role` or `ClusterRole`](https://kubernetes.io/docs/admin/authorization/rbac/#role-and-clusterrole) that allows driver +pods to create pods and services. By default, the driver pod is automatically assigned the `default` service account in +the namespace specified by `spark.kubernetes.namespace`, if no service account is specified when the pod gets created. + +Depending on the version and setup of Kubernetes deployed, this `default` service account may or may not have the role +that allows driver pods to create pods and services under the default Kubernetes +[RBAC](https://kubernetes.io/docs/admin/authorization/rbac/) policies. Sometimes users may need to specify a custom +service account that has the right role granted. Spark on Kubernetes supports specifying a custom service account to +be used by the driver pod through the configuration property +`spark.kubernetes.authenticate.driver.serviceAccountName=`. For example to make the driver pod +use the `spark` service account, a user simply adds the following option to the `spark-submit` command: + +``` +--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark +``` + +To create a custom service account, a user can use the `kubectl create serviceaccount` command. For example, the +following command creates a service account named `spark`: + +```bash +kubectl create serviceaccount spark +``` + +To grant a service account a `Role` or `ClusterRole`, a `RoleBinding` or `ClusterRoleBinding` is needed. To create +a `RoleBinding` or `ClusterRoleBinding`, a user can use the `kubectl create rolebinding` (or `clusterrolebinding` +for `ClusterRoleBinding`) command. For example, the following command creates an `edit` `ClusterRole` in the `default` +namespace and grants it to the `spark` service account created above: + +```bash +kubectl create clusterrolebinding spark-role --clusterrole=edit --serviceaccount=default:spark --namespace=default +``` + +Note that a `Role` can only be used to grant access to resources (like pods) within a single namespace, whereas a +`ClusterRole` can be used to grant access to cluster-scoped resources (like nodes) as well as namespaced resources +(like pods) across all namespaces. 
For Spark on Kubernetes, since the driver always creates executor pods in the +same namespace, a `Role` is sufficient, although users may use a `ClusterRole` instead. For more information on +RBAC authorization and how to configure Kubernetes service accounts for pods, please refer to +[Using RBAC Authorization](https://kubernetes.io/docs/admin/authorization/rbac/) and +[Configure Service Accounts for Pods](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/). + +## Client Mode + +Client mode is not currently supported. + +## Future Work + +There are several Spark on Kubernetes features that are currently being incubated in a fork - +[apache-spark-on-k8s/spark](https://github.com/apache-spark-on-k8s/spark), which are expected to eventually make it into +future versions of the spark-kubernetes integration. + +Some of these include: + +* PySpark +* R +* Dynamic Executor Scaling +* Local File Dependency Management +* Spark Application Management +* Job Queues and Resource Management + +You can refer to the [documentation](https://apache-spark-on-k8s.github.io/userdocs/) if you want to try these features +and provide feedback to the development team. + +# Configuration + +See the [configuration page](configuration.html) for information on Spark configurations. The following configurations are +specific to Spark on Kubernetes. + +#### Spark Properties + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<table class="table">
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+  <tr>
+    <td><code>spark.kubernetes.namespace</code></td>
+    <td><code>default</code></td>
+    <td>The namespace that will be used for running the driver and executor pods.</td>
+  </tr>
    spark.kubernetes.driver.container.image(none) + Container image to use for the driver. + This is usually of the form `example.com/repo/spark-driver:v1.0.0`. + This configuration is required and must be provided by the user. +
    spark.kubernetes.executor.container.image(none) + Container image to use for the executors. + This is usually of the form `example.com/repo/spark-executor:v1.0.0`. + This configuration is required and must be provided by the user. +
+  <tr>
+    <td><code>spark.kubernetes.container.image.pullPolicy</code></td>
+    <td><code>IfNotPresent</code></td>
+    <td>Container image pull policy used when pulling images within Kubernetes.</td>
+  </tr>
+  <tr>
+    <td><code>spark.kubernetes.allocation.batch.size</code></td>
+    <td><code>5</code></td>
+    <td>Number of pods to launch at once in each round of executor pod allocation.</td>
+  </tr>
+  <tr>
+    <td><code>spark.kubernetes.allocation.batch.delay</code></td>
+    <td><code>1s</code></td>
+    <td>Time to wait between each round of executor pod allocation. Specifying values less than 1 second may lead to
+    excessive CPU usage on the spark driver.</td>
+  </tr>
    spark.kubernetes.authenticate.submission.caCertFile(none) + Path to the CA cert file for connecting to the Kubernetes API server over TLS when starting the driver. This file + must be located on the submitting machine's disk. Specify this as a path as opposed to a URI (i.e. do not provide + a scheme). +
    spark.kubernetes.authenticate.submission.clientKeyFile(none) + Path to the client key file for authenticating against the Kubernetes API server when starting the driver. This file + must be located on the submitting machine's disk. Specify this as a path as opposed to a URI (i.e. do not provide + a scheme). +
    spark.kubernetes.authenticate.submission.clientCertFile(none) + Path to the client cert file for authenticating against the Kubernetes API server when starting the driver. This + file must be located on the submitting machine's disk. Specify this as a path as opposed to a URI (i.e. do not + provide a scheme). +
    spark.kubernetes.authenticate.submission.oauthToken(none) + OAuth token to use when authenticating against the Kubernetes API server when starting the driver. Note + that unlike the other authentication options, this is expected to be the exact string value of the token to use for + the authentication. +
    spark.kubernetes.authenticate.submission.oauthTokenFile(none) + Path to the OAuth token file containing the token to use when authenticating against the Kubernetes API server when starting the driver. + This file must be located on the submitting machine's disk. Specify this as a path as opposed to a URI (i.e. do not + provide a scheme). +
    spark.kubernetes.authenticate.driver.caCertFile(none) + Path to the CA cert file for connecting to the Kubernetes API server over TLS from the driver pod when requesting + executors. This file must be located on the submitting machine's disk, and will be uploaded to the driver pod. + Specify this as a path as opposed to a URI (i.e. do not provide a scheme). +
    spark.kubernetes.authenticate.driver.clientKeyFile(none) + Path to the client key file for authenticating against the Kubernetes API server from the driver pod when requesting + executors. This file must be located on the submitting machine's disk, and will be uploaded to the driver pod. + Specify this as a path as opposed to a URI (i.e. do not provide a scheme). If this is specified, it is highly + recommended to set up TLS for the driver submission server, as this value is sensitive information that would be + passed to the driver pod in plaintext otherwise. +
    spark.kubernetes.authenticate.driver.clientCertFile(none) + Path to the client cert file for authenticating against the Kubernetes API server from the driver pod when + requesting executors. This file must be located on the submitting machine's disk, and will be uploaded to the + driver pod. Specify this as a path as opposed to a URI (i.e. do not provide a scheme). +
    spark.kubernetes.authenticate.driver.oauthToken(none) + OAuth token to use when authenticating against the Kubernetes API server from the driver pod when + requesting executors. Note that unlike the other authentication options, this must be the exact string value of + the token to use for the authentication. This token value is uploaded to the driver pod. If this is specified, it is + highly recommended to set up TLS for the driver submission server, as this value is sensitive information that would + be passed to the driver pod in plaintext otherwise. +
    spark.kubernetes.authenticate.driver.oauthTokenFile(none) + Path to the OAuth token file containing the token to use when authenticating against the Kubernetes API server from the driver pod when + requesting executors. Note that unlike the other authentication options, this file must contain the exact string value of + the token to use for the authentication. This token value is uploaded to the driver pod. If this is specified, it is + highly recommended to set up TLS for the driver submission server, as this value is sensitive information that would + be passed to the driver pod in plaintext otherwise. +
    spark.kubernetes.authenticate.driver.mounted.caCertFile(none) + Path to the CA cert file for connecting to the Kubernetes API server over TLS from the driver pod when requesting + executors. This path must be accessible from the driver pod. + Specify this as a path as opposed to a URI (i.e. do not provide a scheme). +
    spark.kubernetes.authenticate.driver.mounted.clientKeyFile(none) + Path to the client key file for authenticating against the Kubernetes API server from the driver pod when requesting + executors. This path must be accessible from the driver pod. + Specify this as a path as opposed to a URI (i.e. do not provide a scheme). +
    spark.kubernetes.authenticate.driver.mounted.clientCertFile(none) + Path to the client cert file for authenticating against the Kubernetes API server from the driver pod when + requesting executors. This path must be accessible from the driver pod. + Specify this as a path as opposed to a URI (i.e. do not provide a scheme). +
    spark.kubernetes.authenticate.driver.mounted.oauthTokenFile(none) + Path to the file containing the OAuth token to use when authenticating against the Kubernetes API server from the driver pod when + requesting executors. This path must be accessible from the driver pod. + Note that unlike the other authentication options, this file must contain the exact string value of the token to use for the authentication. +
+  <tr>
+    <td><code>spark.kubernetes.authenticate.driver.serviceAccountName</code></td>
+    <td><code>default</code></td>
+    <td>Service account that is used when running the driver pod. The driver pod uses this service account when requesting
+    executor pods from the API server. Note that this cannot be specified alongside a CA cert file, client key file,
+    client cert file, and/or OAuth token.</td>
+  </tr>
    spark.kubernetes.driver.label.[LabelName](none) + Add the label specified by LabelName to the driver pod. + For example, spark.kubernetes.driver.label.something=true. + Note that Spark also adds its own labels to the driver pod + for bookkeeping purposes. +
    spark.kubernetes.driver.annotation.[AnnotationName](none) + Add the annotation specified by AnnotationName to the driver pod. + For example, spark.kubernetes.driver.annotation.something=true. +
    spark.kubernetes.executor.label.[LabelName](none) + Add the label specified by LabelName to the executor pods. + For example, spark.kubernetes.executor.label.something=true. + Note that Spark also adds its own labels to the driver pod + for bookkeeping purposes. +
    spark.kubernetes.executor.annotation.[AnnotationName](none) + Add the annotation specified by AnnotationName to the executor pods. + For example, spark.kubernetes.executor.annotation.something=true. +
    spark.kubernetes.driver.pod.name(none) + Name of the driver pod. If not set, the driver pod name is set to "spark.app.name" suffixed by the current timestamp + to avoid name conflicts. +
    spark.kubernetes.executor.podNamePrefix(none) + Prefix for naming the executor pods. + If not set, the executor pod name is set to driver pod name suffixed by an integer. +
    spark.kubernetes.executor.lostCheck.maxAttempts10 + Number of times that the driver will try to ascertain the loss reason for a specific executor. + The loss reason is used to ascertain whether the executor failure is due to a framework or an application error + which in turn decides whether the executor is removed and replaced, or placed into a failed state for debugging. +
+  <tr>
+    <td><code>spark.kubernetes.submission.waitAppCompletion</code></td>
+    <td><code>true</code></td>
+    <td>In cluster mode, whether to wait for the application to finish before exiting the launcher process. When changed to
+    false, the launcher has a "fire-and-forget" behavior when launching the Spark job.</td>
+  </tr>
+  <tr>
+    <td><code>spark.kubernetes.report.interval</code></td>
+    <td><code>1s</code></td>
+    <td>Interval between reports of the current Spark job status in cluster mode.</td>
+  </tr>
    spark.kubernetes.driver.limit.cores(none) + Specify the hard CPU [limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) for the driver pod. +
    spark.kubernetes.executor.limit.cores(none) + Specify the hard CPU [limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) for each executor pod launched for the Spark Application. +
    spark.kubernetes.node.selector.[labelKey](none) + Adds to the node selector of the driver pod and executor pods, with key labelKey and the value as the + configuration's value. For example, setting spark.kubernetes.node.selector.identifier to myIdentifier + will result in the driver pod and executors having a node selector with key identifier and value + myIdentifier. Multiple node selector keys can be added by setting multiple configurations with this prefix. +
    spark.kubernetes.driverEnv.[EnvironmentVariableName](none) + Add the environment variable specified by EnvironmentVariableName to + the Driver process. The user can specify multiple of these to set multiple environment variables. +
+  <tr>
+    <td><code>spark.kubernetes.mountDependencies.jarsDownloadDir</code></td>
+    <td><code>/var/spark-data/spark-jars</code></td>
+    <td>Location to download jars to in the driver and executors.
+    This directory must be empty and will be mounted as an empty directory volume on the driver and executor pods.</td>
+  </tr>
+  <tr>
+    <td><code>spark.kubernetes.mountDependencies.filesDownloadDir</code></td>
+    <td><code>/var/spark-data/spark-files</code></td>
+    <td>Location to download files to in the driver and executors.
+    This directory must be empty and will be mounted as an empty directory volume on the driver and executor pods.</td>
+  </tr>
+</table>
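To show how a few of these properties combine in practice, here is a sketch of a `spark-submit` invocation; the API server address, namespace, service account, image names and jar path are illustrative placeholders rather than values shipped with Spark:

```bash
# All host names, image names and paths below are assumed placeholders.
bin/spark-submit \
  --master k8s://https://my-apiserver.example.com:443 \
  --deploy-mode cluster \
  --name spark-pi \
  --class org.apache.spark.examples.SparkPi \
  --conf spark.executor.instances=5 \
  --conf spark.kubernetes.namespace=spark-jobs \
  --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
  --conf spark.kubernetes.driver.container.image=example.com/repo/spark-driver:v2.3.0 \
  --conf spark.kubernetes.executor.container.image=example.com/repo/spark-executor:v2.3.0 \
  --conf spark.kubernetes.driver.label.app=spark-pi \
  local:///opt/spark/examples/jars/spark-examples.jar
```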
    \ No newline at end of file diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 7e2386f33b583..e7edec5990363 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -18,7 +18,9 @@ Spark application's configuration (driver, executors, and the AM when running in There are two deploy modes that can be used to launch Spark applications on YARN. In `cluster` mode, the Spark driver runs inside an application master process which is managed by YARN on the cluster, and the client can go away after initiating the application. In `client` mode, the driver runs in the client process, and the application master is only used for requesting resources from YARN. -Unlike [Spark standalone](spark-standalone.html) and [Mesos](running-on-mesos.html) modes, in which the master's address is specified in the `--master` parameter, in YARN mode the ResourceManager's address is picked up from the Hadoop configuration. Thus, the `--master` parameter is `yarn`. +Unlike other cluster managers supported by Spark in which the master's address is specified in the `--master` +parameter, in YARN mode the ResourceManager's address is picked up from the Hadoop configuration. +Thus, the `--master` parameter is `yarn`. To launch a Spark application in `cluster` mode: diff --git a/docs/submitting-applications.md b/docs/submitting-applications.md index 866d6e527549c..0473ab73a5e6c 100644 --- a/docs/submitting-applications.md +++ b/docs/submitting-applications.md @@ -127,6 +127,16 @@ export HADOOP_CONF_DIR=XXX http://path/to/examples.jar \ 1000 +# Run on a Kubernetes cluster in cluster deploy mode +./bin/spark-submit \ + --class org.apache.spark.examples.SparkPi \ + --master k8s://xx.yy.zz.ww:443 \ + --deploy-mode cluster \ + --executor-memory 20G \ + --num-executors 50 \ + http://path/to/examples.jar \ + 1000 + {% endhighlight %} # Master URLs @@ -155,6 +165,12 @@ The master URL passed to Spark can be in one of the following formats: client or cluster mode depending on the value of --deploy-mode. The cluster location will be found based on the HADOOP_CONF_DIR or YARN_CONF_DIR variable. + k8s://HOST:PORT Connect to a Kubernetes cluster in + cluster mode. Client mode is currently unsupported and will be supported in future releases. + The HOST and PORT refer to the [Kubernetes API Server](https://kubernetes.io/docs/reference/generated/kube-apiserver/). + It connects using TLS by default. In order to force it to use an unsecured connection, you can use + k8s://http://HOST:PORT. + diff --git a/sbin/build-push-docker-images.sh b/sbin/build-push-docker-images.sh new file mode 100755 index 0000000000000..4546e98dc2074 --- /dev/null +++ b/sbin/build-push-docker-images.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# This script builds and pushes docker images when run from a release of Spark +# with Kubernetes support. + +declare -A path=( [spark-driver]=kubernetes/dockerfiles/driver/Dockerfile \ + [spark-executor]=kubernetes/dockerfiles/executor/Dockerfile ) + +function build { + docker build -t spark-base -f kubernetes/dockerfiles/spark-base/Dockerfile . + for image in "${!path[@]}"; do + docker build -t ${REPO}/$image:${TAG} -f ${path[$image]} . + done +} + + +function push { + for image in "${!path[@]}"; do + docker push ${REPO}/$image:${TAG} + done +} + +function usage { + echo "This script must be run from a runnable distribution of Apache Spark." + echo "Usage: ./sbin/build-push-docker-images.sh -r -t build" + echo " ./sbin/build-push-docker-images.sh -r -t push" + echo "for example: ./sbin/build-push-docker-images.sh -r docker.io/myrepo -t v2.3.0 push" +} + +if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then + usage + exit 0 +fi + +while getopts r:t: option +do + case "${option}" + in + r) REPO=${OPTARG};; + t) TAG=${OPTARG};; + esac +done + +if [ -z "$REPO" ] || [ -z "$TAG" ]; then + usage +else + case "${@: -1}" in + build) build;; + push) push;; + *) usage;; + esac +fi From c0abb1d994bda50d964c555163cdfca5a7e56f64 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 22 Dec 2017 09:25:39 +0800 Subject: [PATCH 173/356] [SPARK-22854][UI] Read Spark version from event logs. The code was ignoring SparkListenerLogStart, which was added somewhat recently to record the Spark version used to generate an event log. Author: Marcelo Vanzin Closes #20049 from vanzin/SPARK-22854. --- .../scala/org/apache/spark/status/AppStatusListener.scala | 7 ++++++- .../org/apache/spark/status/AppStatusListenerSuite.scala | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 4db797e1d24c6..5253297137323 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -48,7 +48,7 @@ private[spark] class AppStatusListener( import config._ - private val sparkVersion = SPARK_VERSION + private var sparkVersion = SPARK_VERSION private var appInfo: v1.ApplicationInfo = null private var appSummary = new AppSummary(0, 0) private var coresPerTask: Int = 1 @@ -90,6 +90,11 @@ private[spark] class AppStatusListener( } } + override def onOtherEvent(event: SparkListenerEvent): Unit = event match { + case SparkListenerLogStart(version) => sparkVersion = version + case _ => + } + override def onApplicationStart(event: SparkListenerApplicationStart): Unit = { assert(event.appId.isDefined, "Application without IDs are not supported.") diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index 9cf4f7efb24a8..c0b3a79fe981e 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -103,6 +103,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { test("scheduler events") { val listener = new AppStatusListener(store, conf, true) + listener.onOtherEvent(SparkListenerLogStart("TestSparkVersion")) + // Start the application. 
time += 1 listener.onApplicationStart(SparkListenerApplicationStart( @@ -125,6 +127,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { assert(attempt.endTime.getTime() === -1L) assert(attempt.sparkUser === "user") assert(!attempt.completed) + assert(attempt.appSparkVersion === "TestSparkVersion") } // Start a couple of executors. From c6f01cadede490bde987d067becef14442f1e4a1 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 22 Dec 2017 10:13:26 +0800 Subject: [PATCH 174/356] [SPARK-22750][SQL] Reuse mutable states when possible ## What changes were proposed in this pull request? The PR introduces a new method `addImmutableStateIfNotExists ` to `CodeGenerator` to allow reusing and sharing the same global variable between different Expressions. This helps reducing the number of global variables needed, which is important to limit the impact on the constant pool. ## How was this patch tested? added UTs Author: Marco Gaido Author: Marco Gaido Closes #19940 from mgaido91/SPARK-22750. --- .../MonotonicallyIncreasingID.scala | 3 +- .../expressions/SparkPartitionID.scala | 3 +- .../expressions/codegen/CodeGenerator.scala | 40 +++++++++++++++++++ .../expressions/datetimeExpressions.scala | 12 ++++-- .../expressions/objects/objects.scala | 24 +++++++---- .../expressions/CodeGenerationSuite.scala | 12 ++++++ 6 files changed, 80 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala index 784eaf8195194..11fb579dfa88c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/MonotonicallyIncreasingID.scala @@ -66,7 +66,8 @@ case class MonotonicallyIncreasingID() extends LeafExpression with Nondeterminis override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val countTerm = ctx.addMutableState(ctx.JAVA_LONG, "count") - val partitionMaskTerm = ctx.addMutableState(ctx.JAVA_LONG, "partitionMask") + val partitionMaskTerm = "partitionMask" + ctx.addImmutableStateIfNotExists(ctx.JAVA_LONG, partitionMaskTerm) ctx.addPartitionInitializationStatement(s"$countTerm = 0L;") ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala index 736ca37c6d54a..a160b9b275290 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/SparkPartitionID.scala @@ -43,7 +43,8 @@ case class SparkPartitionID() extends LeafExpression with Nondeterministic { override protected def evalInternal(input: InternalRow): Int = partitionId override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - val idTerm = ctx.addMutableState(ctx.JAVA_INT, "partitionId") + val idTerm = "partitionId" + ctx.addImmutableStateIfNotExists(ctx.JAVA_INT, idTerm) ctx.addPartitionInitializationStatement(s"$idTerm = partitionIndex;") ev.copy(code = s"final ${ctx.javaType(dataType)} ${ev.value} = $idTerm;", isNull = "false") } diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 9adf632ddcde8..d6eccadcfb63e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -207,6 +207,14 @@ class CodegenContext { } + /** + * A map containing the mutable states which have been defined so far using + * `addImmutableStateIfNotExists`. Each entry contains the name of the mutable state as key and + * its Java type and init code as value. + */ + private val immutableStates: mutable.Map[String, (String, String)] = + mutable.Map.empty[String, (String, String)] + /** * Add a mutable state as a field to the generated class. c.f. the comments above. * @@ -265,6 +273,38 @@ class CodegenContext { } } + /** + * Add an immutable state as a field to the generated class only if it does not exist yet a field + * with that name. This helps reducing the number of the generated class' fields, since the same + * variable can be reused by many functions. + * + * Even though the added variables are not declared as final, they should never be reassigned in + * the generated code to prevent errors and unexpected behaviors. + * + * Internally, this method calls `addMutableState`. + * + * @param javaType Java type of the field. + * @param variableName Name of the field. + * @param initFunc Function includes statement(s) to put into the init() method to initialize + * this field. The argument is the name of the mutable state variable. + */ + def addImmutableStateIfNotExists( + javaType: String, + variableName: String, + initFunc: String => String = _ => ""): Unit = { + val existingImmutableState = immutableStates.get(variableName) + if (existingImmutableState.isEmpty) { + addMutableState(javaType, variableName, initFunc, useFreshName = false, forceInline = true) + immutableStates(variableName) = (javaType, initFunc(variableName)) + } else { + val (prevJavaType, prevInitCode) = existingImmutableState.get + assert(prevJavaType == javaType, s"$variableName has already been defined with type " + + s"$prevJavaType and now it is tried to define again with type $javaType.") + assert(prevInitCode == initFunc(variableName), s"$variableName has already been defined " + + s"with different initialization statements.") + } + } + /** * Add buffer variable which stores data coming from an [[InternalRow]]. 
This methods guarantees * that the variable is safely stored, which is important for (potentially) byte array backed diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 59c3e3d9947a3..7a674ea7f4d76 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -443,7 +443,8 @@ case class DayOfWeek(child: Expression) extends UnaryExpression with ImplicitCas nullSafeCodeGen(ctx, ev, time => { val cal = classOf[Calendar].getName val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") - val c = ctx.addMutableState(cal, "cal", + val c = "calDayOfWeek" + ctx.addImmutableStateIfNotExists(cal, c, v => s"""$v = $cal.getInstance($dtu.getTimeZone("UTC"));""") s""" $c.setTimeInMillis($time * 1000L * 3600L * 24L); @@ -484,8 +485,9 @@ case class WeekOfYear(child: Expression) extends UnaryExpression with ImplicitCa override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { nullSafeCodeGen(ctx, ev, time => { val cal = classOf[Calendar].getName + val c = "calWeekOfYear" val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") - val c = ctx.addMutableState(cal, "cal", v => + ctx.addImmutableStateIfNotExists(cal, c, v => s""" |$v = $cal.getInstance($dtu.getTimeZone("UTC")); |$v.setFirstDayOfWeek($cal.MONDAY); @@ -1017,7 +1019,8 @@ case class FromUTCTimestamp(left: Expression, right: Expression) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") val tzTerm = ctx.addMutableState(tzClass, "tz", v => s"""$v = $dtu.getTimeZone("$tz");""") - val utcTerm = ctx.addMutableState(tzClass, "utc", + val utcTerm = "tzUTC" + ctx.addImmutableStateIfNotExists(tzClass, utcTerm, v => s"""$v = $dtu.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" @@ -1193,7 +1196,8 @@ case class ToUTCTimestamp(left: Expression, right: Expression) val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") val tzTerm = ctx.addMutableState(tzClass, "tz", v => s"""$v = $dtu.getTimeZone("$tz");""") - val utcTerm = ctx.addMutableState(tzClass, "utc", + val utcTerm = "tzUTC" + ctx.addImmutableStateIfNotExists(tzClass, utcTerm, v => s"""$v = $dtu.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index a59aad5be8715..4af813456b790 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -1148,17 +1148,21 @@ case class EncodeUsingSerializer(child: Expression, kryo: Boolean) override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // Code to initialize the serializer. 
- val (serializerClass, serializerInstanceClass) = { + val (serializer, serializerClass, serializerInstanceClass) = { if (kryo) { - (classOf[KryoSerializer].getName, classOf[KryoSerializerInstance].getName) + ("kryoSerializer", + classOf[KryoSerializer].getName, + classOf[KryoSerializerInstance].getName) } else { - (classOf[JavaSerializer].getName, classOf[JavaSerializerInstance].getName) + ("javaSerializer", + classOf[JavaSerializer].getName, + classOf[JavaSerializerInstance].getName) } } // try conf from env, otherwise create a new one val env = s"${classOf[SparkEnv].getName}.get()" val sparkConf = s"new ${classOf[SparkConf].getName}()" - val serializer = ctx.addMutableState(serializerInstanceClass, "serializerForEncode", v => + ctx.addImmutableStateIfNotExists(serializerInstanceClass, serializer, v => s""" |if ($env == null) { | $v = ($serializerInstanceClass) new $serializerClass($sparkConf).newInstance(); @@ -1193,17 +1197,21 @@ case class DecodeUsingSerializer[T](child: Expression, tag: ClassTag[T], kryo: B override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { // Code to initialize the serializer. - val (serializerClass, serializerInstanceClass) = { + val (serializer, serializerClass, serializerInstanceClass) = { if (kryo) { - (classOf[KryoSerializer].getName, classOf[KryoSerializerInstance].getName) + ("kryoSerializer", + classOf[KryoSerializer].getName, + classOf[KryoSerializerInstance].getName) } else { - (classOf[JavaSerializer].getName, classOf[JavaSerializerInstance].getName) + ("javaSerializer", + classOf[JavaSerializer].getName, + classOf[JavaSerializerInstance].getName) } } // try conf from env, otherwise create a new one val env = s"${classOf[SparkEnv].getName}.get()" val sparkConf = s"new ${classOf[SparkConf].getName}()" - val serializer = ctx.addMutableState(serializerInstanceClass, "serializerForDecode", v => + ctx.addImmutableStateIfNotExists(serializerInstanceClass, serializer, v => s""" |if ($env == null) { | $v = ($serializerInstanceClass) new $serializerClass($sparkConf).newInstance(); diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index b1a44528e64d7..676ba3956ddc8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -424,4 +424,16 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { assert(ctx2.arrayCompactedMutableStates("InternalRow[]").getCurrentIndex == 10) assert(ctx2.mutableStateInitCode.size == CodeGenerator.MUTABLESTATEARRAY_SIZE_LIMIT + 10) } + + test("SPARK-22750: addImmutableStateIfNotExists") { + val ctx = new CodegenContext + val mutableState1 = "field1" + val mutableState2 = "field2" + ctx.addImmutableStateIfNotExists("int", mutableState1) + ctx.addImmutableStateIfNotExists("int", mutableState1) + ctx.addImmutableStateIfNotExists("String", mutableState2) + ctx.addImmutableStateIfNotExists("int", mutableState1) + ctx.addImmutableStateIfNotExists("String", mutableState2) + assert(ctx.inlinedMutableStates.length == 2) + } } From a36b78b0e420b909bde0cec4349cdc2103853b91 Mon Sep 17 00:00:00 2001 From: Zheng RuiFeng Date: Thu, 21 Dec 2017 20:20:04 -0600 Subject: [PATCH 175/356] [SPARK-22450][CORE][MLLIB][FOLLOWUP] safely register class for mllib - LabeledPoint/VectorWithNorm/TreePoint ## 
What changes were proposed in this pull request? register following classes in Kryo: `org.apache.spark.mllib.regression.LabeledPoint` `org.apache.spark.mllib.clustering.VectorWithNorm` `org.apache.spark.ml.feature.LabeledPoint` `org.apache.spark.ml.tree.impl.TreePoint` `org.apache.spark.ml.tree.impl.BaggedPoint` seems also need to be registered, but I don't know how to do it in this safe way. WeichenXu123 cloud-fan ## How was this patch tested? added tests Author: Zheng RuiFeng Closes #19950 from zhengruifeng/labeled_kryo. --- .../spark/serializer/KryoSerializer.scala | 27 +++++++------ ...InstanceSuit.scala => InstanceSuite.scala} | 24 ++++++------ .../spark/ml/feature/LabeledPointSuite.scala | 39 +++++++++++++++++++ .../spark/ml/tree/impl/TreePointSuite.scala | 35 +++++++++++++++++ .../spark/mllib/clustering/KMeansSuite.scala | 18 ++++++++- .../mllib/regression/LabeledPointSuite.scala | 18 ++++++++- 6 files changed, 135 insertions(+), 26 deletions(-) rename mllib/src/test/scala/org/apache/spark/ml/feature/{InstanceSuit.scala => InstanceSuite.scala} (79%) create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/LabeledPointSuite.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreePointSuite.scala diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala index 2259d1a2d555d..538ae05e4eea1 100644 --- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala +++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala @@ -181,20 +181,25 @@ class KryoSerializer(conf: SparkConf) // We can't load those class directly in order to avoid unnecessary jar dependencies. // We load them safely, ignore it if the class not found. 
- Seq("org.apache.spark.mllib.linalg.Vector", - "org.apache.spark.mllib.linalg.DenseVector", - "org.apache.spark.mllib.linalg.SparseVector", - "org.apache.spark.mllib.linalg.Matrix", - "org.apache.spark.mllib.linalg.DenseMatrix", - "org.apache.spark.mllib.linalg.SparseMatrix", - "org.apache.spark.ml.linalg.Vector", + Seq( + "org.apache.spark.ml.feature.Instance", + "org.apache.spark.ml.feature.LabeledPoint", + "org.apache.spark.ml.feature.OffsetInstance", + "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.DenseVector", - "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", - "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix", - "org.apache.spark.ml.feature.Instance", - "org.apache.spark.ml.feature.OffsetInstance" + "org.apache.spark.ml.linalg.SparseVector", + "org.apache.spark.ml.linalg.Vector", + "org.apache.spark.ml.tree.impl.TreePoint", + "org.apache.spark.mllib.clustering.VectorWithNorm", + "org.apache.spark.mllib.linalg.DenseMatrix", + "org.apache.spark.mllib.linalg.DenseVector", + "org.apache.spark.mllib.linalg.Matrix", + "org.apache.spark.mllib.linalg.SparseMatrix", + "org.apache.spark.mllib.linalg.SparseVector", + "org.apache.spark.mllib.linalg.Vector", + "org.apache.spark.mllib.regression.LabeledPoint" ).foreach { name => try { val clazz = Utils.classForName(name) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/InstanceSuit.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/InstanceSuite.scala similarity index 79% rename from mllib/src/test/scala/org/apache/spark/ml/feature/InstanceSuit.scala rename to mllib/src/test/scala/org/apache/spark/ml/feature/InstanceSuite.scala index 88c85a9425e78..cca7399b4b9c5 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/InstanceSuit.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/InstanceSuite.scala @@ -17,31 +17,29 @@ package org.apache.spark.ml.feature -import scala.reflect.ClassTag - import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.serializer.KryoSerializer -class InstanceSuit extends SparkFunSuite{ +class InstanceSuite extends SparkFunSuite{ test("Kryo class register") { val conf = new SparkConf(false) conf.set("spark.kryo.registrationRequired", "true") - val ser = new KryoSerializer(conf) - val serInstance = new KryoSerializer(conf).newInstance() - - def check[T: ClassTag](t: T) { - assert(serInstance.deserialize[T](serInstance.serialize(t)) === t) - } + val ser = new KryoSerializer(conf).newInstance() val instance1 = Instance(19.0, 2.0, Vectors.dense(1.0, 7.0)) val instance2 = Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse) + Seq(instance1, instance2).foreach { i => + val i2 = ser.deserialize[Instance](ser.serialize(i)) + assert(i === i2) + } + val oInstance1 = OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)) val oInstance2 = OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0).toSparse) - check(instance1) - check(instance2) - check(oInstance1) - check(oInstance2) + Seq(oInstance1, oInstance2).foreach { o => + val o2 = ser.deserialize[OffsetInstance](ser.serialize(o)) + assert(o === o2) + } } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/LabeledPointSuite.scala new file mode 100644 index 0000000000000..05c7a58ee5ffd --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/LabeledPointSuite.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.serializer.KryoSerializer + +class LabeledPointSuite extends SparkFunSuite { + test("Kryo class register") { + val conf = new SparkConf(false) + conf.set("spark.kryo.registrationRequired", "true") + + val ser = new KryoSerializer(conf).newInstance() + + val labeled1 = LabeledPoint(1.0, Vectors.dense(Array(1.0, 2.0))) + val labeled2 = LabeledPoint(1.0, Vectors.sparse(10, Array(5, 7), Array(1.0, 2.0))) + + Seq(labeled1, labeled2).foreach { l => + val l2 = ser.deserialize[LabeledPoint](ser.serialize(l)) + assert(l === l2) + } + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreePointSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreePointSuite.scala new file mode 100644 index 0000000000000..f41abe48f2c58 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/tree/impl/TreePointSuite.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.tree.impl + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.serializer.KryoSerializer + +class TreePointSuite extends SparkFunSuite { + test("Kryo class register") { + val conf = new SparkConf(false) + conf.set("spark.kryo.registrationRequired", "true") + + val ser = new KryoSerializer(conf).newInstance() + + val point = new TreePoint(1.0, Array(1, 2, 3)) + val point2 = ser.deserialize[TreePoint](ser.serialize(point)) + assert(point.label === point2.label) + assert(point.binnedFeatures === point2.binnedFeatures) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala index 48bd41dc3e3bf..00d7e2f2d3864 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala @@ -19,10 +19,11 @@ package org.apache.spark.mllib.clustering import scala.util.Random -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext} import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.Utils class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { @@ -311,6 +312,21 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { assert(returnModel.clusterCenters(1) === initialModel.clusterCenters(1)) } + test("Kryo class register") { + val conf = new SparkConf(false) + conf.set("spark.kryo.registrationRequired", "true") + + val ser = new KryoSerializer(conf).newInstance() + + val vec1 = new VectorWithNorm(Vectors.dense(Array(1.0, 2.0))) + val vec2 = new VectorWithNorm(Vectors.sparse(10, Array(5, 8), Array(1.0, 2.0))) + + Seq(vec1, vec2).foreach { v => + val v2 = ser.deserialize[VectorWithNorm](ser.serialize(v)) + assert(v2.norm === v.norm) + assert(v2.vector === v.vector) + } + } } object KMeansSuite extends SparkFunSuite { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala index 252a068dcd72f..c1449ece740d4 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LabeledPointSuite.scala @@ -17,9 +17,10 @@ package org.apache.spark.mllib.regression -import org.apache.spark.SparkFunSuite +import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.serializer.KryoSerializer class LabeledPointSuite extends SparkFunSuite { @@ -53,4 +54,19 @@ class LabeledPointSuite extends SparkFunSuite { assert(p1 === LabeledPoint.fromML(p2)) } } + + test("Kryo class register") { + val conf = new SparkConf(false) + conf.set("spark.kryo.registrationRequired", "true") + + val ser = new KryoSerializer(conf).newInstance() + + val labeled1 = LabeledPoint(1.0, Vectors.dense(Array(1.0, 2.0))) + val labeled2 = LabeledPoint(1.0, Vectors.sparse(10, Array(5, 7), Array(1.0, 2.0))) + + Seq(labeled1, labeled2).foreach { l => + val l2 = ser.deserialize[LabeledPoint](ser.serialize(l)) + assert(l === l2) + } + } } From 
22e1849bcfb3ef988f4f9a5c2783bfc7ec001694 Mon Sep 17 00:00:00 2001 From: Anirudh Ramanathan Date: Thu, 21 Dec 2017 21:03:10 -0800 Subject: [PATCH 176/356] [SPARK-22866][K8S] Fix path issue in Kubernetes dockerfile ## What changes were proposed in this pull request? The path was recently changed in https://github.com/apache/spark/pull/19946, but the dockerfile was not updated. This is a trivial 1 line fix. ## How was this patch tested? `./sbin/build-push-docker-images.sh -r spark-repo -t latest build` cc/ vanzin mridulm rxin jiangxb1987 liyinan926 Author: Anirudh Ramanathan Author: foxish Closes #20051 from foxish/patch-1. --- .../kubernetes/docker/src/main/dockerfiles/driver/Dockerfile | 2 +- .../docker/src/main/dockerfiles/executor/Dockerfile | 2 +- .../docker/src/main/dockerfiles/spark-base/Dockerfile | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile index d16349559466d..9b682f8673c69 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile @@ -22,7 +22,7 @@ FROM spark-base # If this docker file is being used in the context of building your images from a Spark # distribution, the docker build command should be invoked from the top level directory # of the Spark distribution. E.g.: -# docker build -t spark-driver:latest -f dockerfiles/spark-base/Dockerfile . +# docker build -t spark-driver:latest -f kubernetes/dockerfiles/spark-base/Dockerfile . COPY examples /opt/spark/examples diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile index 0e38169b8efdc..168cd4cb6c57a 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile @@ -22,7 +22,7 @@ FROM spark-base # If this docker file is being used in the context of building your images from a Spark # distribution, the docker build command should be invoked from the top level directory # of the Spark distribution. E.g.: -# docker build -t spark-executor:latest -f dockerfiles/spark-base/Dockerfile . +# docker build -t spark-executor:latest -f kubernetes/dockerfiles/spark-base/Dockerfile . COPY examples /opt/spark/examples diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile index 20316c9c5098a..222e777db3a82 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile @@ -22,7 +22,7 @@ FROM openjdk:8-alpine # If this docker file is being used in the context of building your images from a Spark # distribution, the docker build command should be invoked from the top level directory # of the Spark distribution. E.g.: -# docker build -t spark-base:latest -f dockerfiles/spark-base/Dockerfile . +# docker build -t spark-base:latest -f kubernetes/dockerfiles/spark-base/Dockerfile . 
RUN set -ex && \ apk upgrade --no-cache && \ @@ -38,7 +38,7 @@ COPY jars /opt/spark/jars COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin COPY conf /opt/spark/conf -COPY dockerfiles/spark-base/entrypoint.sh /opt/ +COPY kubernetes/dockerfiles/spark-base/entrypoint.sh /opt/ ENV SPARK_HOME /opt/spark From 8df1da396f64bb7fe76d73cd01498fdf3b8ed964 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 21 Dec 2017 21:38:16 -0800 Subject: [PATCH 177/356] [SPARK-22862] Docs on lazy elimination of columns missing from an encoder This behavior has confused some users, so lets clarify it. Author: Michael Armbrust Closes #20048 from marmbrus/datasetAsDocs. --- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index ef00562672a7e..209b800fdc6f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -401,6 +401,10 @@ class Dataset[T] private[sql]( * If the schema of the Dataset does not match the desired `U` type, you can use `select` * along with `alias` or `as` to rearrange or rename as required. * + * Note that `as[]` only changes the view of the data that is passed into typed operations, + * such as `map()`, and does not eagerly project away any columns that are not present in + * the specified class. + * * @group basic * @since 1.6.0 */ From 13190a4f60c081a68812df6df1d8262779cd6fcb Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Fri, 22 Dec 2017 20:09:51 +0900 Subject: [PATCH 178/356] [SPARK-22874][PYSPARK][SQL] Modify checking pandas version to use LooseVersion. ## What changes were proposed in this pull request? Currently we check pandas version by capturing if `ImportError` for the specific imports is raised or not but we can compare `LooseVersion` of the version strings as the same as we're checking pyarrow version. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #20054 from ueshin/issues/SPARK-22874. --- python/pyspark/sql/dataframe.py | 4 ++-- python/pyspark/sql/session.py | 15 +++++++-------- python/pyspark/sql/tests.py | 7 ++++--- python/pyspark/sql/types.py | 33 +++++++++++++-------------------- python/pyspark/sql/udf.py | 4 ++-- python/pyspark/sql/utils.py | 11 ++++++++++- 6 files changed, 38 insertions(+), 36 deletions(-) diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 440684d3edfa6..95eca76fa9888 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -1906,9 +1906,9 @@ def toPandas(self): if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true": try: from pyspark.sql.types import _check_dataframe_localize_timestamps - from pyspark.sql.utils import _require_minimum_pyarrow_version + from pyspark.sql.utils import require_minimum_pyarrow_version import pyarrow - _require_minimum_pyarrow_version() + require_minimum_pyarrow_version() tables = self._collectAsArrow() if tables: table = pyarrow.concat_tables(tables) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 86db16eca7889..6e5eec48e8aca 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -493,15 +493,14 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone): data types will be used to coerce the data in Pandas to Arrow conversion. 
""" from pyspark.serializers import ArrowSerializer, _create_batch - from pyspark.sql.types import from_arrow_schema, to_arrow_type, \ - _old_pandas_exception_message, TimestampType - from pyspark.sql.utils import _require_minimum_pyarrow_version - try: - from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype - except ImportError as e: - raise ImportError(_old_pandas_exception_message(e)) + from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType + from pyspark.sql.utils import require_minimum_pandas_version, \ + require_minimum_pyarrow_version + + require_minimum_pandas_version() + require_minimum_pyarrow_version() - _require_minimum_pyarrow_version() + from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype # Determine arrow types to coerce data when creating batches if isinstance(schema, StructType): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 6fdfda1cc831b..b977160af566d 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -53,7 +53,8 @@ try: import pandas try: - import pandas.api + from pyspark.sql.utils import require_minimum_pandas_version + require_minimum_pandas_version() _have_pandas = True except: _have_old_pandas = True @@ -2600,7 +2601,7 @@ def test_to_pandas(self): @unittest.skipIf(not _have_old_pandas, "Old Pandas not installed") def test_to_pandas_old(self): with QuietTest(self.sc): - with self.assertRaisesRegexp(ImportError, 'Pandas \(.*\) must be installed'): + with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'): self._to_pandas() @unittest.skipIf(not _have_pandas, "Pandas not installed") @@ -2643,7 +2644,7 @@ def test_create_dataframe_from_old_pandas(self): pdf = pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)], "d": [pd.Timestamp.now().date()]}) with QuietTest(self.sc): - with self.assertRaisesRegexp(ImportError, 'Pandas \(.*\) must be installed'): + with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'): self.spark.createDataFrame(pdf) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 46d9a417414b5..063264a89379c 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1678,13 +1678,6 @@ def from_arrow_schema(arrow_schema): for field in arrow_schema]) -def _old_pandas_exception_message(e): - """ Create an error message for importing old Pandas. - """ - msg = "note: Pandas (>=0.19.2) must be installed and available on calling Python process" - return "%s\n%s" % (_exception_message(e), msg) - - def _check_dataframe_localize_timestamps(pdf, timezone): """ Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone @@ -1693,10 +1686,10 @@ def _check_dataframe_localize_timestamps(pdf, timezone): :param timezone: the timezone to convert. if None then use local timezone :return pandas.DataFrame where any timezone aware columns have been converted to tz-naive """ - try: - from pandas.api.types import is_datetime64tz_dtype - except ImportError as e: - raise ImportError(_old_pandas_exception_message(e)) + from pyspark.sql.utils import require_minimum_pandas_version + require_minimum_pandas_version() + + from pandas.api.types import is_datetime64tz_dtype tz = timezone or 'tzlocal()' for column, series in pdf.iteritems(): # TODO: handle nested timestamps, such as ArrayType(TimestampType())? @@ -1714,10 +1707,10 @@ def _check_series_convert_timestamps_internal(s, timezone): :param timezone: the timezone to convert. 
if None then use local timezone :return pandas.Series where if it is a timestamp, has been UTC normalized without a time zone """ - try: - from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype - except ImportError as e: - raise ImportError(_old_pandas_exception_message(e)) + from pyspark.sql.utils import require_minimum_pandas_version + require_minimum_pandas_version() + + from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if is_datetime64_dtype(s.dtype): tz = timezone or 'tzlocal()' @@ -1737,11 +1730,11 @@ def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone): :param to_timezone: the timezone to convert to. if None then use local timezone :return pandas.Series where if it is a timestamp, has been converted to tz-naive """ - try: - import pandas as pd - from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype - except ImportError as e: - raise ImportError(_old_pandas_exception_message(e)) + from pyspark.sql.utils import require_minimum_pandas_version + require_minimum_pandas_version() + + import pandas as pd + from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype from_tz = from_timezone or 'tzlocal()' to_tz = to_timezone or 'tzlocal()' # TODO: handle nested timestamps, such as ArrayType(TimestampType())? diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 50c87ba1ac882..123138117fdc3 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -37,9 +37,9 @@ def _create_udf(f, returnType, evalType): if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF or \ evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF: import inspect - from pyspark.sql.utils import _require_minimum_pyarrow_version + from pyspark.sql.utils import require_minimum_pyarrow_version - _require_minimum_pyarrow_version() + require_minimum_pyarrow_version() argspec = inspect.getargspec(f) if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF and len(argspec.args) == 0 and \ diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index cc7dabb64b3ec..fb7d42a35d8f4 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -112,7 +112,16 @@ def toJArray(gateway, jtype, arr): return jarr -def _require_minimum_pyarrow_version(): +def require_minimum_pandas_version(): + """ Raise ImportError if minimum version of Pandas is not installed + """ + from distutils.version import LooseVersion + import pandas + if LooseVersion(pandas.__version__) < LooseVersion('0.19.2'): + raise ImportError("Pandas >= 0.19.2 must be installed on calling Python process") + + +def require_minimum_pyarrow_version(): """ Raise ImportError if minimum version of pyarrow is not installed """ from distutils.version import LooseVersion From d23dc5b8ef6c6aee0a31a304eefeb6ddb1c26c0f Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Fri, 22 Dec 2017 14:05:57 -0800 Subject: [PATCH 179/356] [SPARK-22346][ML] VectorSizeHint Transformer for using VectorAssembler in StructuredSteaming ## What changes were proposed in this pull request? A new VectorSizeHint transformer was added. This transformer is meant to be used as a pipeline stage ahead of VectorAssembler, on vector columns, so that VectorAssembler can join vectors in a streaming context where the size of the input vectors is otherwise not known. ## How was this patch tested? Unit tests. Please review http://spark.apache.org/contributing.html before opening a pull request. 
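For context on the intended usage (a sketch for illustration only, not part of this patch's diff), the new transformer is placed ahead of VectorAssembler so that the assembler has vector-size metadata available on a stream. The streaming DataFrame `streamingDF` and its vector columns `a` and `b` are assumed here, mirroring the streaming test added in VectorSizeHintSuite:

```scala
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorSizeHint}

// `streamingDF` is an assumed streaming DataFrame with vector columns "a" (size 3) and "b" (size 4).
val sizeHintA = new VectorSizeHint()
  .setInputCol("a")
  .setSize(3)
  .setHandleInvalid("error")   // "skip" and "optimistic" are the other supported modes
val sizeHintB = new VectorSizeHint()
  .setInputCol("b")
  .setSize(4)
val assembler = new VectorAssembler()
  .setInputCols(Array("a", "b"))
  .setOutputCol("assembled")

// With size metadata attached by the hints, VectorAssembler can run on the streaming DataFrame.
val pipeline = new Pipeline().setStages(Array(sizeHintA, sizeHintB, assembler))
val assembled = pipeline.fit(streamingDF).transform(streamingDF).select("assembled")
```

Without the size hints, VectorAssembler cannot determine the assembled vector size on a streaming DataFrame, since it cannot inspect the data to infer missing sizes in a streaming context.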
Author: Bago Amirbekian Closes #19746 from MrBago/vector-size-hint. --- .../spark/ml/feature/VectorSizeHint.scala | 195 ++++++++++++++++++ .../ml/feature/VectorSizeHintSuite.scala | 189 +++++++++++++++++ 2 files changed, 384 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/VectorSizeHint.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/VectorSizeHintSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSizeHint.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSizeHint.scala new file mode 100644 index 0000000000000..1fe3cfc74c76d --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSizeHint.scala @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.spark.SparkException +import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.attribute.AttributeGroup +import org.apache.spark.ml.linalg.{Vector, VectorUDT} +import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCol} +import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} +import org.apache.spark.sql.{Column, DataFrame, Dataset} +import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.types.StructType + +/** + * :: Experimental :: + * A feature transformer that adds size information to the metadata of a vector column. + * VectorAssembler needs size information for its input columns and cannot be used on streaming + * dataframes without this metadata. + * + */ +@Experimental +@Since("2.3.0") +class VectorSizeHint @Since("2.3.0") (@Since("2.3.0") override val uid: String) + extends Transformer with HasInputCol with HasHandleInvalid with DefaultParamsWritable { + + @Since("2.3.0") + def this() = this(Identifiable.randomUID("vectSizeHint")) + + /** + * The size of Vectors in `inputCol`. + * @group param + */ + @Since("2.3.0") + val size: IntParam = new IntParam( + this, + "size", + "Size of vectors in column.", + {s: Int => s >= 0}) + + /** group getParam */ + @Since("2.3.0") + def getSize: Int = getOrDefault(size) + + /** @group setParam */ + @Since("2.3.0") + def setSize(value: Int): this.type = set(size, value) + + /** @group setParam */ + @Since("2.3.0") + def setInputCol(value: String): this.type = set(inputCol, value) + + /** + * Param for how to handle invalid entries. Invalid vectors include nulls and vectors with the + * wrong size. 
The options are `skip` (filter out rows with invalid vectors), `error` (throw an + * error) and `optimistic` (do not check the vector size, and keep all rows). `error` by default. + * + * Note: Users should take care when setting this param to `optimistic`. The use of the + * `optimistic` option will prevent the transformer from validating the sizes of vectors in + * `inputCol`. A mismatch between the metadata of a column and its contents could result in + * unexpected behaviour or errors when using that column. + * + * @group param + */ + @Since("2.3.0") + override val handleInvalid: Param[String] = new Param[String]( + this, + "handleInvalid", + "How to handle invalid vectors in inputCol. Invalid vectors include nulls and vectors with " + + "the wrong size. The options are `skip` (filter out rows with invalid vectors), `error` " + + "(throw an error) and `optimistic` (do not check the vector size, and keep all rows). " + + "`error` by default.", + ParamValidators.inArray(VectorSizeHint.supportedHandleInvalids)) + + /** @group setParam */ + @Since("2.3.0") + def setHandleInvalid(value: String): this.type = set(handleInvalid, value) + setDefault(handleInvalid, VectorSizeHint.ERROR_INVALID) + + @Since("2.3.0") + override def transform(dataset: Dataset[_]): DataFrame = { + val localInputCol = getInputCol + val localSize = getSize + val localHandleInvalid = getHandleInvalid + + val group = AttributeGroup.fromStructField(dataset.schema(localInputCol)) + val newGroup = validateSchemaAndSize(dataset.schema, group) + if (localHandleInvalid == VectorSizeHint.OPTIMISTIC_INVALID && group.size == localSize) { + dataset.toDF() + } else { + val newCol: Column = localHandleInvalid match { + case VectorSizeHint.OPTIMISTIC_INVALID => col(localInputCol) + case VectorSizeHint.ERROR_INVALID => + val checkVectorSizeUDF = udf { vector: Vector => + if (vector == null) { + throw new SparkException(s"Got null vector in VectorSizeHint, set `handleInvalid` " + + s"to 'skip' to filter invalid rows.") + } + if (vector.size != localSize) { + throw new SparkException(s"VectorSizeHint Expecting a vector of size $localSize but" + + s" got ${vector.size}") + } + vector + }.asNondeterministic() + checkVectorSizeUDF(col(localInputCol)) + case VectorSizeHint.SKIP_INVALID => + val checkVectorSizeUDF = udf { vector: Vector => + if (vector != null && vector.size == localSize) { + vector + } else { + null + } + } + checkVectorSizeUDF(col(localInputCol)) + } + + val res = dataset.withColumn(localInputCol, newCol.as(localInputCol, newGroup.toMetadata())) + if (localHandleInvalid == VectorSizeHint.SKIP_INVALID) { + res.na.drop(Array(localInputCol)) + } else { + res + } + } + } + + /** + * Checks that schema can be updated with new size and returns a new attribute group with + * updated size. + */ + private def validateSchemaAndSize(schema: StructType, group: AttributeGroup): AttributeGroup = { + // This will throw a NoSuchElementException if params are not set. + val localSize = getSize + val localInputCol = getInputCol + + val inputColType = schema(getInputCol).dataType + require( + inputColType.isInstanceOf[VectorUDT], + s"Input column, $getInputCol must be of Vector type, got $inputColType" + ) + group.size match { + case `localSize` => group + case -1 => new AttributeGroup(localInputCol, localSize) + case _ => + val msg = s"Trying to set size of vectors in `$localInputCol` to $localSize but size " + + s"already set to ${group.size}." 
+ throw new IllegalArgumentException(msg) + } + } + + @Since("2.3.0") + override def transformSchema(schema: StructType): StructType = { + val fieldIndex = schema.fieldIndex(getInputCol) + val fields = schema.fields.clone() + val inputField = fields(fieldIndex) + val group = AttributeGroup.fromStructField(inputField) + val newGroup = validateSchemaAndSize(schema, group) + fields(fieldIndex) = inputField.copy(metadata = newGroup.toMetadata()) + StructType(fields) + } + + @Since("2.3.0") + override def copy(extra: ParamMap): this.type = defaultCopy(extra) +} + +/** :: Experimental :: */ +@Experimental +@Since("2.3.0") +object VectorSizeHint extends DefaultParamsReadable[VectorSizeHint] { + + private[feature] val OPTIMISTIC_INVALID = "optimistic" + private[feature] val ERROR_INVALID = "error" + private[feature] val SKIP_INVALID = "skip" + private[feature] val supportedHandleInvalids: Array[String] = + Array(OPTIMISTIC_INVALID, ERROR_INVALID, SKIP_INVALID) + + @Since("2.3.0") + override def load(path: String): VectorSizeHint = super.load(path) +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSizeHintSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSizeHintSuite.scala new file mode 100644 index 0000000000000..f6c9a76599fae --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/VectorSizeHintSuite.scala @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.ml.feature + +import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.attribute.AttributeGroup +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.streaming.StreamTest + +class VectorSizeHintSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + import testImplicits._ + + test("Test Param Validators") { + intercept[IllegalArgumentException] (new VectorSizeHint().setHandleInvalid("invalidValue")) + intercept[IllegalArgumentException] (new VectorSizeHint().setSize(-3)) + } + + test("Required params must be set before transform.") { + val data = Seq((Vectors.dense(1, 2), 0)).toDF("vector", "intValue") + + val noSizeTransformer = new VectorSizeHint().setInputCol("vector") + intercept[NoSuchElementException] (noSizeTransformer.transform(data)) + intercept[NoSuchElementException] (noSizeTransformer.transformSchema(data.schema)) + + val noInputColTransformer = new VectorSizeHint().setSize(2) + intercept[NoSuchElementException] (noInputColTransformer.transform(data)) + intercept[NoSuchElementException] (noInputColTransformer.transformSchema(data.schema)) + } + + test("Adding size to column of vectors.") { + + val size = 3 + val vectorColName = "vector" + val denseVector = Vectors.dense(1, 2, 3) + val sparseVector = Vectors.sparse(size, Array(), Array()) + + val data = Seq(denseVector, denseVector, sparseVector).map(Tuple1.apply) + val dataFrame = data.toDF(vectorColName) + assert( + AttributeGroup.fromStructField(dataFrame.schema(vectorColName)).size == -1, + s"This test requires that column '$vectorColName' not have size metadata.") + + for (handleInvalid <- VectorSizeHint.supportedHandleInvalids) { + val transformer = new VectorSizeHint() + .setInputCol(vectorColName) + .setSize(size) + .setHandleInvalid(handleInvalid) + val withSize = transformer.transform(dataFrame) + assert( + AttributeGroup.fromStructField(withSize.schema(vectorColName)).size == size, + "Transformer did not add expected size data.") + val numRows = withSize.collect().length + assert(numRows === data.length, s"Expecting ${data.length} rows, got $numRows.") + } + } + + test("Size hint preserves attributes.") { + + val size = 3 + val vectorColName = "vector" + val data = Seq((1, 2, 3), (2, 3, 3)) + val dataFrame = data.toDF("x", "y", "z") + + val assembler = new VectorAssembler() + .setInputCols(Array("x", "y", "z")) + .setOutputCol(vectorColName) + val dataFrameWithMetadata = assembler.transform(dataFrame) + val group = AttributeGroup.fromStructField(dataFrameWithMetadata.schema(vectorColName)) + + for (handleInvalid <- VectorSizeHint.supportedHandleInvalids) { + val transformer = new VectorSizeHint() + .setInputCol(vectorColName) + .setSize(size) + .setHandleInvalid(handleInvalid) + val withSize = transformer.transform(dataFrameWithMetadata) + + val newGroup = AttributeGroup.fromStructField(withSize.schema(vectorColName)) + assert(newGroup.size === size, "Column has incorrect size metadata.") + assert( + newGroup.attributes.get === group.attributes.get, + "VectorSizeHint did not preserve attributes.") + withSize.collect + } + } + + test("Size mismatch between current and target size raises an error.") { + val size = 4 + val vectorColName = "vector" + val data = Seq((1, 2, 3), (2, 3, 3)) + val 
dataFrame = data.toDF("x", "y", "z") + + val assembler = new VectorAssembler() + .setInputCols(Array("x", "y", "z")) + .setOutputCol(vectorColName) + val dataFrameWithMetadata = assembler.transform(dataFrame) + + for (handleInvalid <- VectorSizeHint.supportedHandleInvalids) { + val transformer = new VectorSizeHint() + .setInputCol(vectorColName) + .setSize(size) + .setHandleInvalid(handleInvalid) + intercept[IllegalArgumentException](transformer.transform(dataFrameWithMetadata)) + } + } + + test("Handle invalid does the right thing.") { + + val vector = Vectors.dense(1, 2, 3) + val short = Vectors.dense(2) + val dataWithNull = Seq(vector, null).map(Tuple1.apply).toDF("vector") + val dataWithShort = Seq(vector, short).map(Tuple1.apply).toDF("vector") + + val sizeHint = new VectorSizeHint() + .setInputCol("vector") + .setHandleInvalid("error") + .setSize(3) + + intercept[SparkException](sizeHint.transform(dataWithNull).collect()) + intercept[SparkException](sizeHint.transform(dataWithShort).collect()) + + sizeHint.setHandleInvalid("skip") + assert(sizeHint.transform(dataWithNull).count() === 1) + assert(sizeHint.transform(dataWithShort).count() === 1) + + sizeHint.setHandleInvalid("optimistic") + assert(sizeHint.transform(dataWithNull).count() === 2) + assert(sizeHint.transform(dataWithShort).count() === 2) + } + + test("read/write") { + val sizeHint = new VectorSizeHint() + .setInputCol("myInputCol") + .setSize(11) + .setHandleInvalid("skip") + testDefaultReadWrite(sizeHint) + } +} + +class VectorSizeHintStreamingSuite extends StreamTest { + + import testImplicits._ + + test("Test assemble vectors with size hint in streaming.") { + val a = Vectors.dense(0, 1, 2) + val b = Vectors.sparse(4, Array(0, 3), Array(3, 6)) + + val stream = MemoryStream[(Vector, Vector)] + val streamingDF = stream.toDS.toDF("a", "b") + val sizeHintA = new VectorSizeHint() + .setSize(3) + .setInputCol("a") + val sizeHintB = new VectorSizeHint() + .setSize(4) + .setInputCol("b") + val vectorAssembler = new VectorAssembler() + .setInputCols(Array("a", "b")) + .setOutputCol("assembled") + val pipeline = new Pipeline().setStages(Array(sizeHintA, sizeHintB, vectorAssembler)) + val output = pipeline.fit(streamingDF).transform(streamingDF).select("assembled") + + val expected = Vectors.dense(0, 1, 2, 3, 0, 0, 6) + + testStream (output) ( + AddData(stream, (a, b), (a, b)), + CheckAnswer(Tuple1(expected), Tuple1(expected)) + ) + } +} From 8941a4abcada873c26af924e129173dc33d66d71 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Fri, 22 Dec 2017 23:05:03 -0800 Subject: [PATCH 180/356] [SPARK-22789] Map-only continuous processing execution ## What changes were proposed in this pull request? Basic continuous execution, supporting map/flatMap/filter, with commits and advancement through RPC. ## How was this patch tested? new unit-ish tests (exercising execution end to end) Author: Jose Torres Closes #19984 from jose-torres/continuous-impl. 
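As a usage illustration (a sketch only, not taken from this patch), a map-only continuous query could be started with the new trigger roughly as below; the `spark` session is assumed, and the choice of the rate source and the V2 memory sink is an assumption made for this example:

```scala
import java.util.concurrent.TimeUnit

import org.apache.spark.sql.streaming.Trigger

// Assumed SparkSession `spark`; the rate source is wired for continuous reads in this patch.
import spark.implicits._

val query = spark.readStream
  .format("rate")
  .option("rowsPerSecond", "5")
  .load()
  .select($"value")                    // only map-like operations (project/filter/map/flatMap) pass
  .filter($"value" % 2 === 0)          // checked by the new checkForContinuous rule
  .writeStream
  .format("memory")                    // assumed sink choice for this sketch
  .queryName("continuous_sketch")
  .trigger(Trigger.Continuous(1, TimeUnit.SECONDS))  // asynchronous checkpoints every second
  .start()
```

Aggregations and other stateful operators are rejected for continuous processing by the new UnsupportedOperationChecker.checkForContinuous check, and commits advance per epoch through the EpochCoordinator RPC endpoint added here.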
--- project/MimaExcludes.scala | 5 + .../UnsupportedOperationChecker.scala | 25 +- .../apache/spark/sql/internal/SQLConf.scala | 21 ++ .../sources/v2/reader/ContinuousReader.java | 6 + .../sources/v2/reader/MicroBatchReader.java | 6 + .../apache/spark/sql/streaming/Trigger.java | 54 +++ .../spark/sql/execution/SparkStrategies.scala | 7 + .../datasources/v2/DataSourceV2ScanExec.scala | 20 +- .../datasources/v2/WriteToDataSourceV2.scala | 60 ++- .../streaming/BaseStreamingSource.java | 8 - .../execution/streaming/HDFSMetadataLog.scala | 14 + .../streaming/MicroBatchExecution.scala | 44 ++- .../sql/execution/streaming/OffsetSeq.scala | 2 +- .../streaming/ProgressReporter.scala | 10 +- .../streaming/RateSourceProvider.scala | 9 +- .../streaming/RateStreamOffset.scala | 5 +- .../spark/sql/execution/streaming/Sink.scala | 2 +- .../sql/execution/streaming/Source.scala | 2 +- .../execution/streaming/StreamExecution.scala | 20 +- .../execution/streaming/StreamProgress.scala | 19 +- .../streaming/StreamingRelation.scala | 47 +++ .../ContinuousDataSourceRDDIter.scala | 217 +++++++++++ .../continuous/ContinuousExecution.scala | 349 ++++++++++++++++++ .../ContinuousRateStreamSource.scala | 11 +- .../continuous/ContinuousTrigger.scala | 70 ++++ .../continuous/EpochCoordinator.scala | 191 ++++++++++ .../sources/RateStreamSourceV2.scala | 19 +- .../streaming/sources/memoryV2.scala | 13 + .../sql/streaming/DataStreamReader.scala | 38 +- .../sql/streaming/DataStreamWriter.scala | 19 +- .../sql/streaming/StreamingQueryManager.scala | 45 ++- .../org/apache/spark/sql/QueryTest.scala | 56 ++- .../streaming/RateSourceV2Suite.scala | 30 +- .../spark/sql/streaming/StreamSuite.scala | 17 +- .../spark/sql/streaming/StreamTest.scala | 55 ++- .../continuous/ContinuousSuite.scala | 316 ++++++++++++++++ 36 files changed, 1682 insertions(+), 150 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTrigger.scala create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 9902fedb65d59..81584af6813ea 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -36,6 +36,11 @@ object MimaExcludes { // Exclude rules for 2.3.x lazy val v23excludes = v22excludes ++ Seq( + // SPARK-22789: Map-only continuous processing execution + ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryManager.startQuery$default$8"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryManager.startQuery$default$6"), + ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryManager.startQuery$default$9"), + // SPARK-22372: Make cluster submission use SparkApplication. 
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkHadoopUtil.getSecretKeyFromUserCredentials"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkHadoopUtil.isYarnMode"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala index 04502d04d9509..b55043c270644 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/UnsupportedOperationChecker.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet, MonotonicallyIncreasingID} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet, CurrentDate, CurrentTimestamp, MonotonicallyIncreasingID} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys import org.apache.spark.sql.catalyst.plans._ @@ -339,6 +339,29 @@ object UnsupportedOperationChecker { } } + def checkForContinuous(plan: LogicalPlan, outputMode: OutputMode): Unit = { + checkForStreaming(plan, outputMode) + + plan.foreachUp { implicit subPlan => + subPlan match { + case (_: Project | _: Filter | _: MapElements | _: MapPartitions | + _: DeserializeToObject | _: SerializeFromObject) => + case node if node.nodeName == "StreamingRelationV2" => + case node => + throwError(s"Continuous processing does not support ${node.nodeName} operations.") + } + + subPlan.expressions.foreach { e => + if (e.collectLeaves().exists { + case (_: CurrentTimestamp | _: CurrentDate) => true + case _ => false + }) { + throwError(s"Continuous processing does not support current time operations.") + } + } + } + } + private def throwErrorIf( condition: Boolean, msg: String)(implicit operator: LogicalPlan): Unit = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index bdc8d92e84079..84fe4bb711a4e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1044,6 +1044,22 @@ object SQLConf { "When this conf is not set, the value from `spark.redaction.string.regex` is used.") .fallbackConf(org.apache.spark.internal.config.STRING_REDACTION_PATTERN) + val CONTINUOUS_STREAMING_EXECUTOR_QUEUE_SIZE = + buildConf("spark.sql.streaming.continuous.executorQueueSize") + .internal() + .doc("The size (measured in number of rows) of the queue used in continuous execution to" + + " buffer the results of a ContinuousDataReader.") + .intConf + .createWithDefault(1024) + + val CONTINUOUS_STREAMING_EXECUTOR_POLL_INTERVAL_MS = + buildConf("spark.sql.streaming.continuous.executorPollIntervalMs") + .internal() + .doc("The interval at which continuous execution readers will poll to check whether" + + " the epoch has advanced on the driver.") + .timeConf(TimeUnit.MILLISECONDS) + .createWithDefault(100) + object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } @@ -1357,6 +1373,11 @@ class SQLConf extends Serializable with Logging { def replaceExceptWithFilter: Boolean = 
getConf(REPLACE_EXCEPT_WITH_FILTER) + def continuousStreamingExecutorQueueSize: Int = getConf(CONTINUOUS_STREAMING_EXECUTOR_QUEUE_SIZE) + + def continuousStreamingExecutorPollIntervalMs: Long = + getConf(CONTINUOUS_STREAMING_EXECUTOR_POLL_INTERVAL_MS) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java index 1baf82c2df762..34141d6cd85fd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java @@ -65,4 +65,10 @@ public interface ContinuousReader extends BaseStreamingSource, DataSourceV2Reade default boolean needsReconfiguration() { return false; } + + /** + * Informs the source that Spark has completed processing all data for offsets less than or + * equal to `end` and will only request offsets greater than `end` in the future. + */ + void commit(Offset end); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java index 438e3f55b7bcf..bd15c07d87f6c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java @@ -61,4 +61,10 @@ public interface MicroBatchReader extends DataSourceV2Reader, BaseStreamingSourc * @throws IllegalArgumentException if the JSON does not encode a valid offset for this reader */ Offset deserializeOffset(String json); + + /** + * Informs the source that Spark has completed processing all data for offsets less than or + * equal to `end` and will only request offsets greater than `end` in the future. + */ + void commit(Offset end); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/streaming/Trigger.java b/sql/core/src/main/java/org/apache/spark/sql/streaming/Trigger.java index d31790a285687..33ae9a9e87668 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/streaming/Trigger.java +++ b/sql/core/src/main/java/org/apache/spark/sql/streaming/Trigger.java @@ -22,6 +22,7 @@ import scala.concurrent.duration.Duration; import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger; import org.apache.spark.sql.execution.streaming.OneTimeTrigger$; /** @@ -95,4 +96,57 @@ public static Trigger ProcessingTime(String interval) { public static Trigger Once() { return OneTimeTrigger$.MODULE$; } + + /** + * A trigger that continuously processes streaming data, asynchronously checkpointing at + * the specified interval. + * + * @since 2.3.0 + */ + public static Trigger Continuous(long intervalMs) { + return ContinuousTrigger.apply(intervalMs); + } + + /** + * A trigger that continuously processes streaming data, asynchronously checkpointing at + * the specified interval. 
+ * + * {{{ + * import java.util.concurrent.TimeUnit + * df.writeStream.trigger(Trigger.Continuous(10, TimeUnit.SECONDS)) + * }}} + * + * @since 2.3.0 + */ + public static Trigger Continuous(long interval, TimeUnit timeUnit) { + return ContinuousTrigger.create(interval, timeUnit); + } + + /** + * (Scala-friendly) + * A trigger that continuously processes streaming data, asynchronously checkpointing at + * the specified interval. + * + * {{{ + * import scala.concurrent.duration._ + * df.writeStream.trigger(Trigger.Continuous(10.seconds)) + * }}} + * @since 2.3.0 + */ + public static Trigger Continuous(Duration interval) { + return ContinuousTrigger.apply(interval); + } + + /** + * A trigger that continuously processes streaming data, asynchronously checkpointing at + * the specified interval. + * + * {{{ + * df.writeStream.trigger(Trigger.Continuous("10 seconds")) + * }}} + * @since 2.3.0 + */ + public static Trigger Continuous(String interval) { + return ContinuousTrigger.apply(interval); + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 9e713cd7bbe2b..8c6c324d456c7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -31,8 +31,10 @@ import org.apache.spark.sql.execution.command._ import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.sources.MemoryPlanV2 import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.streaming.StreamingQuery +import org.apache.spark.sql.types.StructType /** * Converts a logical plan into zero or more SparkPlans. 
This API is exposed for experimenting @@ -374,6 +376,8 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { StreamingRelationExec(s.sourceName, s.output) :: Nil case s: StreamingExecutionRelation => StreamingRelationExec(s.toString, s.output) :: Nil + case s: StreamingRelationV2 => + StreamingRelationExec(s.sourceName, s.output) :: Nil case _ => Nil } } @@ -404,6 +408,9 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] { case MemoryPlan(sink, output) => val encoder = RowEncoder(sink.schema) LocalTableScanExec(output, sink.allData.map(r => encoder.toRow(r).copy())) :: Nil + case MemoryPlanV2(sink, output) => + val encoder = RowEncoder(StructType.fromAttributes(output)) + LocalTableScanExec(output, sink.allData.map(r => encoder.toRow(r).copy())) :: Nil case logical.Distinct(child) => throw new IllegalStateException( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index 3f243dc44e043..e4fca1b10dfad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -26,6 +26,8 @@ import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousDataSourceRDD, ContinuousExecution, EpochCoordinatorRef, SetReaderPartitions} import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.types.StructType @@ -52,10 +54,20 @@ case class DataSourceV2ScanExec( }.asJava } - val inputRDD = new DataSourceRDD(sparkContext, readTasks) - .asInstanceOf[RDD[InternalRow]] + val inputRDD = reader match { + case _: ContinuousReader => + EpochCoordinatorRef.get( + sparkContext.getLocalProperty(ContinuousExecution.RUN_ID_KEY), sparkContext.env) + .askSync[Unit](SetReaderPartitions(readTasks.size())) + + new ContinuousDataSourceRDD(sparkContext, sqlContext, readTasks) + + case _ => + new DataSourceRDD(sparkContext, readTasks) + } + val numOutputRows = longMetric("numOutputRows") - inputRDD.map { r => + inputRDD.asInstanceOf[RDD[InternalRow]].map { r => numOutputRows += 1 r } @@ -73,7 +85,7 @@ class RowToUnsafeRowReadTask(rowReadTask: ReadTask[Row], schema: StructType) } } -class RowToUnsafeDataReader(rowReader: DataReader[Row], encoder: ExpressionEncoder[Row]) +class RowToUnsafeDataReader(val rowReader: DataReader[Row], encoder: ExpressionEncoder[Row]) extends DataReader[UnsafeRow] { override def next: Boolean = rowReader.next diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index b72d15ed15aed..1862da8892cb2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution.datasources.v2 -import org.apache.spark.{SparkException, TaskContext} +import org.apache.spark.{SparkEnv, SparkException, TaskContext} import 
org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row @@ -26,6 +26,8 @@ import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.execution.streaming.continuous.{CommitPartitionEpoch, ContinuousExecution, EpochCoordinatorRef, SetWriterPartitions} import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -58,10 +60,22 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) s"The input RDD has ${messages.length} partitions.") try { + val runTask = writer match { + case w: ContinuousWriter => + EpochCoordinatorRef.get( + sparkContext.getLocalProperty(ContinuousExecution.RUN_ID_KEY), sparkContext.env) + .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions)) + + (context: TaskContext, iter: Iterator[InternalRow]) => + DataWritingSparkTask.runContinuous(writeTask, context, iter) + case _ => + (context: TaskContext, iter: Iterator[InternalRow]) => + DataWritingSparkTask.run(writeTask, context, iter) + } + sparkContext.runJob( rdd, - (context: TaskContext, iter: Iterator[InternalRow]) => - DataWritingSparkTask.run(writeTask, context, iter), + runTask, rdd.partitions.indices, (index, message: WriterCommitMessage) => messages(index) = message ) @@ -70,6 +84,8 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) writer.commit(messages) logInfo(s"Data source writer $writer committed.") } catch { + case _: InterruptedException if writer.isInstanceOf[ContinuousWriter] => + // Interruption is how continuous queries are ended, so accept and ignore the exception. case cause: Throwable => logError(s"Data source writer $writer is aborting.") try { @@ -109,6 +125,44 @@ object DataWritingSparkTask extends Logging { logError(s"Writer for partition ${context.partitionId()} aborted.") }) } + + def runContinuous( + writeTask: DataWriterFactory[InternalRow], + context: TaskContext, + iter: Iterator[InternalRow]): WriterCommitMessage = { + val dataWriter = writeTask.createDataWriter(context.partitionId(), context.attemptNumber()) + val epochCoordinator = EpochCoordinatorRef.get( + context.getLocalProperty(ContinuousExecution.RUN_ID_KEY), + SparkEnv.get) + val currentMsg: WriterCommitMessage = null + var currentEpoch = context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong + + do { + // write the data and commit this writer. + Utils.tryWithSafeFinallyAndFailureCallbacks(block = { + try { + iter.foreach(dataWriter.write) + logInfo(s"Writer for partition ${context.partitionId()} is committing.") + val msg = dataWriter.commit() + logInfo(s"Writer for partition ${context.partitionId()} committed.") + epochCoordinator.send( + CommitPartitionEpoch(context.partitionId(), currentEpoch, msg) + ) + currentEpoch += 1 + } catch { + case _: InterruptedException => + // Continuous shutdown always involves an interrupt. Just finish the task. 
+ } + })(catchBlock = { + // If there is an error, abort this writer + logError(s"Writer for partition ${context.partitionId()} is aborting.") + dataWriter.abort() + logError(s"Writer for partition ${context.partitionId()} aborted.") + }) + } while (!context.isInterrupted()) + + currentMsg + } } class InternalRowDataWriterFactory( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSource.java b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSource.java index 3a02cbfe7afe3..c44b8af2552f0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSource.java +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/BaseStreamingSource.java @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.streaming; -import org.apache.spark.sql.sources.v2.reader.Offset; - /** * The shared interface between V1 streaming sources and V2 streaming readers. * @@ -26,12 +24,6 @@ * directly, and will be removed in future versions. */ public interface BaseStreamingSource { - /** - * Informs the source that Spark has completed processing all data for offsets less than or - * equal to `end` and will only request offsets greater than `end` in the future. - */ - void commit(Offset end); - /** Stop this source and free any resources it has allocated. */ void stop(); } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala index 43cf0ef1da8ca..6e8154d58d4c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/HDFSMetadataLog.scala @@ -266,6 +266,20 @@ class HDFSMetadataLog[T <: AnyRef : ClassTag](sparkSession: SparkSession, path: } } + /** + * Removes all log entries later than thresholdBatchId (exclusive). 
+ */ + def purgeAfter(thresholdBatchId: Long): Unit = { + val batchIds = fileManager.list(metadataPath, batchFilesFilter) + .map(f => pathToBatchId(f.getPath)) + + for (batchId <- batchIds if batchId > thresholdBatchId) { + val path = batchIdToPath(batchId) + fileManager.delete(path) + logTrace(s"Removed metadata log file: $path") + } + } + private def createFileManager(): FileManager = { val hadoopConf = sparkSession.sessionState.newHadoopConf() try { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 4a3de8bae4bc9..20f9810faa5c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.sources.v2.MicroBatchReadSupport import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.util.{Clock, Utils} @@ -41,6 +42,8 @@ class MicroBatchExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, trigger, triggerClock, outputMode, deleteCheckpointOnStop) { + @volatile protected var sources: Seq[BaseStreamingSource] = Seq.empty + private val triggerExecutor = trigger match { case t: ProcessingTime => ProcessingTimeExecutor(t, triggerClock) case OneTimeTrigger => OneTimeExecutor() @@ -53,6 +56,7 @@ class MicroBatchExecution( s"but the current thread was ${Thread.currentThread}") var nextSourceId = 0L val toExecutionRelationMap = MutableMap[StreamingRelation, StreamingExecutionRelation]() + val v2ToExecutionRelationMap = MutableMap[StreamingRelationV2, StreamingExecutionRelation]() val _logicalPlan = analyzedPlan.transform { case streamingRelation@StreamingRelation(dataSource, _, output) => toExecutionRelationMap.getOrElseUpdate(streamingRelation, { @@ -64,6 +68,17 @@ class MicroBatchExecution( // "df.logicalPlan" has already used attributes of the previous `output`. StreamingExecutionRelation(source, output)(sparkSession) }) + case s @ StreamingRelationV2(v2DataSource, _, _, output, v1DataSource) + if !v2DataSource.isInstanceOf[MicroBatchReadSupport] => + v2ToExecutionRelationMap.getOrElseUpdate(s, { + // Materialize source to avoid creating it in every batch + val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" + val source = v1DataSource.createSource(metadataPath) + nextSourceId += 1 + // We still need to use the previous `output` instead of `source.schema` as attributes in + // "df.logicalPlan" has already used attributes of the previous `output`. + StreamingExecutionRelation(source, output)(sparkSession) + }) } sources = _logicalPlan.collect { case s: StreamingExecutionRelation => s.source } uniqueSources = sources.distinct @@ -170,12 +185,14 @@ class MicroBatchExecution( * Make a call to getBatch using the offsets from previous batch. * because certain sources (e.g., KafkaSource) assume on restart the last * batch will be executed before getOffset is called again. 
*/ - availableOffsets.foreach { ao: (Source, Offset) => - val (source, end) = ao - if (committedOffsets.get(source).map(_ != end).getOrElse(true)) { - val start = committedOffsets.get(source) - source.getBatch(start, end) - } + availableOffsets.foreach { + case (source: Source, end: Offset) => + if (committedOffsets.get(source).map(_ != end).getOrElse(true)) { + val start = committedOffsets.get(source) + source.getBatch(start, end) + } + case nonV1Tuple => + throw new IllegalStateException(s"Unexpected V2 source in $nonV1Tuple") } currentBatchId = latestCommittedBatchId + 1 committedOffsets ++= availableOffsets @@ -219,11 +236,12 @@ class MicroBatchExecution( val hasNewData = { awaitProgressLock.lock() try { - val latestOffsets: Map[Source, Option[Offset]] = uniqueSources.map { s => - updateStatusMessage(s"Getting offsets from $s") - reportTimeTaken("getOffset") { - (s, s.getOffset) - } + val latestOffsets: Map[Source, Option[Offset]] = uniqueSources.map { + case s: Source => + updateStatusMessage(s"Getting offsets from $s") + reportTimeTaken("getOffset") { + (s, s.getOffset) + } }.toMap availableOffsets ++= latestOffsets.filter { case (s, o) => o.nonEmpty }.mapValues(_.get) @@ -298,7 +316,7 @@ class MicroBatchExecution( val prevBatchOff = offsetLog.get(currentBatchId - 1) if (prevBatchOff.isDefined) { prevBatchOff.get.toStreamProgress(sources).foreach { - case (src, off) => src.commit(off) + case (src: Source, off) => src.commit(off) } } else { throw new IllegalStateException(s"batch $currentBatchId doesn't exist") @@ -331,7 +349,7 @@ class MicroBatchExecution( // Request unprocessed data from all sources. newData = reportTimeTaken("getBatch") { availableOffsets.flatMap { - case (source, available) + case (source: Source, available) if committedOffsets.get(source).map(_ != available).getOrElse(true) => val current = committedOffsets.get(source) val batch = source.getBatch(current, available) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala index 4e0a468b962a2..a1b63a6de3823 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/OffsetSeq.scala @@ -38,7 +38,7 @@ case class OffsetSeq(offsets: Seq[Option[Offset]], metadata: Option[OffsetSeqMet * This method is typically used to associate a serialized offset with actual sources (which * cannot be serialized). 
*/ - def toStreamProgress(sources: Seq[Source]): StreamProgress = { + def toStreamProgress(sources: Seq[BaseStreamingSource]): StreamProgress = { assert(sources.size == offsets.size) new StreamProgress ++ sources.zip(offsets).collect { case (s, Some(o)) => (s, o) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala index b1c3a8ab235ab..1c9043613cb69 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala @@ -42,7 +42,7 @@ import org.apache.spark.util.Clock trait ProgressReporter extends Logging { case class ExecutionStats( - inputRows: Map[Source, Long], + inputRows: Map[BaseStreamingSource, Long], stateOperators: Seq[StateOperatorProgress], eventTimeStats: Map[String, String]) @@ -53,11 +53,11 @@ trait ProgressReporter extends Logging { protected def triggerClock: Clock protected def logicalPlan: LogicalPlan protected def lastExecution: QueryExecution - protected def newData: Map[Source, DataFrame] + protected def newData: Map[BaseStreamingSource, DataFrame] protected def availableOffsets: StreamProgress protected def committedOffsets: StreamProgress - protected def sources: Seq[Source] - protected def sink: Sink + protected def sources: Seq[BaseStreamingSource] + protected def sink: BaseStreamingSink protected def offsetSeqMetadata: OffsetSeqMetadata protected def currentBatchId: Long protected def sparkSession: SparkSession @@ -230,7 +230,7 @@ trait ProgressReporter extends Logging { } val allLogicalPlanLeaves = lastExecution.logical.collectLeaves() // includes non-streaming val allExecPlanLeaves = lastExecution.executedPlan.collectLeaves() - val numInputRows: Map[Source, Long] = + val numInputRows: Map[BaseStreamingSource, Long] = if (allLogicalPlanLeaves.size == allExecPlanLeaves.size) { val execLeafToSource = allLogicalPlanLeaves.zip(allExecPlanLeaves).flatMap { case (lp, ep) => logicalPlanLeafToSource.get(lp).map { source => ep -> source } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala index 41761324cf6ac..3f85fa913f28c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala @@ -52,7 +52,7 @@ import org.apache.spark.util.{ManualClock, SystemClock} * be resource constrained, and `numPartitions` can be tweaked to help reach the desired speed. 
*/ class RateSourceProvider extends StreamSourceProvider with DataSourceRegister - with DataSourceV2 with MicroBatchReadSupport with ContinuousReadSupport{ + with DataSourceV2 with ContinuousReadSupport { override def sourceSchema( sqlContext: SQLContext, @@ -107,13 +107,6 @@ class RateSourceProvider extends StreamSourceProvider with DataSourceRegister ) } - override def createMicroBatchReader( - schema: Optional[StructType], - checkpointLocation: String, - options: DataSourceV2Options): MicroBatchReader = { - new RateStreamV2Reader(options) - } - override def createContinuousReader( schema: Optional[StructType], checkpointLocation: String, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala index 726d8574af52b..65d6d18936167 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala @@ -22,8 +22,11 @@ import org.json4s.jackson.Serialization import org.apache.spark.sql.sources.v2 -case class RateStreamOffset(partitionToValueAndRunTimeMs: Map[Int, (Long, Long)]) +case class RateStreamOffset(partitionToValueAndRunTimeMs: Map[Int, ValueRunTimeMsPair]) extends v2.reader.Offset { implicit val defaultFormats: DefaultFormats = DefaultFormats override val json = Serialization.write(partitionToValueAndRunTimeMs) } + + +case class ValueRunTimeMsPair(value: Long, runTimeMs: Long) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Sink.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Sink.scala index d10cd3044ecdf..34bc085d920c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Sink.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Sink.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.DataFrame * exactly once semantics a sink must be idempotent in the face of multiple attempts to add the same * batch. */ -trait Sink { +trait Sink extends BaseStreamingSink { /** * Adds a batch of data to this sink. The data for a given `batchId` is deterministic and if diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala index 311942f6dbd84..dbbd59e06909c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Source.scala @@ -25,7 +25,7 @@ import org.apache.spark.sql.types.StructType * monotonically increasing notion of progress that can be represented as an [[Offset]]. Spark * will regularly query each [[Source]] to see if any more data is available. 
*/ -trait Source { +trait Source extends BaseStreamingSource { /** Returns the schema of the data from this source */ def schema: StructType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 129995dcf3607..3e76bf7b7ca8f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -44,6 +44,7 @@ trait State case object INITIALIZING extends State case object ACTIVE extends State case object TERMINATED extends State +case object RECONFIGURING extends State /** * Manages the execution of a streaming Spark SQL query that is occurring in a separate thread. @@ -59,7 +60,7 @@ abstract class StreamExecution( override val name: String, private val checkpointRoot: String, analyzedPlan: LogicalPlan, - val sink: Sink, + val sink: BaseStreamingSink, val trigger: Trigger, val triggerClock: Clock, val outputMode: OutputMode, @@ -147,30 +148,25 @@ abstract class StreamExecution( * Pretty identified string of printing in logs. Format is * If name is set "queryName [id = xyz, runId = abc]" else "[id = xyz, runId = abc]" */ - private val prettyIdString = + protected val prettyIdString = Option(name).map(_ + " ").getOrElse("") + s"[id = $id, runId = $runId]" - /** - * All stream sources present in the query plan. This will be set when generating logical plan. - */ - @volatile protected var sources: Seq[Source] = Seq.empty - /** * A list of unique sources in the query plan. This will be set when generating logical plan. */ - @volatile protected var uniqueSources: Seq[Source] = Seq.empty + @volatile protected var uniqueSources: Seq[BaseStreamingSource] = Seq.empty /** Defines the internal state of execution */ - private val state = new AtomicReference[State](INITIALIZING) + protected val state = new AtomicReference[State](INITIALIZING) @volatile var lastExecution: IncrementalExecution = _ /** Holds the most recent input data for each source. */ - protected var newData: Map[Source, DataFrame] = _ + protected var newData: Map[BaseStreamingSource, DataFrame] = _ @volatile - private var streamDeathCause: StreamingQueryException = null + protected var streamDeathCause: StreamingQueryException = null /* Get the call site in the caller thread; will pass this into the micro batch thread */ private val callSite = Utils.getCallSite() @@ -389,7 +385,7 @@ abstract class StreamExecution( } /** Stops all streaming sources safely. */ - private def stopSources(): Unit = { + protected def stopSources(): Unit = { uniqueSources.foreach { source => try { source.stop() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala index a3f3662e6f4c9..8531070b1bc49 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamProgress.scala @@ -23,25 +23,28 @@ import scala.collection.{immutable, GenTraversableOnce} * A helper class that looks like a Map[Source, Offset]. 
 */
 class StreamProgress(
-    val baseMap: immutable.Map[Source, Offset] = new immutable.HashMap[Source, Offset])
-  extends scala.collection.immutable.Map[Source, Offset] {
+    val baseMap: immutable.Map[BaseStreamingSource, Offset] =
+      new immutable.HashMap[BaseStreamingSource, Offset])
+  extends scala.collection.immutable.Map[BaseStreamingSource, Offset] {
 
-  def toOffsetSeq(source: Seq[Source], metadata: OffsetSeqMetadata): OffsetSeq = {
+  def toOffsetSeq(source: Seq[BaseStreamingSource], metadata: OffsetSeqMetadata): OffsetSeq = {
     OffsetSeq(source.map(get), Some(metadata))
   }
 
   override def toString: String =
     baseMap.map { case (k, v) => s"$k: $v"}.mkString("{", ",", "}")
 
-  override def +[B1 >: Offset](kv: (Source, B1)): Map[Source, B1] = baseMap + kv
+  override def +[B1 >: Offset](kv: (BaseStreamingSource, B1)): Map[BaseStreamingSource, B1] = {
+    baseMap + kv
+  }
 
-  override def get(key: Source): Option[Offset] = baseMap.get(key)
+  override def get(key: BaseStreamingSource): Option[Offset] = baseMap.get(key)
 
-  override def iterator: Iterator[(Source, Offset)] = baseMap.iterator
+  override def iterator: Iterator[(BaseStreamingSource, Offset)] = baseMap.iterator
 
-  override def -(key: Source): Map[Source, Offset] = baseMap - key
+  override def -(key: BaseStreamingSource): Map[BaseStreamingSource, Offset] = baseMap - key
 
-  def ++(updates: GenTraversableOnce[(Source, Offset)]): StreamProgress = {
+  def ++(updates: GenTraversableOnce[(BaseStreamingSource, Offset)]): StreamProgress = {
     new StreamProgress(baseMap ++ updates)
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala
index 6b82c78ea653d..0ca2e7854d94b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LeafNode
 import org.apache.spark.sql.catalyst.plans.logical.Statistics
 import org.apache.spark.sql.execution.LeafExecNode
 import org.apache.spark.sql.execution.datasources.DataSource
+import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2}
 
 object StreamingRelation {
   def apply(dataSource: DataSource): StreamingRelation = {
@@ -75,6 +76,52 @@ case class StreamingExecutionRelation(
   )
 }
 
+// We have to pack in the V1 data source as a shim, for the case when a source implements
+// continuous processing (which is always V2) but only has V1 microbatch support. We don't
+// know at read time whether the query is continuous or not, so we need to be able to
+// swap a V1 relation back in.
+/**
+ * Used to link a [[DataSourceV2]] into a streaming
+ * [[org.apache.spark.sql.catalyst.plans.logical.LogicalPlan]]. This is only used for creating
+ * a streaming [[org.apache.spark.sql.DataFrame]] from [[org.apache.spark.sql.DataFrameReader]],
+ * and should be converted before passing to [[StreamExecution]].
+ */ +case class StreamingRelationV2( + dataSource: DataSourceV2, + sourceName: String, + extraOptions: Map[String, String], + output: Seq[Attribute], + v1DataSource: DataSource)(session: SparkSession) + extends LeafNode { + override def isStreaming: Boolean = true + override def toString: String = sourceName + + override def computeStats(): Statistics = Statistics( + sizeInBytes = BigInt(session.sessionState.conf.defaultSizeInBytes) + ) +} + +/** + * Used to link a [[DataSourceV2]] into a continuous processing execution. + */ +case class ContinuousExecutionRelation( + source: ContinuousReadSupport, + extraOptions: Map[String, String], + output: Seq[Attribute])(session: SparkSession) + extends LeafNode { + + override def isStreaming: Boolean = true + override def toString: String = source.toString + + // There's no sensible value here. On the execution path, this relation will be + // swapped out with microbatches. But some dataframe operations (in particular explain) do lead + // to this node surviving analysis. So we satisfy the LeafNode contract with the session default + // value. + override def computeStats(): Statistics = Statistics( + sizeInBytes = BigInt(session.sessionState.conf.defaultSizeInBytes) + ) +} + /** * A dummy physical plan for [[StreamingRelation]] to support * [[org.apache.spark.sql.Dataset.explain]] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala new file mode 100644 index 0000000000000..89fb2ace20917 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.streaming.continuous + +import java.util.concurrent.{ArrayBlockingQueue, BlockingQueue, TimeUnit} +import java.util.concurrent.atomic.{AtomicBoolean, AtomicLong} + +import scala.collection.JavaConverters._ + +import org.apache.spark._ +import org.apache.spark.internal.Logging +import org.apache.spark.rdd.RDD +import org.apache.spark.rpc.RpcEndpointRef +import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.execution.datasources.v2.{DataSourceRDDPartition, RowToUnsafeDataReader} +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.continuous._ +import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.streaming.ProcessingTime +import org.apache.spark.util.{SystemClock, ThreadUtils} + +class ContinuousDataSourceRDD( + sc: SparkContext, + sqlContext: SQLContext, + @transient private val readTasks: java.util.List[ReadTask[UnsafeRow]]) + extends RDD[UnsafeRow](sc, Nil) { + + private val dataQueueSize = sqlContext.conf.continuousStreamingExecutorQueueSize + private val epochPollIntervalMs = sqlContext.conf.continuousStreamingExecutorPollIntervalMs + + override protected def getPartitions: Array[Partition] = { + readTasks.asScala.zipWithIndex.map { + case (readTask, index) => new DataSourceRDDPartition(index, readTask) + }.toArray + } + + override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = { + val reader = split.asInstanceOf[DataSourceRDDPartition].readTask.createDataReader() + + val runId = context.getLocalProperty(ContinuousExecution.RUN_ID_KEY) + + // This queue contains two types of messages: + // * (null, null) representing an epoch boundary. + // * (row, off) containing a data row and its corresponding PartitionOffset. 
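+    //  Epoch markers are enqueued by the EpochPollRunnable scheduled below; data rows are
+    //  enqueued by the DataReaderThread.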
+ val queue = new ArrayBlockingQueue[(UnsafeRow, PartitionOffset)](dataQueueSize) + + val epochPollFailed = new AtomicBoolean(false) + val epochPollExecutor = ThreadUtils.newDaemonSingleThreadScheduledExecutor( + s"epoch-poll--${runId}--${context.partitionId()}") + val epochPollRunnable = new EpochPollRunnable(queue, context, epochPollFailed) + epochPollExecutor.scheduleWithFixedDelay( + epochPollRunnable, 0, epochPollIntervalMs, TimeUnit.MILLISECONDS) + + // Important sequencing - we must get start offset before the data reader thread begins + val startOffset = ContinuousDataSourceRDD.getBaseReader(reader).getOffset + + val dataReaderFailed = new AtomicBoolean(false) + val dataReaderThread = new DataReaderThread(reader, queue, context, dataReaderFailed) + dataReaderThread.setDaemon(true) + dataReaderThread.start() + + context.addTaskCompletionListener(_ => { + reader.close() + dataReaderThread.interrupt() + epochPollExecutor.shutdown() + }) + + val epochEndpoint = EpochCoordinatorRef.get(runId, SparkEnv.get) + new Iterator[UnsafeRow] { + private val POLL_TIMEOUT_MS = 1000 + + private var currentEntry: (UnsafeRow, PartitionOffset) = _ + private var currentOffset: PartitionOffset = startOffset + private var currentEpoch = + context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong + + override def hasNext(): Boolean = { + while (currentEntry == null) { + if (context.isInterrupted() || context.isCompleted()) { + currentEntry = (null, null) + } + if (dataReaderFailed.get()) { + throw new SparkException("data read failed", dataReaderThread.failureReason) + } + if (epochPollFailed.get()) { + throw new SparkException("epoch poll failed", epochPollRunnable.failureReason) + } + currentEntry = queue.poll(POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS) + } + + currentEntry match { + // epoch boundary marker + case (null, null) => + epochEndpoint.send(ReportPartitionOffset( + context.partitionId(), + currentEpoch, + currentOffset)) + currentEpoch += 1 + currentEntry = null + false + // real row + case (_, offset) => + currentOffset = offset + true + } + } + + override def next(): UnsafeRow = { + if (currentEntry == null) throw new NoSuchElementException("No current row was set") + val r = currentEntry._1 + currentEntry = null + r + } + } + } + + override def getPreferredLocations(split: Partition): Seq[String] = { + split.asInstanceOf[DataSourceRDDPartition].readTask.preferredLocations() + } +} + +case class EpochPackedPartitionOffset(epoch: Long) extends PartitionOffset + +class EpochPollRunnable( + queue: BlockingQueue[(UnsafeRow, PartitionOffset)], + context: TaskContext, + failedFlag: AtomicBoolean) + extends Thread with Logging { + private[continuous] var failureReason: Throwable = _ + + private val epochEndpoint = EpochCoordinatorRef.get( + context.getLocalProperty(ContinuousExecution.RUN_ID_KEY), SparkEnv.get) + private var currentEpoch = context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong + + override def run(): Unit = { + try { + val newEpoch = epochEndpoint.askSync[Long](GetCurrentEpoch) + for (i <- currentEpoch to newEpoch - 1) { + queue.put((null, null)) + logDebug(s"Sent marker to start epoch ${i + 1}") + } + currentEpoch = newEpoch + } catch { + case t: Throwable => + failureReason = t + failedFlag.set(true) + throw t + } + } +} + +class DataReaderThread( + reader: DataReader[UnsafeRow], + queue: BlockingQueue[(UnsafeRow, PartitionOffset)], + context: TaskContext, + failedFlag: AtomicBoolean) + extends Thread( + s"continuous-reader--${context.partitionId()}--" + + 
s"${context.getLocalProperty(ContinuousExecution.RUN_ID_KEY)}") { + private[continuous] var failureReason: Throwable = _ + + override def run(): Unit = { + val baseReader = ContinuousDataSourceRDD.getBaseReader(reader) + try { + while (!context.isInterrupted && !context.isCompleted()) { + if (!reader.next()) { + // Check again, since reader.next() might have blocked through an incoming interrupt. + if (!context.isInterrupted && !context.isCompleted()) { + throw new IllegalStateException( + "Continuous reader reported no elements! Reader should have blocked waiting.") + } else { + return + } + } + + queue.put((reader.get().copy(), baseReader.getOffset)) + } + } catch { + case _: InterruptedException if context.isInterrupted() => + // Continuous shutdown always involves an interrupt; do nothing and shut down quietly. + + case t: Throwable => + failureReason = t + failedFlag.set(true) + // Don't rethrow the exception in this thread. It's not needed, and the default Spark + // exception handler will kill the executor. + } + } +} + +object ContinuousDataSourceRDD { + private[continuous] def getBaseReader(reader: DataReader[UnsafeRow]): ContinuousDataReader[_] = { + reader match { + case r: ContinuousDataReader[UnsafeRow] => r + case wrapped: RowToUnsafeDataReader => + wrapped.rowReader.asInstanceOf[ContinuousDataReader[Row]] + case _ => + throw new IllegalStateException(s"Unknown continuous reader type ${reader.getClass}") + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala new file mode 100644 index 0000000000000..1c35b06bd4b85 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -0,0 +1,349 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.streaming.continuous + +import java.util.concurrent.TimeUnit + +import scala.collection.JavaConverters._ +import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} + +import org.apache.spark.SparkEnv +import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2} +import org.apache.spark.sql.execution.streaming.{ContinuousExecutionRelation, StreamingRelationV2, _} +import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, ContinuousWriteSupport, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.reader.{ContinuousReader, Offset, PartitionOffset} +import org.apache.spark.sql.sources.v2.writer.ContinuousWriter +import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.{Clock, Utils} + +class ContinuousExecution( + sparkSession: SparkSession, + name: String, + checkpointRoot: String, + analyzedPlan: LogicalPlan, + sink: ContinuousWriteSupport, + trigger: Trigger, + triggerClock: Clock, + outputMode: OutputMode, + extraOptions: Map[String, String], + deleteCheckpointOnStop: Boolean) + extends StreamExecution( + sparkSession, name, checkpointRoot, analyzedPlan, sink, + trigger, triggerClock, outputMode, deleteCheckpointOnStop) { + + @volatile protected var continuousSources: Seq[ContinuousReader] = Seq.empty + override protected def sources: Seq[BaseStreamingSource] = continuousSources + + override lazy val logicalPlan: LogicalPlan = { + assert(queryExecutionThread eq Thread.currentThread, + "logicalPlan must be initialized in StreamExecutionThread " + + s"but the current thread was ${Thread.currentThread}") + val toExecutionRelationMap = MutableMap[StreamingRelationV2, ContinuousExecutionRelation]() + analyzedPlan.transform { + case r @ StreamingRelationV2( + source: ContinuousReadSupport, _, extraReaderOptions, output, _) => + toExecutionRelationMap.getOrElseUpdate(r, { + ContinuousExecutionRelation(source, extraReaderOptions, output)(sparkSession) + }) + case StreamingRelationV2(_, sourceName, _, _, _) => + throw new AnalysisException( + s"Data source $sourceName does not support continuous processing.") + } + } + + private val triggerExecutor = trigger match { + case ContinuousTrigger(t) => ProcessingTimeExecutor(ProcessingTime(t), triggerClock) + case _ => throw new IllegalStateException(s"Unsupported type of trigger: $trigger") + } + + override protected def runActivatedStream(sparkSessionForStream: SparkSession): Unit = { + do { + try { + runContinuous(sparkSessionForStream) + } catch { + case _: InterruptedException if state.get().equals(RECONFIGURING) => + // swallow exception and run again + state.set(ACTIVE) + } + } while (state.get() == ACTIVE) + } + + /** + * Populate the start offsets to start the execution at the current offsets stored in the sink + * (i.e. avoid reprocessing data that we have already processed). 
This function must be called + * before any processing occurs and will populate the following fields: + * - currentBatchId + * - committedOffsets + * The basic structure of this method is as follows: + * + * Identify (from the commit log) the latest epoch that has committed + * IF last epoch exists THEN + * Get end offsets for the epoch + * Set those offsets as the current commit progress + * Set the next epoch ID as the last + 1 + * Return the end offsets of the last epoch as start for the next one + * DONE + * ELSE + * Start a new query log + * DONE + */ + private def getStartOffsets(sparkSessionToRunBatches: SparkSession): OffsetSeq = { + // Note that this will need a slight modification for exactly once. If ending offsets were + // reported but not committed for any epochs, we must replay exactly to those offsets. + // For at least once, we can just ignore those reports and risk duplicates. + commitLog.getLatest() match { + case Some((latestEpochId, _)) => + val nextOffsets = offsetLog.get(latestEpochId).getOrElse { + throw new IllegalStateException( + s"Batch $latestEpochId was committed without end epoch offsets!") + } + committedOffsets = nextOffsets.toStreamProgress(sources) + + // Forcibly align commit and offset logs by slicing off any spurious offset logs from + // a previous run. We can't allow commits to an epoch that a previous run reached but + // this run has not. + offsetLog.purgeAfter(latestEpochId) + + currentBatchId = latestEpochId + 1 + logDebug(s"Resuming at epoch $currentBatchId with committed offsets $committedOffsets") + nextOffsets + case None => + // We are starting this stream for the first time. Offsets are all None. + logInfo(s"Starting new streaming query.") + currentBatchId = 0 + OffsetSeq.fill(continuousSources.map(_ => null): _*) + } + } + + /** + * Do a continuous run. + * @param sparkSessionForQuery Isolated [[SparkSession]] to run the continuous query with. + */ + private def runContinuous(sparkSessionForQuery: SparkSession): Unit = { + // A list of attributes that will need to be updated. + val replacements = new ArrayBuffer[(Attribute, Attribute)] + // Translate from continuous relation to the underlying data source. + var nextSourceId = 0 + continuousSources = logicalPlan.collect { + case ContinuousExecutionRelation(dataSource, extraReaderOptions, output) => + val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" + nextSourceId += 1 + + dataSource.createContinuousReader( + java.util.Optional.empty[StructType](), + metadataPath, + new DataSourceV2Options(extraReaderOptions.asJava)) + } + uniqueSources = continuousSources.distinct + + val offsets = getStartOffsets(sparkSessionForQuery) + + var insertedSourceId = 0 + val withNewSources = logicalPlan transform { + case ContinuousExecutionRelation(_, _, output) => + val reader = continuousSources(insertedSourceId) + insertedSourceId += 1 + val newOutput = reader.readSchema().toAttributes + + assert(output.size == newOutput.size, + s"Invalid reader: ${Utils.truncatedString(output, ",")} != " + + s"${Utils.truncatedString(newOutput, ",")}") + replacements ++= output.zip(newOutput) + + val loggedOffset = offsets.offsets(0) + val realOffset = loggedOffset.map(off => reader.deserializeOffset(off.json)) + reader.setOffset(java.util.Optional.ofNullable(realOffset.orNull)) + DataSourceV2Relation(newOutput, reader) + } + + // Rewire the plan to use the new attributes that were returned by the source. 
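+    // Each attribute of the original relation's output is mapped to the corresponding
+    // attribute of the reader's schema via the `replacements` pairs collected above.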
+ val replacementMap = AttributeMap(replacements) + val triggerLogicalPlan = withNewSources transformAllExpressions { + case a: Attribute if replacementMap.contains(a) => + replacementMap(a).withMetadata(a.metadata) + case (_: CurrentTimestamp | _: CurrentDate) => + throw new IllegalStateException( + "CurrentTimestamp and CurrentDate not yet supported for continuous processing") + } + + val writer = sink.createContinuousWriter( + s"$runId", + triggerLogicalPlan.schema, + outputMode, + new DataSourceV2Options(extraOptions.asJava)) + val withSink = WriteToDataSourceV2(writer.get(), triggerLogicalPlan) + + val reader = withSink.collect { + case DataSourceV2Relation(_, r: ContinuousReader) => r + }.head + + reportTimeTaken("queryPlanning") { + lastExecution = new IncrementalExecution( + sparkSessionForQuery, + withSink, + outputMode, + checkpointFile("state"), + runId, + currentBatchId, + offsetSeqMetadata) + lastExecution.executedPlan // Force the lazy generation of execution plan + } + + sparkSession.sparkContext.setLocalProperty( + ContinuousExecution.START_EPOCH_KEY, currentBatchId.toString) + sparkSession.sparkContext.setLocalProperty( + ContinuousExecution.RUN_ID_KEY, runId.toString) + + // Use the parent Spark session for the endpoint since it's where this query ID is registered. + val epochEndpoint = + EpochCoordinatorRef.create( + writer.get(), reader, this, currentBatchId, sparkSession, SparkEnv.get) + val epochUpdateThread = new Thread(new Runnable { + override def run: Unit = { + try { + triggerExecutor.execute(() => { + startTrigger() + + if (reader.needsReconfiguration()) { + state.set(RECONFIGURING) + stopSources() + if (queryExecutionThread.isAlive) { + sparkSession.sparkContext.cancelJobGroup(runId.toString) + queryExecutionThread.interrupt() + // No need to join - this thread is about to end anyway. + } + false + } else if (isActive) { + currentBatchId = epochEndpoint.askSync[Long](IncrementAndGetEpoch) + logInfo(s"New epoch $currentBatchId is starting.") + true + } else { + false + } + }) + } catch { + case _: InterruptedException => + // Cleanly stop the query. + return + } + } + }, s"epoch update thread for $prettyIdString") + + try { + epochUpdateThread.setDaemon(true) + epochUpdateThread.start() + + reportTimeTaken("runContinuous") { + SQLExecution.withNewExecutionId( + sparkSessionForQuery, lastExecution)(lastExecution.toRdd) + } + } finally { + SparkEnv.get.rpcEnv.stop(epochEndpoint) + + epochUpdateThread.interrupt() + epochUpdateThread.join() + } + } + + /** + * Report ending partition offsets for the given reader at the given epoch. + */ + def addOffset( + epoch: Long, reader: ContinuousReader, partitionOffsets: Seq[PartitionOffset]): Unit = { + assert(continuousSources.length == 1, "only one continuous source supported currently") + + if (partitionOffsets.contains(null)) { + // If any offset is null, that means the corresponding partition hasn't seen any data yet, so + // there's nothing meaningful to add to the offset log. + } + val globalOffset = reader.mergeOffsets(partitionOffsets.toArray) + synchronized { + if (queryExecutionThread.isAlive) { + offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) + } else { + return + } + } + } + + /** + * Mark the specified epoch as committed. All readers must have reported end offsets for the epoch + * before this is called. 
+ */ + def commit(epoch: Long): Unit = { + assert(continuousSources.length == 1, "only one continuous source supported currently") + assert(offsetLog.get(epoch).isDefined, s"offset for epoch $epoch not reported before commit") + synchronized { + if (queryExecutionThread.isAlive) { + commitLog.add(epoch) + val offset = offsetLog.get(epoch).get.offsets(0).get + committedOffsets ++= Seq(continuousSources(0) -> offset) + } else { + return + } + } + + if (minLogEntriesToMaintain < currentBatchId) { + offsetLog.purge(currentBatchId - minLogEntriesToMaintain) + commitLog.purge(currentBatchId - minLogEntriesToMaintain) + } + + awaitProgressLock.lock() + try { + awaitProgressLockCondition.signalAll() + } finally { + awaitProgressLock.unlock() + } + } + + /** + * Blocks the current thread until execution has committed at or after the specified epoch. + */ + private[sql] def awaitEpoch(epoch: Long): Unit = { + def notDone = { + val latestCommit = commitLog.getLatest() + latestCommit match { + case Some((latestEpoch, _)) => + latestEpoch < epoch + case None => true + } + } + + while (notDone) { + awaitProgressLock.lock() + try { + awaitProgressLockCondition.await(100, TimeUnit.MILLISECONDS) + if (streamDeathCause != null) { + throw streamDeathCause + } + } finally { + awaitProgressLock.unlock() + } + } + } +} + +object ContinuousExecution { + val START_EPOCH_KEY = "__continuous_start_epoch" + val RUN_ID_KEY = "__run_id" +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index 4c3a1ee201ac1..89a8562b4b59e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -25,7 +25,7 @@ import org.json4s.jackson.Serialization import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.execution.streaming.{RateSourceProvider, RateStreamOffset} +import org.apache.spark.sql.execution.streaming.{RateSourceProvider, RateStreamOffset, ValueRunTimeMsPair} import org.apache.spark.sql.execution.streaming.sources.RateStreamSourceV2 import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2, DataSourceV2Options} import org.apache.spark.sql.sources.v2.reader._ @@ -47,13 +47,14 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { assert(offsets.length == numPartitions) val tuples = offsets.map { - case ContinuousRateStreamPartitionOffset(i, currVal, nextRead) => (i, (currVal, nextRead)) + case ContinuousRateStreamPartitionOffset(i, currVal, nextRead) => + (i, ValueRunTimeMsPair(currVal, nextRead)) } RateStreamOffset(Map(tuples: _*)) } override def deserializeOffset(json: String): Offset = { - RateStreamOffset(Serialization.read[Map[Int, (Long, Long)]](json)) + RateStreamOffset(Serialization.read[Map[Int, ValueRunTimeMsPair]](json)) } override def readSchema(): StructType = RateSourceProvider.SCHEMA @@ -85,8 +86,8 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) // Have each partition advance by numPartitions each row, with starting points staggered // by their partition index. 
RateStreamReadTask( - start._1, // starting row value - start._2, // starting time in ms + start.value, + start.runTimeMs, i, numPartitions, perPartitionRate) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTrigger.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTrigger.scala new file mode 100644 index 0000000000000..90e1766c4d9f1 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousTrigger.scala @@ -0,0 +1,70 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.continuous + +import java.util.concurrent.TimeUnit + +import scala.concurrent.duration.Duration + +import org.apache.commons.lang3.StringUtils + +import org.apache.spark.annotation.{Experimental, InterfaceStability} +import org.apache.spark.sql.streaming.{ProcessingTime, Trigger} +import org.apache.spark.unsafe.types.CalendarInterval + +/** + * A [[Trigger]] that continuously processes streaming data, asynchronously checkpointing at + * the specified interval. + */ +@InterfaceStability.Evolving +case class ContinuousTrigger(intervalMs: Long) extends Trigger { + require(intervalMs >= 0, "the interval of trigger should not be negative") +} + +private[sql] object ContinuousTrigger { + def apply(interval: String): ContinuousTrigger = { + if (StringUtils.isBlank(interval)) { + throw new IllegalArgumentException( + "interval cannot be null or blank.") + } + val cal = if (interval.startsWith("interval")) { + CalendarInterval.fromString(interval) + } else { + CalendarInterval.fromString("interval " + interval) + } + if (cal == null) { + throw new IllegalArgumentException(s"Invalid interval: $interval") + } + if (cal.months > 0) { + throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval") + } + new ContinuousTrigger(cal.microseconds / 1000) + } + + def apply(interval: Duration): ContinuousTrigger = { + ContinuousTrigger(interval.toMillis) + } + + def create(interval: String): ContinuousTrigger = { + apply(interval) + } + + def create(interval: Long, unit: TimeUnit): ContinuousTrigger = { + ContinuousTrigger(unit.toMillis(interval)) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala new file mode 100644 index 0000000000000..7f1e8abd79b99 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.streaming.continuous + +import java.util.concurrent.atomic.AtomicLong + +import scala.collection.mutable + +import org.apache.spark.SparkEnv +import org.apache.spark.internal.Logging +import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.execution.streaming.StreamingQueryWrapper +import org.apache.spark.sql.sources.v2.reader.{ContinuousReader, PartitionOffset} +import org.apache.spark.sql.sources.v2.writer.{ContinuousWriter, WriterCommitMessage} +import org.apache.spark.util.RpcUtils + +private[continuous] sealed trait EpochCoordinatorMessage extends Serializable + +// Driver epoch trigger message +/** + * Atomically increment the current epoch and get the new value. + */ +private[sql] case object IncrementAndGetEpoch extends EpochCoordinatorMessage + +// Init messages +/** + * Set the reader and writer partition counts. Tasks may not be started until the coordinator + * has acknowledged these messages. + */ +private[sql] case class SetReaderPartitions(numPartitions: Int) extends EpochCoordinatorMessage +case class SetWriterPartitions(numPartitions: Int) extends EpochCoordinatorMessage + +// Partition task messages +/** + * Get the current epoch. + */ +private[sql] case object GetCurrentEpoch extends EpochCoordinatorMessage +/** + * Commit a partition at the specified epoch with the given message. + */ +private[sql] case class CommitPartitionEpoch( + partitionId: Int, + epoch: Long, + message: WriterCommitMessage) extends EpochCoordinatorMessage +/** + * Report that a partition is ending the specified epoch at the specified offset. + */ +private[sql] case class ReportPartitionOffset( + partitionId: Int, + epoch: Long, + offset: PartitionOffset) extends EpochCoordinatorMessage + + +/** Helper object used to create reference to [[EpochCoordinator]]. */ +private[sql] object EpochCoordinatorRef extends Logging { + private def endpointName(runId: String) = s"EpochCoordinator-$runId" + + /** + * Create a reference to a new [[EpochCoordinator]]. 
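+   * The endpoint is registered against the driver's RpcEnv under a name derived from the
+   * query's run ID, so that executors can later obtain a reference to it via `get`.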
+ */ + def create( + writer: ContinuousWriter, + reader: ContinuousReader, + query: ContinuousExecution, + startEpoch: Long, + session: SparkSession, + env: SparkEnv): RpcEndpointRef = synchronized { + val coordinator = new EpochCoordinator( + writer, reader, query, startEpoch, session, env.rpcEnv) + val ref = env.rpcEnv.setupEndpoint(endpointName(query.runId.toString()), coordinator) + logInfo("Registered EpochCoordinator endpoint") + ref + } + + def get(runId: String, env: SparkEnv): RpcEndpointRef = synchronized { + val rpcEndpointRef = RpcUtils.makeDriverRef(endpointName(runId), env.conf, env.rpcEnv) + logDebug("Retrieved existing EpochCoordinator endpoint") + rpcEndpointRef + } +} + +/** + * Handles three major epoch coordination tasks for continuous processing: + * + * * Maintains a local epoch counter (the "driver epoch"), incremented by IncrementAndGetEpoch + * and pollable from executors by GetCurrentEpoch. Note that this epoch is *not* immediately + * reflected anywhere in ContinuousExecution. + * * Collates ReportPartitionOffset messages, and forwards to ContinuousExecution when all + * readers have ended a given epoch. + * * Collates CommitPartitionEpoch messages, and forwards to ContinuousExecution when all readers + * have both committed and reported an end offset for a given epoch. + */ +private[continuous] class EpochCoordinator( + writer: ContinuousWriter, + reader: ContinuousReader, + query: ContinuousExecution, + startEpoch: Long, + session: SparkSession, + override val rpcEnv: RpcEnv) + extends ThreadSafeRpcEndpoint with Logging { + + private var numReaderPartitions: Int = _ + private var numWriterPartitions: Int = _ + + private var currentDriverEpoch = startEpoch + + // (epoch, partition) -> message + private val partitionCommits = + mutable.Map[(Long, Int), WriterCommitMessage]() + // (epoch, partition) -> offset + private val partitionOffsets = + mutable.Map[(Long, Int), PartitionOffset]() + + private def resolveCommitsAtEpoch(epoch: Long) = { + val thisEpochCommits = + partitionCommits.collect { case ((e, _), msg) if e == epoch => msg } + val nextEpochOffsets = + partitionOffsets.collect { case ((e, _), o) if e == epoch => o } + + if (thisEpochCommits.size == numWriterPartitions && + nextEpochOffsets.size == numReaderPartitions) { + logDebug(s"Epoch $epoch has received commits from all partitions. Committing globally.") + // Sequencing is important here. We must commit to the writer before recording the commit + // in the query, or we will end up dropping the commit if we restart in the middle. + writer.commit(epoch, thisEpochCommits.toArray) + query.commit(epoch) + + // Cleanup state from before this epoch, now that we know all partitions are forever past it. 
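+      // Both maps are keyed by (epoch, partition), so any entry with an earlier epoch is stale.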
+      for (k <- partitionCommits.keys.filter { case (e, _) => e < epoch }) {
+        partitionCommits.remove(k)
+      }
+      for (k <- partitionOffsets.keys.filter { case (e, _) => e < epoch }) {
+        partitionOffsets.remove(k)
+      }
+    }
+  }
+
+  override def receive: PartialFunction[Any, Unit] = {
+    case CommitPartitionEpoch(partitionId, epoch, message) =>
+      logDebug(s"Got commit from partition $partitionId at epoch $epoch: $message")
+      if (!partitionCommits.isDefinedAt((epoch, partitionId))) {
+        partitionCommits.put((epoch, partitionId), message)
+        resolveCommitsAtEpoch(epoch)
+      }
+
+    case ReportPartitionOffset(partitionId, epoch, offset) =>
+      partitionOffsets.put((epoch, partitionId), offset)
+      val thisEpochOffsets =
+        partitionOffsets.collect { case ((e, _), o) if e == epoch => o }
+      if (thisEpochOffsets.size == numReaderPartitions) {
+        logDebug(s"Epoch $epoch has offsets reported from all partitions: $thisEpochOffsets")
+        query.addOffset(epoch, reader, thisEpochOffsets.toSeq)
+        resolveCommitsAtEpoch(epoch)
+      }
+  }
+
+  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
+    case GetCurrentEpoch =>
+      val result = currentDriverEpoch
+      logDebug(s"Epoch $result")
+      context.reply(result)
+
+    case IncrementAndGetEpoch =>
+      currentDriverEpoch += 1
+      context.reply(currentDriverEpoch)
+
+    case SetReaderPartitions(numPartitions) =>
+      numReaderPartitions = numPartitions
+      context.reply(())
+
+    case SetWriterPartitions(numPartitions) =>
+      numWriterPartitions = numPartitions
+      context.reply(())
+  }
+}
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala
index 45dc7d75cbc8d..1c66aed8690a7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala
@@ -27,7 +27,7 @@ import org.json4s.jackson.Serialization
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.execution.streaming.RateStreamOffset
+import org.apache.spark.sql.execution.streaming.{RateStreamOffset, ValueRunTimeMsPair}
 import org.apache.spark.sql.sources.v2.DataSourceV2Options
 import org.apache.spark.sql.sources.v2.reader._
 import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType}
@@ -71,7 +71,7 @@ class RateStreamV2Reader(options: DataSourceV2Options)
     val currentTime = clock.getTimeMillis()
     RateStreamOffset(
       this.start.partitionToValueAndRunTimeMs.map {
-        case startOffset @ (part, (currentVal, currentReadTime)) =>
+        case startOffset @ (part, ValueRunTimeMsPair(currentVal, currentReadTime)) =>
           // Calculate the number of rows we should advance in this partition (based on the
           // current time), and output a corresponding offset.
val readInterval = currentTime - currentReadTime @@ -79,9 +79,9 @@ class RateStreamV2Reader(options: DataSourceV2Options) if (numNewRows <= 0) { startOffset } else { - (part, - (currentVal + (numNewRows * numPartitions), - currentReadTime + (numNewRows * msPerPartitionBetweenRows))) + (part, ValueRunTimeMsPair( + currentVal + (numNewRows * numPartitions), + currentReadTime + (numNewRows * msPerPartitionBetweenRows))) } } ) @@ -98,15 +98,15 @@ class RateStreamV2Reader(options: DataSourceV2Options) } override def deserializeOffset(json: String): Offset = { - RateStreamOffset(Serialization.read[Map[Int, (Long, Long)]](json)) + RateStreamOffset(Serialization.read[Map[Int, ValueRunTimeMsPair]](json)) } override def createReadTasks(): java.util.List[ReadTask[Row]] = { val startMap = start.partitionToValueAndRunTimeMs val endMap = end.partitionToValueAndRunTimeMs endMap.keys.toSeq.map { part => - val (endVal, _) = endMap(part) - val (startVal, startTimeMs) = startMap(part) + val ValueRunTimeMsPair(endVal, _) = endMap(part) + val ValueRunTimeMsPair(startVal, startTimeMs) = startMap(part) val packedRows = mutable.ListBuffer[(Long, Long)]() var outVal = startVal + numPartitions @@ -158,7 +158,8 @@ object RateStreamSourceV2 { // by the increment that will later be applied. The first row output in each // partition will have a value equal to the partition index. (i, - ((i - numPartitions).toLong, + ValueRunTimeMsPair( + (i - numPartitions).toLong, creationTimeMs)) }.toMap) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala index 94c5dd63089b1..972248d5e4df8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala @@ -25,6 +25,8 @@ import scala.util.control.NonFatal import org.apache.spark.internal.Logging import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Complete, Update} import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources.v2.{ContinuousWriteSupport, DataSourceV2, DataSourceV2Options, MicroBatchWriteSupport} @@ -177,3 +179,14 @@ class MemoryDataWriter(partition: Int, outputMode: OutputMode) override def abort(): Unit = {} } + + +/** + * Used to query the data that has been written into a [[MemorySink]]. 
+ */ +case class MemoryPlanV2(sink: MemorySinkV2, override val output: Seq[Attribute]) extends LeafNode { + private val sizePerRow = output.map(_.dataType.defaultSize).sum + + override def computeStats(): Statistics = Statistics(sizePerRow * sink.allData.size) +} + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 41aa02c2b5e35..f17935e86f459 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -26,8 +26,10 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession} import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.execution.streaming.StreamingRelation +import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} +import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2Options, MicroBatchReadSupport} import org.apache.spark.sql.types.StructType +import org.apache.spark.util.Utils /** * Interface used to load a streaming `Dataset` from external storage systems (e.g. file systems, @@ -153,13 +155,33 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo "read files of Hive data source directly.") } - val dataSource = - DataSource( - sparkSession, - userSpecifiedSchema = userSpecifiedSchema, - className = source, - options = extraOptions.toMap) - Dataset.ofRows(sparkSession, StreamingRelation(dataSource)) + val ds = DataSource.lookupDataSource(source, sparkSession.sqlContext.conf).newInstance() + val options = new DataSourceV2Options(extraOptions.asJava) + // We need to generate the V1 data source so we can pass it to the V2 relation as a shim. + // We can't be sure at this point whether we'll actually want to use V2, since we don't know the + // writer or whether the query is continuous. + val v1DataSource = DataSource( + sparkSession, + userSpecifiedSchema = userSpecifiedSchema, + className = source, + options = extraOptions.toMap) + ds match { + case s: ContinuousReadSupport => + val tempReader = s.createContinuousReader( + java.util.Optional.ofNullable(userSpecifiedSchema.orNull), + Utils.createTempDir(namePrefix = s"temporaryReader").getCanonicalPath, + options) + // Generate the V1 node to catch errors thrown within generation. + StreamingRelation(v1DataSource) + Dataset.ofRows( + sparkSession, + StreamingRelationV2( + s, source, extraOptions.toMap, + tempReader.readSchema().toAttributes, v1DataSource)(sparkSession)) + case _ => + // Code path for data source v1. 
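+        // Sources that do not implement ContinuousReadSupport keep using the existing V1
+        // StreamingRelation plan node.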
+ Dataset.ofRows(sparkSession, StreamingRelation(v1DataSource)) + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index 0be69b98abc8a..db588ae282f38 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -26,7 +26,9 @@ import org.apache.spark.sql.{AnalysisException, Dataset, ForeachWriter} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.execution.streaming.{ForeachSink, MemoryPlan, MemorySink} +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger +import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. file systems, @@ -240,14 +242,23 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { if (extraOptions.get("queryName").isEmpty) { throw new AnalysisException("queryName must be specified for memory sink") } - val sink = new MemorySink(df.schema, outputMode) - val resultDf = Dataset.ofRows(df.sparkSession, new MemoryPlan(sink)) + val (sink, resultDf) = trigger match { + case _: ContinuousTrigger => + val s = new MemorySinkV2() + val r = Dataset.ofRows(df.sparkSession, new MemoryPlanV2(s, df.schema.toAttributes)) + (s, r) + case _ => + val s = new MemorySink(df.schema, outputMode) + val r = Dataset.ofRows(df.sparkSession, new MemoryPlan(s)) + (s, r) + } val chkpointLoc = extraOptions.get("checkpointLocation") val recoverFromChkpoint = outputMode == OutputMode.Complete() val query = df.sparkSession.sessionState.streamingQueryManager.startQuery( extraOptions.get("queryName"), chkpointLoc, df, + extraOptions.toMap, sink, outputMode, useTempCheckpointLocation = true, @@ -262,6 +273,7 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { extraOptions.get("queryName"), extraOptions.get("checkpointLocation"), df, + extraOptions.toMap, sink, outputMode, useTempCheckpointLocation = true, @@ -277,6 +289,7 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { extraOptions.get("queryName"), extraOptions.get("checkpointLocation"), df, + extraOptions.toMap, dataSource.createSink(outputMode), outputMode, useTempCheckpointLocation = source == "console", diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 555d6e23f9385..e808ffaa96410 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -29,8 +29,10 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.v2.ContinuousWriteSupport import 
org.apache.spark.util.{Clock, SystemClock, Utils} /** @@ -188,7 +190,8 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo userSpecifiedName: Option[String], userSpecifiedCheckpointLocation: Option[String], df: DataFrame, - sink: Sink, + extraOptions: Map[String, String], + sink: BaseStreamingSink, outputMode: OutputMode, useTempCheckpointLocation: Boolean, recoverFromCheckpointLocation: Boolean, @@ -237,16 +240,32 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo "is not supported in streaming DataFrames/Datasets and will be disabled.") } - new StreamingQueryWrapper(new MicroBatchExecution( - sparkSession, - userSpecifiedName.orNull, - checkpointLocation, - analyzedPlan, - sink, - trigger, - triggerClock, - outputMode, - deleteCheckpointOnStop)) + sink match { + case v1Sink: Sink => + new StreamingQueryWrapper(new MicroBatchExecution( + sparkSession, + userSpecifiedName.orNull, + checkpointLocation, + analyzedPlan, + v1Sink, + trigger, + triggerClock, + outputMode, + deleteCheckpointOnStop)) + case v2Sink: ContinuousWriteSupport => + UnsupportedOperationChecker.checkForContinuous(analyzedPlan, outputMode) + new StreamingQueryWrapper(new ContinuousExecution( + sparkSession, + userSpecifiedName.orNull, + checkpointLocation, + analyzedPlan, + v2Sink, + trigger, + triggerClock, + outputMode, + extraOptions, + deleteCheckpointOnStop)) + } } /** @@ -269,7 +288,8 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo userSpecifiedName: Option[String], userSpecifiedCheckpointLocation: Option[String], df: DataFrame, - sink: Sink, + extraOptions: Map[String, String], + sink: BaseStreamingSink, outputMode: OutputMode, useTempCheckpointLocation: Boolean = false, recoverFromCheckpointLocation: Boolean = true, @@ -279,6 +299,7 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo userSpecifiedName, userSpecifiedCheckpointLocation, df, + extraOptions, sink, outputMode, useTempCheckpointLocation, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala index fcaca3d75b74f..9fb8be423614b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/QueryTest.scala @@ -297,31 +297,47 @@ object QueryTest { }) } + private def genError( + expectedAnswer: Seq[Row], + sparkAnswer: Seq[Row], + isSorted: Boolean = false): String = { + val getRowType: Option[Row] => String = row => + row.map(row => + if (row.schema == null) { + "struct<>" + } else { + s"${row.schema.catalogString}" + }).getOrElse("struct<>") + + s""" + |== Results == + |${ + sideBySide( + s"== Correct Answer - ${expectedAnswer.size} ==" +: + getRowType(expectedAnswer.headOption) +: + prepareAnswer(expectedAnswer, isSorted).map(_.toString()), + s"== Spark Answer - ${sparkAnswer.size} ==" +: + getRowType(sparkAnswer.headOption) +: + prepareAnswer(sparkAnswer, isSorted).map(_.toString())).mkString("\n") + } + """.stripMargin + } + + def includesRows( + expectedRows: Seq[Row], + sparkAnswer: Seq[Row]): Option[String] = { + if (!prepareAnswer(expectedRows, true).toSet.subsetOf(prepareAnswer(sparkAnswer, true).toSet)) { + return Some(genError(expectedRows, sparkAnswer, true)) + } + None + } + def sameRows( expectedAnswer: Seq[Row], sparkAnswer: Seq[Row], isSorted: Boolean = false): Option[String] = { if (prepareAnswer(expectedAnswer, isSorted) != prepareAnswer(sparkAnswer, isSorted)) { - 
val getRowType: Option[Row] => String = row => - row.map(row => - if (row.schema == null) { - "struct<>" - } else { - s"${row.schema.catalogString}" - }).getOrElse("struct<>") - - val errorMessage = - s""" - |== Results == - |${sideBySide( - s"== Correct Answer - ${expectedAnswer.size} ==" +: - getRowType(expectedAnswer.headOption) +: - prepareAnswer(expectedAnswer, isSorted).map(_.toString()), - s"== Spark Answer - ${sparkAnswer.size} ==" +: - getRowType(sparkAnswer.headOption) +: - prepareAnswer(sparkAnswer, isSorted).map(_.toString())).mkString("\n")} - """.stripMargin - return Some(errorMessage) + return Some(genError(expectedAnswer, sparkAnswer, isSorted)) } None } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index 6514c5f0fdfeb..dc833b2ccaa22 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -29,16 +29,6 @@ import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2Optio import org.apache.spark.sql.streaming.StreamTest class RateSourceV2Suite extends StreamTest { - test("microbatch in registry") { - DataSource.lookupDataSource("rate", spark.sqlContext.conf).newInstance() match { - case ds: MicroBatchReadSupport => - val reader = ds.createMicroBatchReader(Optional.empty(), "", DataSourceV2Options.empty()) - assert(reader.isInstanceOf[RateStreamV2Reader]) - case _ => - throw new IllegalStateException("Could not find v2 read support for rate") - } - } - test("microbatch - numPartitions propagated") { val reader = new RateStreamV2Reader( new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) @@ -49,8 +39,8 @@ class RateSourceV2Suite extends StreamTest { test("microbatch - set offset") { val reader = new RateStreamV2Reader(DataSourceV2Options.empty()) - val startOffset = RateStreamOffset(Map((0, (0, 1000)))) - val endOffset = RateStreamOffset(Map((0, (0, 2000)))) + val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) + val endOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 2000)))) reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) assert(reader.getStartOffset() == startOffset) assert(reader.getEndOffset() == endOffset) @@ -63,15 +53,15 @@ class RateSourceV2Suite extends StreamTest { reader.setOffsetRange(Optional.empty(), Optional.empty()) reader.getStartOffset() match { case r: RateStreamOffset => - assert(r.partitionToValueAndRunTimeMs(0)._2 == reader.creationTimeMs) + assert(r.partitionToValueAndRunTimeMs(0).runTimeMs == reader.creationTimeMs) case _ => throw new IllegalStateException("unexpected offset type") } reader.getEndOffset() match { case r: RateStreamOffset => // End offset may be a bit beyond 100 ms/9 rows after creation if the wait lasted // longer than 100ms. It should never be early. 
- assert(r.partitionToValueAndRunTimeMs(0)._1 >= 9) - assert(r.partitionToValueAndRunTimeMs(0)._2 >= reader.creationTimeMs + 100) + assert(r.partitionToValueAndRunTimeMs(0).value >= 9) + assert(r.partitionToValueAndRunTimeMs(0).runTimeMs >= reader.creationTimeMs + 100) case _ => throw new IllegalStateException("unexpected offset type") } @@ -80,8 +70,8 @@ class RateSourceV2Suite extends StreamTest { test("microbatch - predetermined batch size") { val reader = new RateStreamV2Reader( new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "20").asJava)) - val startOffset = RateStreamOffset(Map((0, (0, 1000)))) - val endOffset = RateStreamOffset(Map((0, (20, 2000)))) + val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) + val endOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(20, 2000)))) reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) val tasks = reader.createReadTasks() assert(tasks.size == 1) @@ -93,8 +83,8 @@ class RateSourceV2Suite extends StreamTest { new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) val startOffset = RateStreamSourceV2.createInitialOffset(11, reader.creationTimeMs) val endOffset = RateStreamOffset(startOffset.partitionToValueAndRunTimeMs.toSeq.map { - case (part, (currentVal, currentReadTime)) => - (part, (currentVal + 33, currentReadTime + 1000)) + case (part, ValueRunTimeMsPair(currentVal, currentReadTime)) => + (part, ValueRunTimeMsPair(currentVal + 33, currentReadTime + 1000)) }.toMap) reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) @@ -135,7 +125,7 @@ class RateSourceV2Suite extends StreamTest { val startTimeMs = reader.getStartOffset() .asInstanceOf[RateStreamOffset] .partitionToValueAndRunTimeMs(t.partitionIndex) - ._2 + .runTimeMs val r = t.createDataReader().asInstanceOf[RateStreamDataReader] for (rowIndex <- 0 to 9) { r.next() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala index 755490308b5b9..c65e5d3dd75c2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala @@ -77,10 +77,23 @@ class StreamSuite extends StreamTest { } test("StreamingRelation.computeStats") { + withTempDir { dir => + val df = spark.readStream.format("csv").schema(StructType(Seq())).load(dir.getCanonicalPath) + val streamingRelation = df.logicalPlan collect { + case s: StreamingRelation => s + } + assert(streamingRelation.nonEmpty, "cannot find StreamingRelation") + assert( + streamingRelation.head.computeStats.sizeInBytes == + spark.sessionState.conf.defaultSizeInBytes) + } + } + + test("StreamingRelationV2.computeStats") { val streamingRelation = spark.readStream.format("rate").load().logicalPlan collect { - case s: StreamingRelation => s + case s: StreamingRelationV2 => s } - assert(streamingRelation.nonEmpty, "cannot find StreamingRelation") + assert(streamingRelation.nonEmpty, "cannot find StreamingExecutionRelation") assert( streamingRelation.head.computeStats.sizeInBytes == spark.sessionState.conf.defaultSizeInBytes) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 71a474ef63e84..fb9ebc81dd750 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -33,11 +33,14 @@ import org.scalatest.exceptions.TestFailedDueToTimeoutException import org.scalatest.time.Span import org.scalatest.time.SpanSugar._ +import org.apache.spark.SparkEnv import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row} import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, EpochCoordinatorRef, IncrementAndGetEpoch} +import org.apache.spark.sql.execution.streaming.sources.MemorySinkV2 import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.streaming.StreamingQueryListener._ import org.apache.spark.sql.test.SharedSQLContext @@ -168,6 +171,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be private def operatorName = if (lastOnly) "CheckLastBatch" else "CheckAnswer" } + case class CheckAnswerRowsContains(expectedAnswer: Seq[Row], lastOnly: Boolean = false) + extends StreamAction with StreamMustBeRunning { + override def toString: String = s"$operatorName: ${expectedAnswer.mkString(",")}" + private def operatorName = if (lastOnly) "CheckLastBatch" else "CheckAnswer" + } + case class CheckAnswerRowsByFunc(checkFunction: Row => Unit, lastOnly: Boolean) extends StreamAction with StreamMustBeRunning { override def toString: String = s"$operatorName: ${checkFunction.toString()}" @@ -237,6 +246,25 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be AssertOnQuery(query => { func(query); true }) } + object AwaitEpoch { + def apply(epoch: Long): AssertOnQuery = + Execute { + case s: ContinuousExecution => s.awaitEpoch(epoch) + case _ => throw new IllegalStateException("microbatch cannot await epoch") + } + } + + object IncrementEpoch { + def apply(): AssertOnQuery = + Execute { + case s: ContinuousExecution => + val newEpoch = EpochCoordinatorRef.get(s.runId.toString, SparkEnv.get) + .askSync[Long](IncrementAndGetEpoch) + s.awaitEpoch(newEpoch - 1) + case _ => throw new IllegalStateException("microbatch cannot increment epoch") + } + } + /** * Executes the specified actions on the given streaming DataFrame and provides helpful * error messages in the case of failures or incorrect answers. 
@@ -246,7 +274,8 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be */ def testStream( _stream: Dataset[_], - outputMode: OutputMode = OutputMode.Append)(actions: StreamAction*): Unit = synchronized { + outputMode: OutputMode = OutputMode.Append, + useV2Sink: Boolean = false)(actions: StreamAction*): Unit = synchronized { import org.apache.spark.sql.streaming.util.StreamManualClock // `synchronized` is added to prevent the user from calling multiple `testStream`s concurrently @@ -259,7 +288,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be var currentStream: StreamExecution = null var lastStream: StreamExecution = null val awaiting = new mutable.HashMap[Int, Offset]() // source index -> offset to wait for - val sink = new MemorySink(stream.schema, outputMode) + val sink = if (useV2Sink) new MemorySinkV2 else new MemorySink(stream.schema, outputMode) val resetConfValues = mutable.Map[String, Option[String]]() @volatile @@ -308,7 +337,11 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be "" } - def testState = + def testState = { + val sinkDebugString = sink match { + case s: MemorySink => s.toDebugString + case s: MemorySinkV2 => s.toDebugString + } s""" |== Progress == |$testActions @@ -321,12 +354,13 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be |${if (streamThreadDeathCause != null) stackTraceToString(streamThreadDeathCause) else ""} | |== Sink == - |${sink.toDebugString} + |$sinkDebugString | | |== Plan == |${if (currentStream != null) currentStream.lastExecution else ""} """.stripMargin + } def verify(condition: => Boolean, message: String): Unit = { if (!condition) { @@ -383,7 +417,11 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be } } - try if (lastOnly) sink.latestBatchData else sink.allData catch { + val (latestBatchData, allData) = sink match { + case s: MemorySink => (s.latestBatchData, s.allData) + case s: MemorySinkV2 => (s.latestBatchData, s.allData) + } + try if (lastOnly) latestBatchData else allData catch { case e: Exception => failTest("Exception while getting data from sink", e) } @@ -423,6 +461,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be None, Some(metadataRoot), stream, + Map(), sink, outputMode, trigger = trigger, @@ -594,6 +633,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be error => failTest(error) } + case CheckAnswerRowsContains(expectedAnswer, lastOnly) => + val sparkAnswer = fetchStreamAnswer(currentStream, lastOnly) + QueryTest.includesRows(expectedAnswer, sparkAnswer).foreach { + error => failTest(error) + } + case CheckAnswerRowsByFunc(checkFunction, lastOnly) => val sparkAnswer = fetchStreamAnswer(currentStream, lastOnly) sparkAnswer.foreach { row => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala new file mode 100644 index 0000000000000..eda0d8ad48313 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala @@ -0,0 +1,316 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.streaming.continuous + +import java.io.{File, InterruptedIOException, IOException, UncheckedIOException} +import java.nio.channels.ClosedByInterruptException +import java.util.concurrent.{CountDownLatch, ExecutionException, TimeoutException, TimeUnit} + +import scala.reflect.ClassTag +import scala.util.control.ControlThrowable + +import com.google.common.util.concurrent.UncheckedExecutionException +import org.apache.commons.io.FileUtils +import org.apache.hadoop.conf.Configuration + +import org.apache.spark.{SparkContext, SparkEnv} +import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.plans.logical.Range +import org.apache.spark.sql.catalyst.streaming.InternalOutputModes +import org.apache.spark.sql.execution.command.ExplainCommand +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanExec, WriteToDataSourceV2Exec} +import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.continuous._ +import org.apache.spark.sql.execution.streaming.sources.MemorySinkV2 +import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreConf, StateStoreId, StateStoreProvider} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.StreamSourceProvider +import org.apache.spark.sql.streaming.{StreamTest, Trigger} +import org.apache.spark.sql.streaming.util.StreamManualClock +import org.apache.spark.sql.test.TestSparkSession +import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils + +class ContinuousSuiteBase extends StreamTest { + // We need more than the default local[2] to be able to schedule all partitions simultaneously. + override protected def createSparkSession = new TestSparkSession( + new SparkContext( + "local[10]", + "continuous-stream-test-sql-context", + sparkConf.set("spark.sql.testkey", "true"))) + + protected def waitForRateSourceTriggers(query: StreamExecution, numTriggers: Int): Unit = { + query match { + case s: ContinuousExecution => + assert(numTriggers >= 2, "must wait for at least 2 triggers to ensure query is initialized") + val reader = s.lastExecution.executedPlan.collectFirst { + case DataSourceV2ScanExec(_, r: ContinuousRateStreamReader) => r + }.get + + val deltaMs = numTriggers * 1000 + 300 + while (System.currentTimeMillis < reader.creationTime + deltaMs) { + Thread.sleep(reader.creationTime + deltaMs - System.currentTimeMillis) + } + } + } + + // A continuous trigger that will only fire the initial time for the duration of a test. + // This allows clean testing with manual epoch advancement. 
+ protected val longContinuousTrigger = Trigger.Continuous("1 hour") +} + +class ContinuousSuite extends ContinuousSuiteBase { + import testImplicits._ + + test("basic rate source") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "5") + .load() + .select('value) + + testStream(df, useV2Sink = true)( + StartStream(longContinuousTrigger), + AwaitEpoch(0), + Execute(waitForRateSourceTriggers(_, 2)), + IncrementEpoch(), + CheckAnswerRowsContains(scala.Range(0, 10).map(Row(_))), + StopStream, + StartStream(longContinuousTrigger), + AwaitEpoch(2), + Execute(waitForRateSourceTriggers(_, 2)), + IncrementEpoch(), + CheckAnswerRowsContains(scala.Range(0, 20).map(Row(_))), + StopStream) + } + + test("map") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "5") + .load() + .select('value) + .map(r => r.getLong(0) * 2) + + testStream(df, useV2Sink = true)( + StartStream(longContinuousTrigger), + AwaitEpoch(0), + Execute(waitForRateSourceTriggers(_, 2)), + IncrementEpoch(), + Execute(waitForRateSourceTriggers(_, 4)), + IncrementEpoch(), + CheckAnswerRowsContains(scala.Range(0, 40, 2).map(Row(_)))) + } + + test("flatMap") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "5") + .load() + .select('value) + .flatMap(r => Seq(0, r.getLong(0), r.getLong(0) * 2)) + + testStream(df, useV2Sink = true)( + StartStream(longContinuousTrigger), + AwaitEpoch(0), + Execute(waitForRateSourceTriggers(_, 2)), + IncrementEpoch(), + Execute(waitForRateSourceTriggers(_, 4)), + IncrementEpoch(), + CheckAnswerRowsContains(scala.Range(0, 20).flatMap(n => Seq(0, n, n * 2)).map(Row(_)))) + } + + test("filter") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "5") + .load() + .select('value) + .where('value > 5) + + testStream(df, useV2Sink = true)( + StartStream(longContinuousTrigger), + AwaitEpoch(0), + Execute(waitForRateSourceTriggers(_, 2)), + IncrementEpoch(), + Execute(waitForRateSourceTriggers(_, 4)), + IncrementEpoch(), + CheckAnswerRowsContains(scala.Range(6, 20).map(Row(_)))) + } + + test("deduplicate") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "5") + .load() + .select('value) + .dropDuplicates() + + val except = intercept[AnalysisException] { + testStream(df, useV2Sink = true)(StartStream(longContinuousTrigger)) + } + + assert(except.message.contains( + "Continuous processing does not support Deduplicate operations.")) + } + + test("timestamp") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "5") + .load() + .select(current_timestamp()) + + val except = intercept[AnalysisException] { + testStream(df, useV2Sink = true)(StartStream(longContinuousTrigger)) + } + + assert(except.message.contains( + "Continuous processing does not support current time operations.")) + } + + test("repeatedly restart") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "5") + .load() + .select('value) + + testStream(df, useV2Sink = true)( + StartStream(longContinuousTrigger), + AwaitEpoch(0), + Execute(waitForRateSourceTriggers(_, 2)), + IncrementEpoch(), + CheckAnswerRowsContains(scala.Range(0, 10).map(Row(_))), + StopStream, + StartStream(longContinuousTrigger), + StopStream, + StartStream(longContinuousTrigger), + StopStream, + 
StartStream(longContinuousTrigger), + AwaitEpoch(2), + Execute(waitForRateSourceTriggers(_, 2)), + IncrementEpoch(), + CheckAnswerRowsContains(scala.Range(0, 20).map(Row(_))), + StopStream) + } + + test("query without test harness") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "2") + .option("rowsPerSecond", "2") + .load() + .select('value) + val query = df.writeStream + .format("memory") + .queryName("noharness") + .trigger(Trigger.Continuous(100)) + .start() + val continuousExecution = + query.asInstanceOf[StreamingQueryWrapper].streamingQuery.asInstanceOf[ContinuousExecution] + continuousExecution.awaitEpoch(0) + waitForRateSourceTriggers(continuousExecution, 2) + query.stop() + + val results = spark.read.table("noharness").collect() + assert(Set(0, 1, 2, 3).map(Row(_)).subsetOf(results.toSet)) + } +} + +class ContinuousStressSuite extends ContinuousSuiteBase { + import testImplicits._ + + test("only one epoch") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "500") + .load() + .select('value) + + testStream(df, useV2Sink = true)( + StartStream(longContinuousTrigger), + AwaitEpoch(0), + Execute(waitForRateSourceTriggers(_, 201)), + IncrementEpoch(), + Execute { query => + val data = query.sink.asInstanceOf[MemorySinkV2].allData + val vals = data.map(_.getLong(0)).toSet + assert(scala.Range(0, 25000).forall { i => + vals.contains(i) + }) + }) + } + + test("automatic epoch advancement") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "500") + .load() + .select('value) + + testStream(df, useV2Sink = true)( + StartStream(Trigger.Continuous(2012)), + AwaitEpoch(0), + Execute(waitForRateSourceTriggers(_, 201)), + IncrementEpoch(), + CheckAnswerRowsContains(scala.Range(0, 25000).map(Row(_)))) + } + + test("restarts") { + val df = spark.readStream + .format("rate") + .option("numPartitions", "5") + .option("rowsPerSecond", "500") + .load() + .select('value) + + testStream(df, useV2Sink = true)( + StartStream(Trigger.Continuous(2012)), + AwaitEpoch(10), + StopStream, + StartStream(Trigger.Continuous(2012)), + AwaitEpoch(20), + StopStream, + StartStream(Trigger.Continuous(2012)), + AwaitEpoch(21), + StopStream, + StartStream(Trigger.Continuous(2012)), + AwaitEpoch(22), + StopStream, + StartStream(Trigger.Continuous(2012)), + AwaitEpoch(25), + StopStream, + StartStream(Trigger.Continuous(2012)), + StopStream, + StartStream(Trigger.Continuous(2012)), + AwaitEpoch(50), + CheckAnswerRowsContains(scala.Range(0, 25000).map(Row(_)))) + } +} From 86db9b2d7d3c3c8e6c27b05cb36318dba9138e78 Mon Sep 17 00:00:00 2001 From: chetkhatri Date: Sat, 23 Dec 2017 08:13:34 -0600 Subject: [PATCH 181/356] [SPARK-22833][IMPROVEMENT] in SparkHive Scala Examples ## What changes were proposed in this pull request? SparkHive Scala Examples Improvement made: * Writing DataFrame / DataSet to Hive Managed , Hive External table using different storage format. * Implementation of Partition, Reparition, Coalesce with appropriate example. ## How was this patch tested? * Patch has been tested manually and by running ./dev/run-tests. Author: chetkhatri Closes #20018 from chetkhatri/scala-sparkhive-examples. 
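As a quick companion to the description above, a minimal spark-shell sketch of the three write patterns the example exercises: a managed Parquet table, a partitioned write to an external-style location, and repartition versus coalesce before that write. It assumes a Hive-enabled `spark` session; the table name, output path and sample rows are illustrative placeholders rather than values from the example file.

```scala
// Assumes a spark-shell session built with Hive support; `spark` and its
// implicits are already in scope there.
import org.apache.spark.sql.SaveMode
import spark.implicits._

// Placeholder data standing in for the example's key/value records.
val recordsDF = (1 to 100).map(i => (i, s"val_$i")).toDF("key", "value")

// 1. Managed table stored as Parquet: Spark owns both metadata and data.
recordsDF.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("records_managed")

// 2. External-style layout: write Parquet under an explicit path that a
//    CREATE EXTERNAL TABLE ... LOCATION statement can point at later.
val externalPath = "/tmp/records_external"   // placeholder location
recordsDF.write.mode(SaveMode.Overwrite).partitionBy("key").parquet(externalPath)

// 3a. repartition($"key") shuffles so that all rows for a key land in one
//     task, typically leaving one file per partition directory.
recordsDF.repartition($"key").write.mode(SaveMode.Overwrite)
  .partitionBy("key").parquet(externalPath)

// 3b. coalesce(n) merges partitions without a full shuffle, so each
//     partition directory gets at most n files.
recordsDF.coalesce(10).write.mode(SaveMode.Overwrite)
  .partitionBy("key").parquet(externalPath)
```

Roughly, repartition is the choice when the data should be redistributed by the partition column first (at the cost of a shuffle), while coalesce only caps the number of write tasks.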
--- .../examples/sql/hive/SparkHiveExample.scala | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala index e5f75d53edc86..51df5dd8e3600 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala @@ -19,8 +19,7 @@ package org.apache.spark.examples.sql.hive // $example on:spark_hive$ import java.io.File -import org.apache.spark.sql.Row -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Row, SaveMode, SparkSession} // $example off:spark_hive$ object SparkHiveExample { @@ -102,8 +101,41 @@ object SparkHiveExample { // | 4| val_4| 4| val_4| // | 5| val_5| 5| val_5| // ... - // $example off:spark_hive$ + // Create Hive managed table with Parquet + sql("CREATE TABLE records(key int, value string) STORED AS PARQUET") + // Save DataFrame to Hive managed table as Parquet format + val hiveTableDF = sql("SELECT * FROM records") + hiveTableDF.write.mode(SaveMode.Overwrite).saveAsTable("database_name.records") + // Create External Hive table with Parquet + sql("CREATE EXTERNAL TABLE records(key int, value string) " + + "STORED AS PARQUET LOCATION '/user/hive/warehouse/'") + // to make Hive Parquet format compatible with Spark Parquet format + spark.sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", "true") + + // Multiple Parquet files could be created accordingly to volume of data under directory given. + val hiveExternalTableLocation = "/user/hive/warehouse/database_name.db/records" + + // Save DataFrame to Hive External table as compatible Parquet format + hiveTableDF.write.mode(SaveMode.Overwrite).parquet(hiveExternalTableLocation) + + // Turn on flag for Dynamic Partitioning + spark.sqlContext.setConf("hive.exec.dynamic.partition", "true") + spark.sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict") + + // You can create partitions in Hive table, so downstream queries run much faster. + hiveTableDF.write.mode(SaveMode.Overwrite).partitionBy("key") + .parquet(hiveExternalTableLocation) + + // Reduce number of files for each partition by repartition + hiveTableDF.repartition($"key").write.mode(SaveMode.Overwrite) + .partitionBy("key").parquet(hiveExternalTableLocation) + + // Control the number of files in each partition by coalesce + hiveTableDF.coalesce(10).write.mode(SaveMode.Overwrite) + .partitionBy("key").parquet(hiveExternalTableLocation) + // $example off:spark_hive$ + spark.stop() } } From ea2642eb0ebcf02e8fba727a82c140c9f2284725 Mon Sep 17 00:00:00 2001 From: CNRui <13266776177@163.com> Date: Sat, 23 Dec 2017 08:18:08 -0600 Subject: [PATCH 182/356] [SPARK-20694][EXAMPLES] Update SQLDataSourceExample.scala ## What changes were proposed in this pull request? Create table using the right DataFrame. peopleDF->usersDF peopleDF: +----+-------+ | age| name| +----+-------+ usersDF: +------+--------------+----------------+ | name|favorite_color|favorite_numbers| +------+--------------+----------------+ ## How was this patch tested? Manually tested. Author: CNRui <13266776177@163.com> Closes #20052 from CNRui/patch-2. 
--- .../apache/spark/examples/sql/SQLDataSourceExample.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala index f9477969a4bb5..7d83aacb11548 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala @@ -67,15 +67,15 @@ object SQLDataSourceExample { usersDF.write.partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet") // $example off:write_partitioning$ // $example on:write_partition_and_bucket$ - peopleDF + usersDF .write .partitionBy("favorite_color") .bucketBy(42, "name") - .saveAsTable("people_partitioned_bucketed") + .saveAsTable("users_partitioned_bucketed") // $example off:write_partition_and_bucket$ spark.sql("DROP TABLE IF EXISTS people_bucketed") - spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed") + spark.sql("DROP TABLE IF EXISTS users_partitioned_bucketed") } private def runBasicParquetExample(spark: SparkSession): Unit = { From f6084a88f0fe69111df8a016bc81c9884d3d3402 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 24 Dec 2017 01:16:12 +0900 Subject: [PATCH 183/356] [HOTFIX] Fix Scala style checks ## What changes were proposed in this pull request? This PR fixes a style that broke the build. ## How was this patch tested? Manually tested. Author: hyukjinkwon Closes #20065 from HyukjinKwon/minor-style. --- .../org/apache/spark/examples/sql/hive/SparkHiveExample.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala index 51df5dd8e3600..b193bd595127c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala @@ -135,7 +135,7 @@ object SparkHiveExample { hiveTableDF.coalesce(10).write.mode(SaveMode.Overwrite) .partitionBy("key").parquet(hiveExternalTableLocation) // $example off:spark_hive$ - + spark.stop() } } From aeb45df668a97a2d48cfd4079ed62601390979ba Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 24 Dec 2017 01:18:11 +0900 Subject: [PATCH 184/356] [SPARK-22844][R] Adds date_trunc in R API ## What changes were proposed in this pull request? This PR adds `date_trunc` in R API as below: ```r > df <- createDataFrame(list(list(a = as.POSIXlt("2012-12-13 12:34:00")))) > head(select(df, date_trunc("hour", df$a))) date_trunc(hour, a) 1 2012-12-13 12:00:00 ``` ## How was this patch tested? Unit tests added in `R/pkg/tests/fulltests/test_sparkSQL.R`. Author: hyukjinkwon Closes #20031 from HyukjinKwon/r-datetrunc. 
--- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 34 +++++++++++++++++++++++---- R/pkg/R/generics.R | 5 ++++ R/pkg/tests/fulltests/test_sparkSQL.R | 3 +++ 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 57838f52eac3f..dce64e1e607c8 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -230,6 +230,7 @@ exportMethods("%<=>%", "date_add", "date_format", "date_sub", + "date_trunc", "datediff", "dayofmonth", "dayofweek", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 237ef061e8071..3a96f94e269f4 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -40,10 +40,17 @@ NULL #' #' @param x Column to compute on. In \code{window}, it must be a time Column of #' \code{TimestampType}. -#' @param format For \code{to_date} and \code{to_timestamp}, it is the string to use to parse -#' Column \code{x} to DateType or TimestampType. For \code{trunc}, it is the string -#' to use to specify the truncation method. For example, "year", "yyyy", "yy" for -#' truncate by year, or "month", "mon", "mm" for truncate by month. +#' @param format The format for the given dates or timestamps in Column \code{x}. See the +#' format used in the following methods: +#' \itemize{ +#' \item \code{to_date} and \code{to_timestamp}: it is the string to use to parse +#' Column \code{x} to DateType or TimestampType. +#' \item \code{trunc}: it is the string to use to specify the truncation method. +#' For example, "year", "yyyy", "yy" for truncate by year, or "month", "mon", +#' "mm" for truncate by month. +#' \item \code{date_trunc}: it is similar with \code{trunc}'s but additionally +#' supports "day", "dd", "second", "minute", "hour", "week" and "quarter". +#' } #' @param ... additional argument(s). #' @name column_datetime_functions #' @rdname column_datetime_functions @@ -3478,3 +3485,22 @@ setMethod("trunc", x@jc, as.character(format)) column(jc) }) + +#' @details +#' \code{date_trunc}: Returns timestamp truncated to the unit specified by the format. 
+#' +#' @rdname column_datetime_functions +#' @aliases date_trunc date_trunc,character,Column-method +#' @export +#' @examples +#' +#' \dontrun{ +#' head(select(df, df$time, date_trunc("hour", df$time), date_trunc("minute", df$time), +#' date_trunc("week", df$time), date_trunc("quarter", df$time)))} +#' @note date_trunc since 2.3.0 +setMethod("date_trunc", + signature(format = "character", x = "Column"), + function(format, x) { + jc <- callJStatic("org.apache.spark.sql.functions", "date_trunc", format, x@jc) + column(jc) + }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 8fcf269087c7d..5ddaa669f9205 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1043,6 +1043,11 @@ setGeneric("date_format", function(y, x) { standardGeneric("date_format") }) #' @name NULL setGeneric("date_sub", function(y, x) { standardGeneric("date_sub") }) +#' @rdname column_datetime_functions +#' @export +#' @name NULL +setGeneric("date_trunc", function(format, x) { standardGeneric("date_trunc") }) + #' @rdname column_datetime_functions #' @export #' @name NULL diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index d87f5d2705732..6cc0188dae95f 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1418,6 +1418,8 @@ test_that("column functions", { c22 <- not(c) c23 <- trunc(c, "year") + trunc(c, "yyyy") + trunc(c, "yy") + trunc(c, "month") + trunc(c, "mon") + trunc(c, "mm") + c24 <- date_trunc("hour", c) + date_trunc("minute", c) + date_trunc("week", c) + + date_trunc("quarter", c) # Test if base::is.nan() is exposed expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE)) @@ -1729,6 +1731,7 @@ test_that("date functions on a DataFrame", { expect_gt(collect(select(df2, unix_timestamp()))[1, 1], 0) expect_gt(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0) expect_gt(collect(select(df2, unix_timestamp(lit("2015-01-01"), "yyyy-MM-dd")))[1, 1], 0) + expect_equal(collect(select(df2, month(date_trunc("yyyy", df2$b))))[, 1], c(1, 1)) l3 <- list(list(a = 1000), list(a = -1000)) df3 <- createDataFrame(l3) From 1219d7a4343e837749c56492d382b3c814b97271 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Sat, 23 Dec 2017 10:27:14 -0800 Subject: [PATCH 185/356] [SPARK-22889][SPARKR] Set overwrite=T when install SparkR in tests ## What changes were proposed in this pull request? Since all CRAN checks go through the same machine, if there is an older partial download or partial install of Spark left behind the tests fail. This PR overwrites the install files when running tests. This shouldn't affect Jenkins as `SPARK_HOME` is set when running Jenkins tests. ## How was this patch tested? Test manually by running `R CMD check --as-cran` Author: Shivaram Venkataraman Closes #20060 from shivaram/sparkr-overwrite-cran. --- R/pkg/tests/run-all.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R index 63812ba70bb50..94d75188fb948 100644 --- a/R/pkg/tests/run-all.R +++ b/R/pkg/tests/run-all.R @@ -27,7 +27,10 @@ if (.Platform$OS.type == "windows") { # Setup global test environment # Install Spark first to set SPARK_HOME -install.spark() + +# NOTE(shivaram): We set overwrite to handle any old tar.gz files or directories left behind on +# CRAN machines. For Jenkins we should already have SPARK_HOME set. 
+install.spark(overwrite = TRUE) sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R") sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db") From 0bf1a74a773c79e66a67055298a36af477b9e21a Mon Sep 17 00:00:00 2001 From: sujithjay Date: Sun, 24 Dec 2017 11:14:30 -0800 Subject: [PATCH 186/356] [SPARK-22465][CORE] Add a safety-check to RDD defaultPartitioner ## What changes were proposed in this pull request? In choosing a Partitioner to use for a cogroup-like operation between a number of RDDs, the default behaviour was if some of the RDDs already have a partitioner, we choose the one amongst them with the maximum number of partitions. This behaviour, in some cases, could hit the 2G limit (SPARK-6235). To illustrate one such scenario, consider two RDDs: rDD1: with smaller data and smaller number of partitions, alongwith a Partitioner. rDD2: with much larger data and a larger number of partitions, without a Partitioner. The cogroup of these two RDDs could hit the 2G limit, as a larger amount of data is shuffled into a smaller number of partitions. This PR introduces a safety-check wherein the Partitioner is chosen only if either of the following conditions are met: 1. if the number of partitions of the RDD associated with the Partitioner is greater than or equal to the max number of upstream partitions; or 2. if the number of partitions of the RDD associated with the Partitioner is less than and within a single order of magnitude of the max number of upstream partitions. ## How was this patch tested? Unit tests in PartitioningSuite and PairRDDFunctionsSuite Author: sujithjay Closes #20002 from sujithjay/SPARK-22465. --- .../scala/org/apache/spark/Partitioner.scala | 32 +++++++++++++++++-- .../org/apache/spark/PartitioningSuite.scala | 27 ++++++++++++++++ .../spark/rdd/PairRDDFunctionsSuite.scala | 22 +++++++++++++ 3 files changed, 78 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Partitioner.scala b/core/src/main/scala/org/apache/spark/Partitioner.scala index debbd8d7c26c9..437bbaae1968b 100644 --- a/core/src/main/scala/org/apache/spark/Partitioner.scala +++ b/core/src/main/scala/org/apache/spark/Partitioner.scala @@ -21,6 +21,7 @@ import java.io.{IOException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable import scala.collection.mutable.ArrayBuffer +import scala.math.log10 import scala.reflect.ClassTag import scala.util.hashing.byteswap32 @@ -42,7 +43,9 @@ object Partitioner { /** * Choose a partitioner to use for a cogroup-like operation between a number of RDDs. * - * If any of the RDDs already has a partitioner, choose that one. + * If any of the RDDs already has a partitioner, and the number of partitions of the + * partitioner is either greater than or is less than and within a single order of + * magnitude of the max number of upstream partitions, choose that one. * * Otherwise, we use a default HashPartitioner. 
For the number of partitions, if * spark.default.parallelism is set, then we'll use the value from SparkContext @@ -57,8 +60,15 @@ object Partitioner { def defaultPartitioner(rdd: RDD[_], others: RDD[_]*): Partitioner = { val rdds = (Seq(rdd) ++ others) val hasPartitioner = rdds.filter(_.partitioner.exists(_.numPartitions > 0)) - if (hasPartitioner.nonEmpty) { - hasPartitioner.maxBy(_.partitions.length).partitioner.get + + val hasMaxPartitioner: Option[RDD[_]] = if (hasPartitioner.nonEmpty) { + Some(hasPartitioner.maxBy(_.partitions.length)) + } else { + None + } + + if (isEligiblePartitioner(hasMaxPartitioner, rdds)) { + hasMaxPartitioner.get.partitioner.get } else { if (rdd.context.conf.contains("spark.default.parallelism")) { new HashPartitioner(rdd.context.defaultParallelism) @@ -67,6 +77,22 @@ object Partitioner { } } } + + /** + * Returns true if the number of partitions of the RDD is either greater + * than or is less than and within a single order of magnitude of the + * max number of upstream partitions; + * otherwise, returns false + */ + private def isEligiblePartitioner( + hasMaxPartitioner: Option[RDD[_]], + rdds: Seq[RDD[_]]): Boolean = { + if (hasMaxPartitioner.isEmpty) { + return false + } + val maxPartitions = rdds.map(_.partitions.length).max + log10(maxPartitions) - log10(hasMaxPartitioner.get.getNumPartitions) < 1 + } } /** diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala index dfe4c25670ce0..155ca17db726b 100644 --- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala +++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala @@ -259,6 +259,33 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva val partitioner = new RangePartitioner(22, rdd) assert(partitioner.numPartitions === 3) } + + test("defaultPartitioner") { + val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 150) + val rdd2 = sc + .parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + .partitionBy(new HashPartitioner(10)) + val rdd3 = sc + .parallelize(Array((1, 6), (7, 8), (3, 10), (5, 12), (13, 14))) + .partitionBy(new HashPartitioner(100)) + val rdd4 = sc + .parallelize(Array((1, 2), (2, 3), (2, 4), (3, 4))) + .partitionBy(new HashPartitioner(9)) + val rdd5 = sc.parallelize((1 to 10).map(x => (x, x)), 11) + + val partitioner1 = Partitioner.defaultPartitioner(rdd1, rdd2) + val partitioner2 = Partitioner.defaultPartitioner(rdd2, rdd3) + val partitioner3 = Partitioner.defaultPartitioner(rdd3, rdd1) + val partitioner4 = Partitioner.defaultPartitioner(rdd1, rdd2, rdd3) + val partitioner5 = Partitioner.defaultPartitioner(rdd4, rdd5) + + assert(partitioner1.numPartitions == rdd1.getNumPartitions) + assert(partitioner2.numPartitions == rdd3.getNumPartitions) + assert(partitioner3.numPartitions == rdd3.getNumPartitions) + assert(partitioner4.numPartitions == rdd3.getNumPartitions) + assert(partitioner5.numPartitions == rdd4.getNumPartitions) + + } } diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 65d35264dc108..a39e0469272fe 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -310,6 +310,28 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { assert(joined.size > 0) } + // See SPARK-22465 + test("cogroup between multiple RDD " + + "with 
an order of magnitude difference in number of partitions") { + val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 1000) + val rdd2 = sc + .parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) + .partitionBy(new HashPartitioner(10)) + val joined = rdd1.cogroup(rdd2) + assert(joined.getNumPartitions == rdd1.getNumPartitions) + } + + // See SPARK-22465 + test("cogroup between multiple RDD" + + " with number of partitions similar in order of magnitude") { + val rdd1 = sc.parallelize((1 to 1000).map(x => (x, x)), 20) + val rdd2 = sc + .parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) + .partitionBy(new HashPartitioner(10)) + val joined = rdd1.cogroup(rdd2) + assert(joined.getNumPartitions == rdd2.getNumPartitions) + } + test("rightOuterJoin") { val rdd1 = sc.parallelize(Array((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Array((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) From fba03133d1369ce8cfebd6ae68b987a95f1b7ea1 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Sun, 24 Dec 2017 22:57:53 -0800 Subject: [PATCH 187/356] [SPARK-22707][ML] Optimize CrossValidator memory occupation by models in fitting ## What changes were proposed in this pull request? Via some test I found CrossValidator still exists memory issue, it will still occupy `O(n*sizeof(model))` memory for holding models when fitting, if well optimized, it should be `O(parallelism*sizeof(model))` This is because modelFutures will hold the reference to model object after future is complete (we can use `future.value.get.get` to fetch it), and the `Future.sequence` and the `modelFutures` array holds references to each model future. So all model object are keep referenced. So it will still occupy `O(n*sizeof(model))` memory. I fix this by merging the `modelFuture` and `foldMetricFuture` together, and use `atomicInteger` to statistic complete fitting tasks and when all done, trigger `trainingDataset.unpersist`. I ever commented this issue on the old PR [SPARK-19357] https://github.com/apache/spark/pull/16774#pullrequestreview-53674264 unfortunately, at that time I do not realize that the issue still exists, but now I confirm it and create this PR to fix it. ## Discussion I give 3 approaches which we can compare, after discussion I realized none of them is ideal, we have to make a trade-off. **After discussion with jkbradley , choose approach 3** ### Approach 1 ~~The approach proposed by MrBago at~~ https://github.com/apache/spark/pull/19904#discussion_r156751569 ~~This approach resolve the model objects referenced issue, allow the model objects to be GCed in time. **BUT, in some cases, it still do not resolve the O(N) model memory occupation issue**. Let me use an extreme case to describe it:~~ ~~suppose we set `parallelism = 1`, and there're 100 paramMaps. So we have 100 fitting & evaluation tasks. In this approach, because of `parallelism = 1`, the code have to wait 100 fitting tasks complete, **(at this time the memory occupation by models already reach 100 * sizeof(model) )** and then it will unpersist training dataset and then do 100 evaluation tasks.~~ ### Approach 2 ~~This approach is my PR old version code~~ https://github.com/apache/spark/pull/19904/commits/2cc7c28f385009570536690d686f2843485942b2 ~~This approach can make sure at any case, the peak memory occupation by models to be `O(numParallelism * sizeof(model))`, but, it exists an issue that, in some extreme case, the "unpersist training dataset" will be delayed until most of the evaluation tasks complete. 
Suppose the case `parallelism = 1`, and there're 100 fitting & evaluation tasks, each fitting&evaluation task have to be executed one by one, so only after the first 99 fitting&evaluation tasks and the 100th fitting task complete, the "unpersist training dataset" will be triggered.~~ ### Approach 3 After I compared approach 1 and approach 2, I realized that, in the case which parallelism is low but there're many fitting & evaluation tasks, we cannot achieve both of the following two goals: - Make the peak memory occupation by models(driver-side) to be O(parallelism * sizeof(model)) - unpersist training dataset before most of the evaluation tasks started. So I vote for a simpler approach, move the unpersist training dataset to the end (Does this really matters ?) Because the goal 1 is more important, we must make sure the peak memory occupation by models (driver-side) to be O(parallelism * sizeof(model)), otherwise it will bring high risk of OOM. Like following code: ``` val foldMetricFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) => Future[Double] { val model = est.fit(trainingDataset, paramMap).asInstanceOf[Model[_]] //...other minor codes val metric = eval.evaluate(model.transform(validationDataset, paramMap)) logDebug(s"Got metric metricformodeltrainedwithparamMap.") metric } (executionContext) } val foldMetrics = foldMetricFutures.map(ThreadUtils.awaitResult(_, Duration.Inf)) trainingDataset.unpersist() // <------- unpersist at the end validationDataset.unpersist() ``` ## How was this patch tested? N/A Author: WeichenXu Closes #19904 from WeichenXu123/fix_cross_validator_memory_issue. --- .../apache/spark/ml/tuning/CrossValidator.scala | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 1682ca91bf832..0130b3e255f0d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -147,24 +147,12 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) logDebug(s"Train split $splitIndex with multiple sets of parameters.") // Fit models in a Future for training in parallel - val modelFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) => - Future[Model[_]] { + val foldMetricFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) => + Future[Double] { val model = est.fit(trainingDataset, paramMap).asInstanceOf[Model[_]] - if (collectSubModelsParam) { subModels.get(splitIndex)(paramIndex) = model } - model - } (executionContext) - } - - // Unpersist training data only when all models have trained - Future.sequence[Model[_], Iterable](modelFutures)(implicitly, executionContext) - .onComplete { _ => trainingDataset.unpersist() } (executionContext) - - // Evaluate models in a Future that will calulate a metric and allow model to be cleaned up - val foldMetricFutures = modelFutures.zip(epm).map { case (modelFuture, paramMap) => - modelFuture.map { model => // TODO: duplicate evaluator to take extra params from input val metric = eval.evaluate(model.transform(validationDataset, paramMap)) logDebug(s"Got metric $metric for model trained with $paramMap.") @@ -174,6 +162,7 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) // Wait for metrics to be calculated before unpersisting validation dataset val foldMetrics = 
foldMetricFutures.map(ThreadUtils.awaitResult(_, Duration.Inf)) + trainingDataset.unpersist() validationDataset.unpersist() foldMetrics }.transpose.map(_.sum / $(numFolds)) // Calculate average metric over all splits From 33ae2437ba4e634b510b0f96e914ad1ef4ccafd8 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 25 Dec 2017 01:14:09 -0800 Subject: [PATCH 188/356] [SPARK-22893][SQL] Unified the data type mismatch message ## What changes were proposed in this pull request? We should use `dataType.simpleString` to unified the data type mismatch message: Before: ``` spark-sql> select cast(1 as binary); Error in query: cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7; ``` After: ``` park-sql> select cast(1 as binary); Error in query: cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7; ``` ## How was this patch tested? Exist test. Author: Yuming Wang Closes #20064 from wangyum/SPARK-22893. --- .../spark/sql/catalyst/expressions/Cast.scala | 2 +- .../aggregate/ApproximatePercentile.scala | 4 +- .../expressions/conditionalExpressions.scala | 3 +- .../sql/catalyst/expressions/generators.scala | 10 +- .../sql/catalyst/expressions/predicates.scala | 2 +- .../expressions/windowExpressions.scala | 8 +- .../native/binaryComparison.sql.out | 48 +-- .../typeCoercion/native/inConversion.sql.out | 280 +++++++++--------- .../native/windowFrameCoercion.sql.out | 8 +- .../sql-tests/results/window.sql.out | 4 +- .../org/apache/spark/sql/DatasetSuite.scala | 2 +- .../spark/sql/GeneratorFunctionSuite.scala | 4 +- 12 files changed, 189 insertions(+), 186 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 5279d41278967..274d8813f16db 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -181,7 +181,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String TypeCheckResult.TypeCheckSuccess } else { TypeCheckResult.TypeCheckFailure( - s"cannot cast ${child.dataType} to $dataType") + s"cannot cast ${child.dataType.simpleString} to ${dataType.simpleString}") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 7facb9dad9a76..149ac265e6ed5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -132,7 +132,7 @@ case class ApproximatePercentile( case TimestampType => value.asInstanceOf[Long].toDouble case n: NumericType => n.numeric.toDouble(value.asInstanceOf[n.InternalType]) case other: DataType => - throw new UnsupportedOperationException(s"Unexpected data type $other") + throw new UnsupportedOperationException(s"Unexpected data type ${other.simpleString}") } buffer.add(doubleValue) } @@ -157,7 +157,7 @@ case class ApproximatePercentile( case DoubleType => doubleResult case _: DecimalType => doubleResult.map(Decimal(_)) case other: DataType => - throw new UnsupportedOperationException(s"Unexpected data type $other") + throw new 
UnsupportedOperationException(s"Unexpected data type ${other.simpleString}") } if (result.length == 0) { null diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala index 142dfb02be0a8..b444c3a7be92a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala @@ -40,7 +40,8 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi override def checkInputDataTypes(): TypeCheckResult = { if (predicate.dataType != BooleanType) { TypeCheckResult.TypeCheckFailure( - s"type of predicate expression in If should be boolean, not ${predicate.dataType}") + "type of predicate expression in If should be boolean, " + + s"not ${predicate.dataType.simpleString}") } else if (!trueValue.dataType.sameType(falseValue.dataType)) { TypeCheckResult.TypeCheckFailure(s"differing types in '$sql' " + s"(${trueValue.dataType.simpleString} and ${falseValue.dataType.simpleString}).") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala index 69af7a250a5ac..4f4d49166e88c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala @@ -155,8 +155,8 @@ case class Stack(children: Seq[Expression]) extends Generator { val j = (i - 1) % numFields if (children(i).dataType != elementSchema.fields(j).dataType) { return TypeCheckResult.TypeCheckFailure( - s"Argument ${j + 1} (${elementSchema.fields(j).dataType}) != " + - s"Argument $i (${children(i).dataType})") + s"Argument ${j + 1} (${elementSchema.fields(j).dataType.simpleString}) != " + + s"Argument $i (${children(i).dataType.simpleString})") } } TypeCheckResult.TypeCheckSuccess @@ -249,7 +249,8 @@ abstract class ExplodeBase extends UnaryExpression with CollectionGenerator with TypeCheckResult.TypeCheckSuccess case _ => TypeCheckResult.TypeCheckFailure( - s"input to function explode should be array or map type, not ${child.dataType}") + "input to function explode should be array or map type, " + + s"not ${child.dataType.simpleString}") } // hive-compatible default alias for explode function ("col" for array, "key", "value" for map) @@ -378,7 +379,8 @@ case class Inline(child: Expression) extends UnaryExpression with CollectionGene TypeCheckResult.TypeCheckSuccess case _ => TypeCheckResult.TypeCheckFailure( - s"input to function $prettyName should be array of struct type, not ${child.dataType}") + s"input to function $prettyName should be array of struct type, " + + s"not ${child.dataType.simpleString}") } override def elementSchema: StructType = child.dataType match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala index f4ee3d10f3f43..b469f5cb7586a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala @@ -195,7 +195,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { } case 
_ => TypeCheckResult.TypeCheckFailure(s"Arguments must be same type but were: " + - s"${value.dataType} != ${mismatchOpt.get.dataType}") + s"${value.dataType.simpleString} != ${mismatchOpt.get.dataType.simpleString}") } } else { TypeUtils.checkForOrderingExpr(value.dataType, s"function $prettyName") diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala index 220cc4f885d7d..dd13d9a3bba51 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/windowExpressions.scala @@ -70,9 +70,9 @@ case class WindowSpecDefinition( case f: SpecifiedWindowFrame if f.frameType == RangeFrame && f.isValueBound && !isValidFrameType(f.valueBoundary.head.dataType) => TypeCheckFailure( - s"The data type '${orderSpec.head.dataType}' used in the order specification does " + - s"not match the data type '${f.valueBoundary.head.dataType}' which is used in the " + - "range frame.") + s"The data type '${orderSpec.head.dataType.simpleString}' used in the order " + + "specification does not match the data type " + + s"'${f.valueBoundary.head.dataType.simpleString}' which is used in the range frame.") case _ => TypeCheckSuccess } } @@ -251,7 +251,7 @@ case class SpecifiedWindowFrame( TypeCheckFailure(s"Window frame $location bound '$e' is not a literal.") case e: Expression if !frameType.inputType.acceptsType(e.dataType) => TypeCheckFailure( - s"The data type of the $location bound '${e.dataType}' does not match " + + s"The data type of the $location bound '${e.dataType.simpleString}' does not match " + s"the expected data type '${frameType.inputType.simpleString}'.") case _ => TypeCheckSuccess } diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out index fe7bde040707c..2914d6015ea88 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/binaryComparison.sql.out @@ -16,7 +16,7 @@ SELECT cast(1 as binary) = '1' FROM t struct<> -- !query 1 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 2 @@ -25,7 +25,7 @@ SELECT cast(1 as binary) > '2' FROM t struct<> -- !query 2 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 3 @@ -34,7 +34,7 @@ SELECT cast(1 as binary) >= '2' FROM t struct<> -- !query 3 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 4 @@ -43,7 +43,7 @@ SELECT cast(1 as binary) < '2' FROM t struct<> -- !query 4 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot 
cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 5 @@ -52,7 +52,7 @@ SELECT cast(1 as binary) <= '2' FROM t struct<> -- !query 5 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 6 @@ -61,7 +61,7 @@ SELECT cast(1 as binary) <> '2' FROM t struct<> -- !query 6 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 7 @@ -70,7 +70,7 @@ SELECT cast(1 as binary) = cast(null as string) FROM t struct<> -- !query 7 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 8 @@ -79,7 +79,7 @@ SELECT cast(1 as binary) > cast(null as string) FROM t struct<> -- !query 8 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 9 @@ -88,7 +88,7 @@ SELECT cast(1 as binary) >= cast(null as string) FROM t struct<> -- !query 9 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 10 @@ -97,7 +97,7 @@ SELECT cast(1 as binary) < cast(null as string) FROM t struct<> -- !query 10 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 11 @@ -106,7 +106,7 @@ SELECT cast(1 as binary) <= cast(null as string) FROM t struct<> -- !query 11 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 12 @@ -115,7 +115,7 @@ SELECT cast(1 as binary) <> cast(null as string) FROM t struct<> -- !query 12 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 7 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 7 -- !query 13 @@ -124,7 +124,7 @@ SELECT '1' = cast(1 as binary) FROM t struct<> -- !query 13 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 13 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 13 -- !query 14 @@ -133,7 +133,7 @@ SELECT '2' > cast(1 as binary) FROM t struct<> -- !query 14 output 
org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 13 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 13 -- !query 15 @@ -142,7 +142,7 @@ SELECT '2' >= cast(1 as binary) FROM t struct<> -- !query 15 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 14 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 14 -- !query 16 @@ -151,7 +151,7 @@ SELECT '2' < cast(1 as binary) FROM t struct<> -- !query 16 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 13 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 13 -- !query 17 @@ -160,7 +160,7 @@ SELECT '2' <= cast(1 as binary) FROM t struct<> -- !query 17 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 14 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 14 -- !query 18 @@ -169,7 +169,7 @@ SELECT '2' <> cast(1 as binary) FROM t struct<> -- !query 18 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 14 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 14 -- !query 19 @@ -178,7 +178,7 @@ SELECT cast(null as string) = cast(1 as binary) FROM t struct<> -- !query 19 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 30 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 30 -- !query 20 @@ -187,7 +187,7 @@ SELECT cast(null as string) > cast(1 as binary) FROM t struct<> -- !query 20 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 30 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 30 -- !query 21 @@ -196,7 +196,7 @@ SELECT cast(null as string) >= cast(1 as binary) FROM t struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 31 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 31 -- !query 22 @@ -205,7 +205,7 @@ SELECT cast(null as string) < cast(1 as binary) FROM t struct<> -- !query 22 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 30 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 30 -- !query 23 @@ -214,7 +214,7 @@ SELECT cast(null as string) <= cast(1 as binary) FROM t struct<> -- !query 23 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 31 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int 
to binary; line 1 pos 31 -- !query 24 @@ -223,7 +223,7 @@ SELECT cast(null as string) <> cast(1 as binary) FROM t struct<> -- !query 24 output org.apache.spark.sql.AnalysisException -cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast IntegerType to BinaryType; line 1 pos 31 +cannot resolve 'CAST(1 AS BINARY)' due to data type mismatch: cannot cast int to binary; line 1 pos 31 -- !query 25 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out index bf8ddee89b798..875ccc1341ec4 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/inConversion.sql.out @@ -80,7 +80,7 @@ SELECT cast(1 as tinyint) in (cast('1' as binary)) FROM t struct<> -- !query 9 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS TINYINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: ByteType != BinaryType; line 1 pos 26 +cannot resolve '(CAST(1 AS TINYINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: tinyint != binary; line 1 pos 26 -- !query 10 @@ -89,7 +89,7 @@ SELECT cast(1 as tinyint) in (cast(1 as boolean)) FROM t struct<> -- !query 10 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: ByteType != BooleanType; line 1 pos 26 +cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: tinyint != boolean; line 1 pos 26 -- !query 11 @@ -98,7 +98,7 @@ SELECT cast(1 as tinyint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t struct<> -- !query 11 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS TINYINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: ByteType != TimestampType; line 1 pos 26 +cannot resolve '(CAST(1 AS TINYINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: tinyint != timestamp; line 1 pos 26 -- !query 12 @@ -107,7 +107,7 @@ SELECT cast(1 as tinyint) in (cast('2017-12-11 09:30:00' as date)) FROM t struct<> -- !query 12 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS TINYINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: ByteType != DateType; line 1 pos 26 +cannot resolve '(CAST(1 AS TINYINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: tinyint != date; line 1 pos 26 -- !query 13 @@ -180,7 +180,7 @@ SELECT cast(1 as smallint) in (cast('1' as binary)) FROM t struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: ShortType != BinaryType; line 1 pos 27 +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: smallint != binary; line 1 pos 27 -- !query 22 @@ -189,7 +189,7 @@ SELECT cast(1 as smallint) in (cast(1 as boolean)) FROM t struct<> -- !query 22 output org.apache.spark.sql.AnalysisException -cannot resolve 
'(CAST(1 AS SMALLINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: ShortType != BooleanType; line 1 pos 27 +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: smallint != boolean; line 1 pos 27 -- !query 23 @@ -198,7 +198,7 @@ SELECT cast(1 as smallint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM struct<> -- !query 23 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: ShortType != TimestampType; line 1 pos 27 +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: smallint != timestamp; line 1 pos 27 -- !query 24 @@ -207,7 +207,7 @@ SELECT cast(1 as smallint) in (cast('2017-12-11 09:30:00' as date)) FROM t struct<> -- !query 24 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: ShortType != DateType; line 1 pos 27 +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: smallint != date; line 1 pos 27 -- !query 25 @@ -280,7 +280,7 @@ SELECT cast(1 as int) in (cast('1' as binary)) FROM t struct<> -- !query 33 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS INT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: IntegerType != BinaryType; line 1 pos 22 +cannot resolve '(CAST(1 AS INT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: int != binary; line 1 pos 22 -- !query 34 @@ -289,7 +289,7 @@ SELECT cast(1 as int) in (cast(1 as boolean)) FROM t struct<> -- !query 34 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: IntegerType != BooleanType; line 1 pos 22 +cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: int != boolean; line 1 pos 22 -- !query 35 @@ -298,7 +298,7 @@ SELECT cast(1 as int) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t struct<> -- !query 35 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS INT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: IntegerType != TimestampType; line 1 pos 22 +cannot resolve '(CAST(1 AS INT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: int != timestamp; line 1 pos 22 -- !query 36 @@ -307,7 +307,7 @@ SELECT cast(1 as int) in (cast('2017-12-11 09:30:00' as date)) FROM t struct<> -- !query 36 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS INT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: IntegerType != DateType; line 1 pos 22 +cannot resolve '(CAST(1 AS INT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: int != date; line 1 pos 22 -- !query 37 @@ -380,7 +380,7 @@ SELECT cast(1 as bigint) in (cast('1' as binary)) FROM t struct<> -- !query 45 
output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BIGINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: LongType != BinaryType; line 1 pos 25 +cannot resolve '(CAST(1 AS BIGINT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: bigint != binary; line 1 pos 25 -- !query 46 @@ -389,7 +389,7 @@ SELECT cast(1 as bigint) in (cast(1 as boolean)) FROM t struct<> -- !query 46 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: LongType != BooleanType; line 1 pos 25 +cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: bigint != boolean; line 1 pos 25 -- !query 47 @@ -398,7 +398,7 @@ SELECT cast(1 as bigint) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t struct<> -- !query 47 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BIGINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: LongType != TimestampType; line 1 pos 25 +cannot resolve '(CAST(1 AS BIGINT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: bigint != timestamp; line 1 pos 25 -- !query 48 @@ -407,7 +407,7 @@ SELECT cast(1 as bigint) in (cast('2017-12-11 09:30:00' as date)) FROM t struct<> -- !query 48 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BIGINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: LongType != DateType; line 1 pos 25 +cannot resolve '(CAST(1 AS BIGINT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: bigint != date; line 1 pos 25 -- !query 49 @@ -480,7 +480,7 @@ SELECT cast(1 as float) in (cast('1' as binary)) FROM t struct<> -- !query 57 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS FLOAT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: FloatType != BinaryType; line 1 pos 24 +cannot resolve '(CAST(1 AS FLOAT) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: float != binary; line 1 pos 24 -- !query 58 @@ -489,7 +489,7 @@ SELECT cast(1 as float) in (cast(1 as boolean)) FROM t struct<> -- !query 58 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: FloatType != BooleanType; line 1 pos 24 +cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: float != boolean; line 1 pos 24 -- !query 59 @@ -498,7 +498,7 @@ SELECT cast(1 as float) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t struct<> -- !query 59 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS FLOAT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: FloatType != TimestampType; line 1 pos 24 +cannot resolve '(CAST(1 AS FLOAT) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: float != timestamp; line 1 pos 24 -- !query 60 @@ -507,7 +507,7 @@ SELECT cast(1 as float) in (cast('2017-12-11 09:30:00' as 
date)) FROM t struct<> -- !query 60 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS FLOAT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: FloatType != DateType; line 1 pos 24 +cannot resolve '(CAST(1 AS FLOAT) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: float != date; line 1 pos 24 -- !query 61 @@ -580,7 +580,7 @@ SELECT cast(1 as double) in (cast('1' as binary)) FROM t struct<> -- !query 69 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DoubleType != BinaryType; line 1 pos 25 +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: double != binary; line 1 pos 25 -- !query 70 @@ -589,7 +589,7 @@ SELECT cast(1 as double) in (cast(1 as boolean)) FROM t struct<> -- !query 70 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DoubleType != BooleanType; line 1 pos 25 +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: double != boolean; line 1 pos 25 -- !query 71 @@ -598,7 +598,7 @@ SELECT cast(1 as double) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t struct<> -- !query 71 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: DoubleType != TimestampType; line 1 pos 25 +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: double != timestamp; line 1 pos 25 -- !query 72 @@ -607,7 +607,7 @@ SELECT cast(1 as double) in (cast('2017-12-11 09:30:00' as date)) FROM t struct<> -- !query 72 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: DoubleType != DateType; line 1 pos 25 +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: double != date; line 1 pos 25 -- !query 73 @@ -680,7 +680,7 @@ SELECT cast(1 as decimal(10, 0)) in (cast('1' as binary)) FROM t struct<> -- !query 81 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != BinaryType; line 1 pos 33 +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != binary; line 1 pos 33 -- !query 82 @@ -689,7 +689,7 @@ SELECT cast(1 as decimal(10, 0)) in (cast(1 as boolean)) FROM t struct<> -- !query 82 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != BooleanType; line 1 pos 33 +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != boolean; line 1 pos 33 -- !query 83 @@ 
-698,7 +698,7 @@ SELECT cast(1 as decimal(10, 0)) in (cast('2017-12-11 09:30:00.0' as timestamp)) struct<> -- !query 83 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != TimestampType; line 1 pos 33 +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != timestamp; line 1 pos 33 -- !query 84 @@ -707,7 +707,7 @@ SELECT cast(1 as decimal(10, 0)) in (cast('2017-12-11 09:30:00' as date)) FROM t struct<> -- !query 84 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != DateType; line 1 pos 33 +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != date; line 1 pos 33 -- !query 85 @@ -780,7 +780,7 @@ SELECT cast(1 as string) in (cast('1' as binary)) FROM t struct<> -- !query 93 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS STRING) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: StringType != BinaryType; line 1 pos 25 +cannot resolve '(CAST(1 AS STRING) IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: string != binary; line 1 pos 25 -- !query 94 @@ -789,7 +789,7 @@ SELECT cast(1 as string) in (cast(1 as boolean)) FROM t struct<> -- !query 94 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: StringType != BooleanType; line 1 pos 25 +cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: string != boolean; line 1 pos 25 -- !query 95 @@ -814,7 +814,7 @@ SELECT cast('1' as binary) in (cast(1 as tinyint)) FROM t struct<> -- !query 97 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != ByteType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: binary != tinyint; line 1 pos 27 -- !query 98 @@ -823,7 +823,7 @@ SELECT cast('1' as binary) in (cast(1 as smallint)) FROM t struct<> -- !query 98 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != ShortType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: binary != smallint; line 1 pos 27 -- !query 99 @@ -832,7 +832,7 @@ SELECT cast('1' as binary) in (cast(1 as int)) FROM t struct<> -- !query 99 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != IntegerType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: binary != int; 
line 1 pos 27 -- !query 100 @@ -841,7 +841,7 @@ SELECT cast('1' as binary) in (cast(1 as bigint)) FROM t struct<> -- !query 100 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != LongType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: binary != bigint; line 1 pos 27 -- !query 101 @@ -850,7 +850,7 @@ SELECT cast('1' as binary) in (cast(1 as float)) FROM t struct<> -- !query 101 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != FloatType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: binary != float; line 1 pos 27 -- !query 102 @@ -859,7 +859,7 @@ SELECT cast('1' as binary) in (cast(1 as double)) FROM t struct<> -- !query 102 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: BinaryType != DoubleType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: binary != double; line 1 pos 27 -- !query 103 @@ -868,7 +868,7 @@ SELECT cast('1' as binary) in (cast(1 as decimal(10, 0))) FROM t struct<> -- !query 103 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: BinaryType != DecimalType(10,0); line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: binary != decimal(10,0); line 1 pos 27 -- !query 104 @@ -877,7 +877,7 @@ SELECT cast('1' as binary) in (cast(1 as string)) FROM t struct<> -- !query 104 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: BinaryType != StringType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: binary != string; line 1 pos 27 -- !query 105 @@ -894,7 +894,7 @@ SELECT cast('1' as binary) in (cast(1 as boolean)) FROM t struct<> -- !query 106 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: BinaryType != BooleanType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: binary != boolean; line 1 pos 27 -- !query 107 @@ -903,7 +903,7 @@ SELECT cast('1' as binary) in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM struct<> -- !query 107 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: BinaryType != TimestampType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: binary != timestamp; line 1 pos 27 -- !query 
108 @@ -912,7 +912,7 @@ SELECT cast('1' as binary) in (cast('2017-12-11 09:30:00' as date)) FROM t struct<> -- !query 108 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: BinaryType != DateType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: binary != date; line 1 pos 27 -- !query 109 @@ -921,7 +921,7 @@ SELECT true in (cast(1 as tinyint)) FROM t struct<> -- !query 109 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != ByteType; line 1 pos 12 +cannot resolve '(true IN (CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: boolean != tinyint; line 1 pos 12 -- !query 110 @@ -930,7 +930,7 @@ SELECT true in (cast(1 as smallint)) FROM t struct<> -- !query 110 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != ShortType; line 1 pos 12 +cannot resolve '(true IN (CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: boolean != smallint; line 1 pos 12 -- !query 111 @@ -939,7 +939,7 @@ SELECT true in (cast(1 as int)) FROM t struct<> -- !query 111 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != IntegerType; line 1 pos 12 +cannot resolve '(true IN (CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: boolean != int; line 1 pos 12 -- !query 112 @@ -948,7 +948,7 @@ SELECT true in (cast(1 as bigint)) FROM t struct<> -- !query 112 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != LongType; line 1 pos 12 +cannot resolve '(true IN (CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: boolean != bigint; line 1 pos 12 -- !query 113 @@ -957,7 +957,7 @@ SELECT true in (cast(1 as float)) FROM t struct<> -- !query 113 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != FloatType; line 1 pos 12 +cannot resolve '(true IN (CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: boolean != float; line 1 pos 12 -- !query 114 @@ -966,7 +966,7 @@ SELECT true in (cast(1 as double)) FROM t struct<> -- !query 114 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: BooleanType != DoubleType; line 1 pos 12 +cannot resolve '(true IN (CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: boolean != double; line 1 pos 12 -- !query 115 @@ -975,7 +975,7 @@ SELECT true in (cast(1 as decimal(10, 0))) FROM t struct<> -- !query 115 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: BooleanType != DecimalType(10,0); line 1 pos 12 +cannot resolve '(true IN (CAST(1 AS 
DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: boolean != decimal(10,0); line 1 pos 12 -- !query 116 @@ -984,7 +984,7 @@ SELECT true in (cast(1 as string)) FROM t struct<> -- !query 116 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: BooleanType != StringType; line 1 pos 12 +cannot resolve '(true IN (CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: boolean != string; line 1 pos 12 -- !query 117 @@ -993,7 +993,7 @@ SELECT true in (cast('1' as binary)) FROM t struct<> -- !query 117 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: BooleanType != BinaryType; line 1 pos 12 +cannot resolve '(true IN (CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: boolean != binary; line 1 pos 12 -- !query 118 @@ -1010,7 +1010,7 @@ SELECT true in (cast('2017-12-11 09:30:00.0' as timestamp)) FROM t struct<> -- !query 119 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: BooleanType != TimestampType; line 1 pos 12 +cannot resolve '(true IN (CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: boolean != timestamp; line 1 pos 12 -- !query 120 @@ -1019,7 +1019,7 @@ SELECT true in (cast('2017-12-11 09:30:00' as date)) FROM t struct<> -- !query 120 output org.apache.spark.sql.AnalysisException -cannot resolve '(true IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: BooleanType != DateType; line 1 pos 12 +cannot resolve '(true IN (CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: boolean != date; line 1 pos 12 -- !query 121 @@ -1028,7 +1028,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as tinyint)) FROM t struct<> -- !query 121 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != ByteType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != tinyint; line 1 pos 50 -- !query 122 @@ -1037,7 +1037,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as smallint)) FROM struct<> -- !query 122 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != ShortType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != smallint; line 1 pos 50 -- !query 123 @@ -1046,7 +1046,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as int)) FROM t struct<> -- !query 123 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS INT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != IntegerType; line 1 pos 50 
+cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS INT)))' due to data type mismatch: Arguments must be same type but were: timestamp != int; line 1 pos 50 -- !query 124 @@ -1055,7 +1055,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as bigint)) FROM t struct<> -- !query 124 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != LongType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != bigint; line 1 pos 50 -- !query 125 @@ -1064,7 +1064,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as float)) FROM t struct<> -- !query 125 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != FloatType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: timestamp != float; line 1 pos 50 -- !query 126 @@ -1073,7 +1073,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as double)) FROM t struct<> -- !query 126 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: TimestampType != DoubleType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: timestamp != double; line 1 pos 50 -- !query 127 @@ -1082,7 +1082,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as decimal(10, 0))) struct<> -- !query 127 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: TimestampType != DecimalType(10,0); line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: timestamp != decimal(10,0); line 1 pos 50 -- !query 128 @@ -1099,7 +1099,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2' as binary)) FROM struct<> -- !query 129 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: TimestampType != BinaryType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: timestamp != binary; line 1 pos 50 -- !query 130 @@ -1108,7 +1108,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast(2 as boolean)) FROM t struct<> -- !query 130 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: TimestampType != BooleanType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST(2 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type 
but were: timestamp != boolean; line 1 pos 50 -- !query 131 @@ -1133,7 +1133,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as tinyint)) FROM t struct<> -- !query 133 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: DateType != ByteType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: date != tinyint; line 1 pos 43 -- !query 134 @@ -1142,7 +1142,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as smallint)) FROM t struct<> -- !query 134 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: DateType != ShortType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: date != smallint; line 1 pos 43 -- !query 135 @@ -1151,7 +1151,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as int)) FROM t struct<> -- !query 135 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS INT)))' due to data type mismatch: Arguments must be same type but were: DateType != IntegerType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS INT)))' due to data type mismatch: Arguments must be same type but were: date != int; line 1 pos 43 -- !query 136 @@ -1160,7 +1160,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as bigint)) FROM t struct<> -- !query 136 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: DateType != LongType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: date != bigint; line 1 pos 43 -- !query 137 @@ -1169,7 +1169,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as float)) FROM t struct<> -- !query 137 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: DateType != FloatType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: date != float; line 1 pos 43 -- !query 138 @@ -1178,7 +1178,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as double)) FROM t struct<> -- !query 138 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: DateType != DoubleType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: date != double; line 1 pos 43 -- !query 139 @@ -1187,7 +1187,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as decimal(10, 0))) FROM t struct<> -- !query 139 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must 
be same type but were: DateType != DecimalType(10,0); line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: date != decimal(10,0); line 1 pos 43 -- !query 140 @@ -1204,7 +1204,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast('2' as binary)) FROM t struct<> -- !query 141 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DateType != BinaryType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: date != binary; line 1 pos 43 -- !query 142 @@ -1213,7 +1213,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast(2 as boolean)) FROM t struct<> -- !query 142 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DateType != BooleanType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST(2 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: date != boolean; line 1 pos 43 -- !query 143 @@ -1302,7 +1302,7 @@ SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('1' as binary)) FROM t struct<> -- !query 153 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: ByteType != BinaryType; line 1 pos 26 +cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: tinyint != binary; line 1 pos 26 -- !query 154 @@ -1311,7 +1311,7 @@ SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast(1 as boolean)) FROM t struct<> -- !query 154 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: ByteType != BooleanType; line 1 pos 26 +cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: tinyint != boolean; line 1 pos 26 -- !query 155 @@ -1320,7 +1320,7 @@ SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('2017-12-11 09:30:00.0' a struct<> -- !query 155 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: ByteType != TimestampType; line 1 pos 26 +cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: tinyint != timestamp; line 1 pos 26 -- !query 156 @@ -1329,7 +1329,7 @@ SELECT cast(1 as tinyint) in (cast(1 as tinyint), cast('2017-12-11 09:30:00' as struct<> -- !query 156 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: ByteType != DateType; line 1 pos 26 +cannot resolve '(CAST(1 AS TINYINT) IN (CAST(1 AS TINYINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type 
mismatch: Arguments must be same type but were: tinyint != date; line 1 pos 26 -- !query 157 @@ -1402,7 +1402,7 @@ SELECT cast(1 as smallint) in (cast(1 as smallint), cast('1' as binary)) FROM t struct<> -- !query 165 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: ShortType != BinaryType; line 1 pos 27 +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: smallint != binary; line 1 pos 27 -- !query 166 @@ -1411,7 +1411,7 @@ SELECT cast(1 as smallint) in (cast(1 as smallint), cast(1 as boolean)) FROM t struct<> -- !query 166 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: ShortType != BooleanType; line 1 pos 27 +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: smallint != boolean; line 1 pos 27 -- !query 167 @@ -1420,7 +1420,7 @@ SELECT cast(1 as smallint) in (cast(1 as smallint), cast('2017-12-11 09:30:00.0' struct<> -- !query 167 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: ShortType != TimestampType; line 1 pos 27 +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: smallint != timestamp; line 1 pos 27 -- !query 168 @@ -1429,7 +1429,7 @@ SELECT cast(1 as smallint) in (cast(1 as smallint), cast('2017-12-11 09:30:00' a struct<> -- !query 168 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: ShortType != DateType; line 1 pos 27 +cannot resolve '(CAST(1 AS SMALLINT) IN (CAST(1 AS SMALLINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: smallint != date; line 1 pos 27 -- !query 169 @@ -1502,7 +1502,7 @@ SELECT cast(1 as int) in (cast(1 as int), cast('1' as binary)) FROM t struct<> -- !query 177 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: IntegerType != BinaryType; line 1 pos 22 +cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: int != binary; line 1 pos 22 -- !query 178 @@ -1511,7 +1511,7 @@ SELECT cast(1 as int) in (cast(1 as int), cast(1 as boolean)) FROM t struct<> -- !query 178 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: IntegerType != BooleanType; line 1 pos 22 +cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: int != boolean; line 1 pos 22 -- !query 179 @@ -1520,7 +1520,7 @@ SELECT cast(1 as int) in (cast(1 as int), cast('2017-12-11 
09:30:00.0' as timest struct<> -- !query 179 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: IntegerType != TimestampType; line 1 pos 22 +cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: int != timestamp; line 1 pos 22 -- !query 180 @@ -1529,7 +1529,7 @@ SELECT cast(1 as int) in (cast(1 as int), cast('2017-12-11 09:30:00' as date)) F struct<> -- !query 180 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: IntegerType != DateType; line 1 pos 22 +cannot resolve '(CAST(1 AS INT) IN (CAST(1 AS INT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: int != date; line 1 pos 22 -- !query 181 @@ -1602,7 +1602,7 @@ SELECT cast(1 as bigint) in (cast(1 as bigint), cast('1' as binary)) FROM t struct<> -- !query 189 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: LongType != BinaryType; line 1 pos 25 +cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: bigint != binary; line 1 pos 25 -- !query 190 @@ -1611,7 +1611,7 @@ SELECT cast(1 as bigint) in (cast(1 as bigint), cast(1 as boolean)) FROM t struct<> -- !query 190 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: LongType != BooleanType; line 1 pos 25 +cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: bigint != boolean; line 1 pos 25 -- !query 191 @@ -1620,7 +1620,7 @@ SELECT cast(1 as bigint) in (cast(1 as bigint), cast('2017-12-11 09:30:00.0' as struct<> -- !query 191 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: LongType != TimestampType; line 1 pos 25 +cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: bigint != timestamp; line 1 pos 25 -- !query 192 @@ -1629,7 +1629,7 @@ SELECT cast(1 as bigint) in (cast(1 as bigint), cast('2017-12-11 09:30:00' as da struct<> -- !query 192 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: LongType != DateType; line 1 pos 25 +cannot resolve '(CAST(1 AS BIGINT) IN (CAST(1 AS BIGINT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: bigint != date; line 1 pos 25 -- !query 193 @@ -1702,7 +1702,7 @@ SELECT cast(1 as float) in (cast(1 as float), cast('1' as binary)) FROM t struct<> -- !query 201 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 
AS FLOAT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: FloatType != BinaryType; line 1 pos 24 +cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: float != binary; line 1 pos 24 -- !query 202 @@ -1711,7 +1711,7 @@ SELECT cast(1 as float) in (cast(1 as float), cast(1 as boolean)) FROM t struct<> -- !query 202 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: FloatType != BooleanType; line 1 pos 24 +cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: float != boolean; line 1 pos 24 -- !query 203 @@ -1720,7 +1720,7 @@ SELECT cast(1 as float) in (cast(1 as float), cast('2017-12-11 09:30:00.0' as ti struct<> -- !query 203 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: FloatType != TimestampType; line 1 pos 24 +cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: float != timestamp; line 1 pos 24 -- !query 204 @@ -1729,7 +1729,7 @@ SELECT cast(1 as float) in (cast(1 as float), cast('2017-12-11 09:30:00' as date struct<> -- !query 204 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: FloatType != DateType; line 1 pos 24 +cannot resolve '(CAST(1 AS FLOAT) IN (CAST(1 AS FLOAT), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: float != date; line 1 pos 24 -- !query 205 @@ -1802,7 +1802,7 @@ SELECT cast(1 as double) in (cast(1 as double), cast('1' as binary)) FROM t struct<> -- !query 213 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DoubleType != BinaryType; line 1 pos 25 +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: double != binary; line 1 pos 25 -- !query 214 @@ -1811,7 +1811,7 @@ SELECT cast(1 as double) in (cast(1 as double), cast(1 as boolean)) FROM t struct<> -- !query 214 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DoubleType != BooleanType; line 1 pos 25 +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: double != boolean; line 1 pos 25 -- !query 215 @@ -1820,7 +1820,7 @@ SELECT cast(1 as double) in (cast(1 as double), cast('2017-12-11 09:30:00.0' as struct<> -- !query 215 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: DoubleType != TimestampType; line 1 pos 25 +cannot resolve '(CAST(1 AS DOUBLE) IN 
(CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: double != timestamp; line 1 pos 25 -- !query 216 @@ -1829,7 +1829,7 @@ SELECT cast(1 as double) in (cast(1 as double), cast('2017-12-11 09:30:00' as da struct<> -- !query 216 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: DoubleType != DateType; line 1 pos 25 +cannot resolve '(CAST(1 AS DOUBLE) IN (CAST(1 AS DOUBLE), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: double != date; line 1 pos 25 -- !query 217 @@ -1902,7 +1902,7 @@ SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('1' as bina struct<> -- !query 225 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != BinaryType; line 1 pos 33 +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != binary; line 1 pos 33 -- !query 226 @@ -1911,7 +1911,7 @@ SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast(1 as boolea struct<> -- !query 226 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != BooleanType; line 1 pos 33 +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != boolean; line 1 pos 33 -- !query 227 @@ -1920,7 +1920,7 @@ SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('2017-12-11 struct<> -- !query 227 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != TimestampType; line 1 pos 33 +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != timestamp; line 1 pos 33 -- !query 228 @@ -1929,7 +1929,7 @@ SELECT cast(1 as decimal(10, 0)) in (cast(1 as decimal(10, 0)), cast('2017-12-11 struct<> -- !query 228 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: DecimalType(10,0) != DateType; line 1 pos 33 +cannot resolve '(CAST(1 AS DECIMAL(10,0)) IN (CAST(1 AS DECIMAL(10,0)), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: decimal(10,0) != date; line 1 pos 33 -- !query 229 @@ -2002,7 +2002,7 @@ SELECT cast(1 as string) in (cast(1 as string), cast('1' as binary)) FROM t struct<> -- !query 237 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS STRING), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: StringType != 
BinaryType; line 1 pos 25 +cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS STRING), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: string != binary; line 1 pos 25 -- !query 238 @@ -2011,7 +2011,7 @@ SELECT cast(1 as string) in (cast(1 as string), cast(1 as boolean)) FROM t struct<> -- !query 238 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS STRING), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: StringType != BooleanType; line 1 pos 25 +cannot resolve '(CAST(1 AS STRING) IN (CAST(1 AS STRING), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: string != boolean; line 1 pos 25 -- !query 239 @@ -2036,7 +2036,7 @@ SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as tinyint)) FROM t struct<> -- !query 241 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != ByteType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: binary != tinyint; line 1 pos 27 -- !query 242 @@ -2045,7 +2045,7 @@ SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as smallint)) FROM t struct<> -- !query 242 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != ShortType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: binary != smallint; line 1 pos 27 -- !query 243 @@ -2054,7 +2054,7 @@ SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as int)) FROM t struct<> -- !query 243 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != IntegerType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: binary != int; line 1 pos 27 -- !query 244 @@ -2063,7 +2063,7 @@ SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as bigint)) FROM t struct<> -- !query 244 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != LongType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: binary != bigint; line 1 pos 27 -- !query 245 @@ -2072,7 +2072,7 @@ SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as float)) FROM t struct<> -- !query 245 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: BinaryType != FloatType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: binary != float; line 1 pos 27 -- !query 246 @@ -2081,7 +2081,7 @@ SELECT 
cast('1' as binary) in (cast('1' as binary), cast(1 as double)) FROM t struct<> -- !query 246 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: BinaryType != DoubleType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: binary != double; line 1 pos 27 -- !query 247 @@ -2090,7 +2090,7 @@ SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as decimal(10, 0))) F struct<> -- !query 247 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: BinaryType != DecimalType(10,0); line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: binary != decimal(10,0); line 1 pos 27 -- !query 248 @@ -2099,7 +2099,7 @@ SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as string)) FROM t struct<> -- !query 248 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: BinaryType != StringType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: binary != string; line 1 pos 27 -- !query 249 @@ -2116,7 +2116,7 @@ SELECT cast('1' as binary) in (cast('1' as binary), cast(1 as boolean)) FROM t struct<> -- !query 250 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: BinaryType != BooleanType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: binary != boolean; line 1 pos 27 -- !query 251 @@ -2125,7 +2125,7 @@ SELECT cast('1' as binary) in (cast('1' as binary), cast('2017-12-11 09:30:00.0' struct<> -- !query 251 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: BinaryType != TimestampType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: binary != timestamp; line 1 pos 27 -- !query 252 @@ -2134,7 +2134,7 @@ SELECT cast('1' as binary) in (cast('1' as binary), cast('2017-12-11 09:30:00' a struct<> -- !query 252 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: BinaryType != DateType; line 1 pos 27 +cannot resolve '(CAST('1' AS BINARY) IN (CAST('1' AS BINARY), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: binary != date; line 1 pos 27 -- !query 253 @@ -2143,7 +2143,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as tinyint)) FROM t struct<> -- !query 253 output 
org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != ByteType; line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: boolean != tinyint; line 1 pos 28 -- !query 254 @@ -2152,7 +2152,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as smallint)) FROM struct<> -- !query 254 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != ShortType; line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: boolean != smallint; line 1 pos 28 -- !query 255 @@ -2161,7 +2161,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as int)) FROM t struct<> -- !query 255 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != IntegerType; line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: boolean != int; line 1 pos 28 -- !query 256 @@ -2170,7 +2170,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as bigint)) FROM t struct<> -- !query 256 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != LongType; line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: boolean != bigint; line 1 pos 28 -- !query 257 @@ -2179,7 +2179,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as float)) FROM t struct<> -- !query 257 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: BooleanType != FloatType; line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: boolean != float; line 1 pos 28 -- !query 258 @@ -2188,7 +2188,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as double)) FROM t struct<> -- !query 258 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: BooleanType != DoubleType; line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: boolean != double; line 1 pos 28 -- !query 259 @@ -2197,7 +2197,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as decimal(10, 0))) struct<> -- !query 259 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: 
BooleanType != DecimalType(10,0); line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: boolean != decimal(10,0); line 1 pos 28 -- !query 260 @@ -2206,7 +2206,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast(1 as string)) FROM t struct<> -- !query 260 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: BooleanType != StringType; line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST(1 AS STRING)))' due to data type mismatch: Arguments must be same type but were: boolean != string; line 1 pos 28 -- !query 261 @@ -2215,7 +2215,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast('1' as binary)) FROM struct<> -- !query 261 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: BooleanType != BinaryType; line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: boolean != binary; line 1 pos 28 -- !query 262 @@ -2232,7 +2232,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast('2017-12-11 09:30:00. struct<> -- !query 263 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: BooleanType != TimestampType; line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('2017-12-11 09:30:00.0' AS TIMESTAMP)))' due to data type mismatch: Arguments must be same type but were: boolean != timestamp; line 1 pos 28 -- !query 264 @@ -2241,7 +2241,7 @@ SELECT cast('1' as boolean) in (cast('1' as boolean), cast('2017-12-11 09:30:00' struct<> -- !query 264 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: BooleanType != DateType; line 1 pos 28 +cannot resolve '(CAST('1' AS BOOLEAN) IN (CAST('1' AS BOOLEAN), CAST('2017-12-11 09:30:00' AS DATE)))' due to data type mismatch: Arguments must be same type but were: boolean != date; line 1 pos 28 -- !query 265 @@ -2250,7 +2250,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00. struct<> -- !query 265 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != ByteType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != tinyint; line 1 pos 50 -- !query 266 @@ -2259,7 +2259,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00. 
struct<> -- !query 266 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != ShortType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != smallint; line 1 pos 50 -- !query 267 @@ -2268,7 +2268,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00. struct<> -- !query 267 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != IntegerType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: timestamp != int; line 1 pos 50 -- !query 268 @@ -2277,7 +2277,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00. struct<> -- !query 268 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != LongType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: timestamp != bigint; line 1 pos 50 -- !query 269 @@ -2286,7 +2286,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00. struct<> -- !query 269 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: TimestampType != FloatType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: timestamp != float; line 1 pos 50 -- !query 270 @@ -2295,7 +2295,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00. struct<> -- !query 270 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: TimestampType != DoubleType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: timestamp != double; line 1 pos 50 -- !query 271 @@ -2304,7 +2304,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00. 
struct<> -- !query 271 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: TimestampType != DecimalType(10,0); line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: timestamp != decimal(10,0); line 1 pos 50 -- !query 272 @@ -2321,7 +2321,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00. struct<> -- !query 273 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: TimestampType != BinaryType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: timestamp != binary; line 1 pos 50 -- !query 274 @@ -2330,7 +2330,7 @@ SELECT cast('2017-12-12 09:30:00.0' as timestamp) in (cast('2017-12-12 09:30:00. struct<> -- !query 274 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: TimestampType != BooleanType; line 1 pos 50 +cannot resolve '(CAST('2017-12-12 09:30:00.0' AS TIMESTAMP) IN (CAST('2017-12-12 09:30:00.0' AS TIMESTAMP), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: timestamp != boolean; line 1 pos 50 -- !query 275 @@ -2355,7 +2355,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as dat struct<> -- !query 277 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: DateType != ByteType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS TINYINT)))' due to data type mismatch: Arguments must be same type but were: date != tinyint; line 1 pos 43 -- !query 278 @@ -2364,7 +2364,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as dat struct<> -- !query 278 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: DateType != ShortType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS SMALLINT)))' due to data type mismatch: Arguments must be same type but were: date != smallint; line 1 pos 43 -- !query 279 @@ -2373,7 +2373,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as dat struct<> -- !query 279 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: DateType != IntegerType; line 1 pos 43 +cannot resolve 
'(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS INT)))' due to data type mismatch: Arguments must be same type but were: date != int; line 1 pos 43 -- !query 280 @@ -2382,7 +2382,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as dat struct<> -- !query 280 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: DateType != LongType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS BIGINT)))' due to data type mismatch: Arguments must be same type but were: date != bigint; line 1 pos 43 -- !query 281 @@ -2391,7 +2391,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as dat struct<> -- !query 281 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: DateType != FloatType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS FLOAT)))' due to data type mismatch: Arguments must be same type but were: date != float; line 1 pos 43 -- !query 282 @@ -2400,7 +2400,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as dat struct<> -- !query 282 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: DateType != DoubleType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS DOUBLE)))' due to data type mismatch: Arguments must be same type but were: date != double; line 1 pos 43 -- !query 283 @@ -2409,7 +2409,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as dat struct<> -- !query 283 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: DateType != DecimalType(10,0); line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS DECIMAL(10,0))))' due to data type mismatch: Arguments must be same type but were: date != decimal(10,0); line 1 pos 43 -- !query 284 @@ -2426,7 +2426,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as dat struct<> -- !query 285 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: DateType != BinaryType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST('1' AS BINARY)))' due to data type mismatch: Arguments must be same type but were: date != binary; line 1 pos 43 -- !query 286 @@ -2435,7 +2435,7 @@ SELECT cast('2017-12-12 09:30:00' as date) in (cast('2017-12-12 09:30:00' as dat struct<> -- !query 286 output org.apache.spark.sql.AnalysisException -cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN 
(CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: DateType != BooleanType; line 1 pos 43 +cannot resolve '(CAST('2017-12-12 09:30:00' AS DATE) IN (CAST('2017-12-12 09:30:00' AS DATE), CAST(1 AS BOOLEAN)))' due to data type mismatch: Arguments must be same type but were: date != boolean; line 1 pos 43 -- !query 287 diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out index 5dd257ba6a0bb..01d83938031fe 100644 --- a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/windowFrameCoercion.sql.out @@ -168,7 +168,7 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as string) DESC RANGE BETWE struct<> -- !query 20 output org.apache.spark.sql.AnalysisException -cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS STRING) FOLLOWING' due to data type mismatch: The data type of the upper bound 'StringType' does not match the expected data type '(numeric or calendarinterval)'.; line 1 pos 21 +cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS STRING) FOLLOWING' due to data type mismatch: The data type of the upper bound 'string' does not match the expected data type '(numeric or calendarinterval)'.; line 1 pos 21 -- !query 21 @@ -177,7 +177,7 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('1' as binary) DESC RANGE BET struct<> -- !query 21 output org.apache.spark.sql.AnalysisException -cannot resolve '(PARTITION BY 1 ORDER BY CAST('1' AS BINARY) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'BinaryType' used in the order specification does not match the data type 'IntegerType' which is used in the range frame.; line 1 pos 21 +cannot resolve '(PARTITION BY 1 ORDER BY CAST('1' AS BINARY) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'binary' used in the order specification does not match the data type 'int' which is used in the range frame.; line 1 pos 21 -- !query 22 @@ -186,7 +186,7 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast(1 as boolean) DESC RANGE BETW struct<> -- !query 22 output org.apache.spark.sql.AnalysisException -cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS BOOLEAN) FOLLOWING' due to data type mismatch: The data type of the upper bound 'BooleanType' does not match the expected data type '(numeric or calendarinterval)'.; line 1 pos 21 +cannot resolve 'RANGE BETWEEN CURRENT ROW AND CAST(1 AS BOOLEAN) FOLLOWING' due to data type mismatch: The data type of the upper bound 'boolean' does not match the expected data type '(numeric or calendarinterval)'.; line 1 pos 21 -- !query 23 @@ -195,7 +195,7 @@ SELECT COUNT(*) OVER (PARTITION BY 1 ORDER BY cast('2017-12-11 09:30:00.0' as ti struct<> -- !query 23 output org.apache.spark.sql.AnalysisException -cannot resolve '(PARTITION BY 1 ORDER BY CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'TimestampType' used in the order specification does not match the data type 'IntegerType' which is used in the range frame.; line 1 pos 21 +cannot resolve '(PARTITION BY 1 ORDER BY CAST('2017-12-11 09:30:00.0' AS TIMESTAMP) DESC NULLS LAST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to 
data type mismatch: The data type 'timestamp' used in the order specification does not match the data type 'int' which is used in the range frame.; line 1 pos 21 -- !query 24 diff --git a/sql/core/src/test/resources/sql-tests/results/window.sql.out b/sql/core/src/test/resources/sql-tests/results/window.sql.out index a52e198eb9a8f..133458ae9303b 100644 --- a/sql/core/src/test/resources/sql-tests/results/window.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/window.sql.out @@ -61,7 +61,7 @@ ROWS BETWEEN CURRENT ROW AND 2147483648 FOLLOWING) FROM testData ORDER BY cate, struct<> -- !query 3 output org.apache.spark.sql.AnalysisException -cannot resolve 'ROWS BETWEEN CURRENT ROW AND 2147483648L FOLLOWING' due to data type mismatch: The data type of the upper bound 'LongType' does not match the expected data type 'int'.; line 1 pos 41 +cannot resolve 'ROWS BETWEEN CURRENT ROW AND 2147483648L FOLLOWING' due to data type mismatch: The data type of the upper bound 'bigint' does not match the expected data type 'int'.; line 1 pos 41 -- !query 4 @@ -221,7 +221,7 @@ RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) FROM testData ORDER BY cate, val struct<> -- !query 14 output org.apache.spark.sql.AnalysisException -cannot resolve '(PARTITION BY testdata.`cate` ORDER BY current_timestamp() ASC NULLS FIRST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'TimestampType' used in the order specification does not match the data type 'IntegerType' which is used in the range frame.; line 1 pos 33 +cannot resolve '(PARTITION BY testdata.`cate` ORDER BY current_timestamp() ASC NULLS FIRST RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)' due to data type mismatch: The data type 'timestamp' used in the order specification does not match the data type 'int' which is used in the range frame.; line 1 pos 33 -- !query 15 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index bd1e7adefc7a9..d535896723bd5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -660,7 +660,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext { val e = intercept[AnalysisException] { df.as[KryoData] }.message - assert(e.contains("cannot cast IntegerType to BinaryType")) + assert(e.contains("cannot cast int to binary")) } test("Java encoder") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala index 6b98209fd49b8..109fcf90a3ec9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala @@ -65,7 +65,7 @@ class GeneratorFunctionSuite extends QueryTest with SharedSQLContext { val m3 = intercept[AnalysisException] { df.selectExpr("stack(2, 1, '2.2')") }.getMessage - assert(m3.contains("data type mismatch: Argument 1 (IntegerType) != Argument 2 (StringType)")) + assert(m3.contains("data type mismatch: Argument 1 (int) != Argument 2 (string)")) // stack on column data val df2 = Seq((2, 1, 2, 3)).toDF("n", "a", "b", "c") @@ -80,7 +80,7 @@ class GeneratorFunctionSuite extends QueryTest with SharedSQLContext { val m5 = intercept[AnalysisException] { df3.selectExpr("stack(2, a, b)") }.getMessage - assert(m5.contains("data type mismatch: Argument 1 (IntegerType) != Argument 2 (DoubleType)")) + 
assert(m5.contains("data type mismatch: Argument 1 (int) != Argument 2 (double)")) } From 12d20dd75b1620da362dbb5345bed58e47ddacb9 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Mon, 25 Dec 2017 20:29:10 +0900 Subject: [PATCH 189/356] [SPARK-22874][PYSPARK][SQL][FOLLOW-UP] Modify error messages to show actual versions. ## What changes were proposed in this pull request? This is a follow-up pr of #20054 modifying error messages for both pandas and pyarrow to show actual versions. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #20074 from ueshin/issues/SPARK-22874_fup1. --- python/pyspark/sql/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/utils.py b/python/pyspark/sql/utils.py index fb7d42a35d8f4..08c34c6dccc5e 100644 --- a/python/pyspark/sql/utils.py +++ b/python/pyspark/sql/utils.py @@ -118,7 +118,8 @@ def require_minimum_pandas_version(): from distutils.version import LooseVersion import pandas if LooseVersion(pandas.__version__) < LooseVersion('0.19.2'): - raise ImportError("Pandas >= 0.19.2 must be installed on calling Python process") + raise ImportError("Pandas >= 0.19.2 must be installed on calling Python process; " + "however, your version was %s." % pandas.__version__) def require_minimum_pyarrow_version(): @@ -127,4 +128,5 @@ def require_minimum_pyarrow_version(): from distutils.version import LooseVersion import pyarrow if LooseVersion(pyarrow.__version__) < LooseVersion('0.8.0'): - raise ImportError("pyarrow >= 0.8.0 must be installed on calling Python process") + raise ImportError("pyarrow >= 0.8.0 must be installed on calling Python process; " + "however, your version was %s." % pyarrow.__version__) From be03d3ad793dfe8f3ec33074d4ae95f5adb86ee4 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 25 Dec 2017 16:17:39 -0800 Subject: [PATCH 190/356] [SPARK-22893][SQL][HOTFIX] Fix a error message of VersionsSuite ## What changes were proposed in this pull request? https://github.com/apache/spark/pull/20064 breaks Jenkins tests because it missed to update one error message for Hive 0.12 and Hive 0.13. This PR fixes that. - https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-master-test-sbt-hadoop-2.7/3924/ - https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-master-test-sbt-hadoop-2.6/3977/ - https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-master-test-maven-hadoop-2.7/4226/ - https://amplab.cs.berkeley.edu/jenkins/view/Spark%20QA%20Test%20(Dashboard)/job/spark-master-test-maven-hadoop-2.6/4260/ ## How was this patch tested? Pass the Jenkins without failure. Author: Dongjoon Hyun Closes #20079 from dongjoon-hyun/SPARK-22893. 
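For readers scanning the before/after strings in the golden files and test asserts above: the only difference is which rendering of a Spark SQL type name gets printed — the internal Scala object name (e.g. `IntegerType`, `DecimalType(10,0)`) versus the SQL-style name (e.g. `int`, `decimal(10,0)`). A minimal illustrative sketch of the two renderings, assuming only Spark's public `DataType` API; this snippet is not part of the patch:

```scala
import org.apache.spark.sql.types.{DecimalType, IntegerType}

object TypeNameRendering {
  def main(args: Array[String]): Unit = {
    // Internal object names, the way the old error messages rendered types.
    println(IntegerType.toString)             // IntegerType
    println(DecimalType(10, 0).toString)      // DecimalType(10,0)

    // SQL-style names, the way the updated error messages render types.
    println(IntegerType.simpleString)         // int
    println(DecimalType(10, 0).simpleString)  // decimal(10,0)
  }
}
```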
--- .../scala/org/apache/spark/sql/hive/client/VersionsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 9d15dabc8d3f5..94473a08dd317 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -773,7 +773,7 @@ class VersionsSuite extends SparkFunSuite with Logging { """.stripMargin ) - val errorMsg = "data type mismatch: cannot cast DecimalType(2,1) to BinaryType" + val errorMsg = "data type mismatch: cannot cast decimal(2,1) to binary" if (isPartitioned) { val insertStmt = s"INSERT OVERWRITE TABLE $tableName partition (ds='a') SELECT 1.3" From 0e6833006d28df426eb132bb8fc82917b8e2aedd Mon Sep 17 00:00:00 2001 From: Yash Sharma Date: Tue, 26 Dec 2017 09:50:39 +0200 Subject: [PATCH 191/356] [SPARK-20168][DSTREAM] Add changes to use kinesis fetches from specific timestamp ## What changes were proposed in this pull request? Kinesis client can resume from a specified timestamp while creating a stream. We should have option to pass a timestamp in config to allow kinesis to resume from the given timestamp. The patch introduces a new `KinesisInitialPositionInStream` that takes the `InitialPositionInStream` with the `timestamp` information that can be used to resume kinesis fetches from the provided timestamp. ## How was this patch tested? Unit Tests cc : budde brkyvz Author: Yash Sharma Closes #18029 from yssharma/ysharma/kcl_resume. --- .../kinesis/KinesisInitialPositions.java | 91 +++++++++++++++++++ .../streaming/KinesisWordCountASL.scala | 5 +- .../kinesis/KinesisInputDStream.scala | 31 +++++-- .../streaming/kinesis/KinesisReceiver.scala | 45 +++++---- .../streaming/kinesis/KinesisUtils.scala | 15 ++- .../JavaKinesisInputDStreamBuilderSuite.java | 47 ++++++++-- .../KinesisInputDStreamBuilderSuite.scala | 68 +++++++++++++- .../kinesis/KinesisStreamSuite.scala | 11 ++- 8 files changed, 264 insertions(+), 49 deletions(-) create mode 100644 external/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java diff --git a/external/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java b/external/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java new file mode 100644 index 0000000000000..206e1e4699030 --- /dev/null +++ b/external/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.streaming.kinesis; + +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream; + +import java.io.Serializable; +import java.util.Date; + +/** + * A java wrapper for exposing [[InitialPositionInStream]] + * to the corresponding Kinesis readers. + */ +interface KinesisInitialPosition { + InitialPositionInStream getPosition(); +} + +public class KinesisInitialPositions { + public static class Latest implements KinesisInitialPosition, Serializable { + public Latest() {} + + @Override + public InitialPositionInStream getPosition() { + return InitialPositionInStream.LATEST; + } + } + + public static class TrimHorizon implements KinesisInitialPosition, Serializable { + public TrimHorizon() {} + + @Override + public InitialPositionInStream getPosition() { + return InitialPositionInStream.TRIM_HORIZON; + } + } + + public static class AtTimestamp implements KinesisInitialPosition, Serializable { + private Date timestamp; + + public AtTimestamp(Date timestamp) { + this.timestamp = timestamp; + } + + @Override + public InitialPositionInStream getPosition() { + return InitialPositionInStream.AT_TIMESTAMP; + } + + public Date getTimestamp() { + return timestamp; + } + } + + + /** + * Returns instance of [[KinesisInitialPosition]] based on the passed [[InitialPositionInStream]]. + * This method is used in KinesisUtils for translating the InitialPositionInStream + * to InitialPosition. This function would be removed when we deprecate the KinesisUtils. + * + * @return [[InitialPosition]] + */ + public static KinesisInitialPosition fromKinesisInitialPosition( + InitialPositionInStream initialPositionInStream) throws UnsupportedOperationException { + if (initialPositionInStream == InitialPositionInStream.LATEST) { + return new Latest(); + } else if (initialPositionInStream == InitialPositionInStream.TRIM_HORIZON) { + return new TrimHorizon(); + } else { + // InitialPositionInStream.AT_TIMESTAMP is not supported. + // Use InitialPosition.atTimestamp(timestamp) instead. + throw new UnsupportedOperationException( + "Only InitialPositionInStream.LATEST and InitialPositionInStream.TRIM_HORIZON " + + "supported in initialPositionInStream(). 
Please use the initialPosition() from " + + "builder API in KinesisInputDStream for using InitialPositionInStream.AT_TIMESTAMP"); + } + } +} diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala index cde2c4b04c0c7..fcb790e3ea1f9 100644 --- a/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala +++ b/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala @@ -24,7 +24,6 @@ import scala.util.Random import com.amazonaws.auth.DefaultAWSCredentialsProviderChain import com.amazonaws.services.kinesis.AmazonKinesisClient -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.model.PutRecordRequest import org.apache.log4j.{Level, Logger} @@ -33,9 +32,9 @@ import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext} import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions +import org.apache.spark.streaming.kinesis.KinesisInitialPositions.Latest import org.apache.spark.streaming.kinesis.KinesisInputDStream - /** * Consumes messages from a Amazon Kinesis streams and does wordcount. * @@ -139,7 +138,7 @@ object KinesisWordCountASL extends Logging { .streamName(streamName) .endpointUrl(endpointUrl) .regionName(regionName) - .initialPositionInStream(InitialPositionInStream.LATEST) + .initialPosition(new Latest()) .checkpointAppName(appName) .checkpointInterval(kinesisCheckpointInterval) .storageLevel(StorageLevel.MEMORY_AND_DISK_2) diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala index f61e398e7bdd1..1ffec01df9f00 100644 --- a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala +++ b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala @@ -28,6 +28,7 @@ import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.{Duration, StreamingContext, Time} import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.ReceiverInputDStream +import org.apache.spark.streaming.kinesis.KinesisInitialPositions.Latest import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo @@ -36,7 +37,7 @@ private[kinesis] class KinesisInputDStream[T: ClassTag]( val streamName: String, val endpointUrl: String, val regionName: String, - val initialPositionInStream: InitialPositionInStream, + val initialPosition: KinesisInitialPosition, val checkpointAppName: String, val checkpointInterval: Duration, val _storageLevel: StorageLevel, @@ -77,7 +78,7 @@ private[kinesis] class KinesisInputDStream[T: ClassTag]( } override def getReceiver(): Receiver[T] = { - new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, + new KinesisReceiver(streamName, endpointUrl, regionName, initialPosition, checkpointAppName, checkpointInterval, _storageLevel, messageHandler, kinesisCreds, dynamoDBCreds, cloudWatchCreds) } @@ -100,7 +101,7 @@ object KinesisInputDStream { // Params with defaults private var endpointUrl: Option[String] = None private 
var regionName: Option[String] = None - private var initialPositionInStream: Option[InitialPositionInStream] = None + private var initialPosition: Option[KinesisInitialPosition] = None private var checkpointInterval: Option[Duration] = None private var storageLevel: Option[StorageLevel] = None private var kinesisCredsProvider: Option[SparkAWSCredentials] = None @@ -180,16 +181,32 @@ object KinesisInputDStream { this } + /** + * Sets the initial position data is read from in the Kinesis stream. Defaults to + * [[KinesisInitialPositions.Latest]] if no custom value is specified. + * + * @param initialPosition [[KinesisInitialPosition]] value specifying where Spark Streaming + * will start reading records in the Kinesis stream from + * @return Reference to this [[KinesisInputDStream.Builder]] + */ + def initialPosition(initialPosition: KinesisInitialPosition): Builder = { + this.initialPosition = Option(initialPosition) + this + } + /** * Sets the initial position data is read from in the Kinesis stream. Defaults to * [[InitialPositionInStream.LATEST]] if no custom value is specified. + * This function would be removed when we deprecate the KinesisUtils. * * @param initialPosition InitialPositionInStream value specifying where Spark Streaming * will start reading records in the Kinesis stream from * @return Reference to this [[KinesisInputDStream.Builder]] */ + @deprecated("use initialPosition(initialPosition: KinesisInitialPosition)", "2.3.0") def initialPositionInStream(initialPosition: InitialPositionInStream): Builder = { - initialPositionInStream = Option(initialPosition) + this.initialPosition = Option( + KinesisInitialPositions.fromKinesisInitialPosition(initialPosition)) this } @@ -266,7 +283,7 @@ object KinesisInputDStream { getRequiredParam(streamName, "streamName"), endpointUrl.getOrElse(DEFAULT_KINESIS_ENDPOINT_URL), regionName.getOrElse(DEFAULT_KINESIS_REGION_NAME), - initialPositionInStream.getOrElse(DEFAULT_INITIAL_POSITION_IN_STREAM), + initialPosition.getOrElse(DEFAULT_INITIAL_POSITION), getRequiredParam(checkpointAppName, "checkpointAppName"), checkpointInterval.getOrElse(ssc.graph.batchDuration), storageLevel.getOrElse(DEFAULT_STORAGE_LEVEL), @@ -293,7 +310,6 @@ object KinesisInputDStream { * Creates a [[KinesisInputDStream.Builder]] for constructing [[KinesisInputDStream]] instances. 
* * @since 2.2.0 - * * @return [[KinesisInputDStream.Builder]] instance */ def builder: Builder = new Builder @@ -309,7 +325,6 @@ object KinesisInputDStream { private[kinesis] val DEFAULT_KINESIS_ENDPOINT_URL: String = "https://kinesis.us-east-1.amazonaws.com" private[kinesis] val DEFAULT_KINESIS_REGION_NAME: String = "us-east-1" - private[kinesis] val DEFAULT_INITIAL_POSITION_IN_STREAM: InitialPositionInStream = - InitialPositionInStream.LATEST + private[kinesis] val DEFAULT_INITIAL_POSITION: KinesisInitialPosition = new Latest() private[kinesis] val DEFAULT_STORAGE_LEVEL: StorageLevel = StorageLevel.MEMORY_AND_DISK_2 } diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala index 1026d0fcb59bd..fa0de6298a5f1 100644 --- a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala +++ b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala @@ -24,12 +24,13 @@ import scala.collection.mutable import scala.util.control.NonFatal import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor, IRecordProcessorCheckpointer, IRecordProcessorFactory} -import com.amazonaws.services.kinesis.clientlibrary.lib.worker.{InitialPositionInStream, KinesisClientLibConfiguration, Worker} +import com.amazonaws.services.kinesis.clientlibrary.lib.worker.{KinesisClientLibConfiguration, Worker} import com.amazonaws.services.kinesis.model.Record import org.apache.spark.internal.Logging import org.apache.spark.storage.{StorageLevel, StreamBlockId} import org.apache.spark.streaming.Duration +import org.apache.spark.streaming.kinesis.KinesisInitialPositions.AtTimestamp import org.apache.spark.streaming.receiver.{BlockGenerator, BlockGeneratorListener, Receiver} import org.apache.spark.util.Utils @@ -56,12 +57,13 @@ import org.apache.spark.util.Utils * @param endpointUrl Url of Kinesis service (e.g., https://kinesis.us-east-1.amazonaws.com) * @param regionName Region name used by the Kinesis Client Library for * DynamoDB (lease coordination and checkpointing) and CloudWatch (metrics) - * @param initialPositionInStream In the absence of Kinesis checkpoint info, this is the - * worker's initial starting position in the stream. - * The values are either the beginning of the stream - * per Kinesis' limit of 24 hours - * (InitialPositionInStream.TRIM_HORIZON) or - * the tip of the stream (InitialPositionInStream.LATEST). + * @param initialPosition Instance of [[KinesisInitialPosition]] + * In the absence of Kinesis checkpoint info, this is the + * worker's initial starting position in the stream. + * The values are either the beginning of the stream + * per Kinesis' limit of 24 hours + * ([[KinesisInitialPositions.TrimHorizon]]) or + * the tip of the stream ([[KinesisInitialPositions.Latest]]). * @param checkpointAppName Kinesis application name. Kinesis Apps are mapped to Kinesis Streams * by the Kinesis Client Library. If you change the App name or Stream name, * the KCL will throw errors. 
This usually requires deleting the backing @@ -83,7 +85,7 @@ private[kinesis] class KinesisReceiver[T]( val streamName: String, endpointUrl: String, regionName: String, - initialPositionInStream: InitialPositionInStream, + initialPosition: KinesisInitialPosition, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, @@ -148,18 +150,29 @@ private[kinesis] class KinesisReceiver[T]( kinesisCheckpointer = new KinesisCheckpointer(receiver, checkpointInterval, workerId) val kinesisProvider = kinesisCreds.provider - val kinesisClientLibConfiguration = new KinesisClientLibConfiguration( - checkpointAppName, - streamName, - kinesisProvider, - dynamoDBCreds.map(_.provider).getOrElse(kinesisProvider), - cloudWatchCreds.map(_.provider).getOrElse(kinesisProvider), - workerId) + + val kinesisClientLibConfiguration = { + val baseClientLibConfiguration = new KinesisClientLibConfiguration( + checkpointAppName, + streamName, + kinesisProvider, + dynamoDBCreds.map(_.provider).getOrElse(kinesisProvider), + cloudWatchCreds.map(_.provider).getOrElse(kinesisProvider), + workerId) .withKinesisEndpoint(endpointUrl) - .withInitialPositionInStream(initialPositionInStream) + .withInitialPositionInStream(initialPosition.getPosition) .withTaskBackoffTimeMillis(500) .withRegionName(regionName) + // Update the Kinesis client lib config with timestamp + // if InitialPositionInStream.AT_TIMESTAMP is passed + initialPosition match { + case ts: AtTimestamp => + baseClientLibConfiguration.withTimestampAtInitialPositionInStream(ts.getTimestamp) + case _ => baseClientLibConfiguration + } + } + /* * RecordProcessorFactory creates impls of IRecordProcessor. * IRecordProcessor adapts the KCL to our Spark KinesisReceiver via the diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala index 1298463bfba1e..2500460bd330b 100644 --- a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala +++ b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala @@ -73,7 +73,8 @@ object KinesisUtils { // Setting scope to override receiver stream's scope of "receiver stream" ssc.withNamedScope("kinesis stream") { new KinesisInputDStream[T](ssc, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, + KinesisInitialPositions.fromKinesisInitialPosition(initialPositionInStream), + kinesisAppName, checkpointInterval, storageLevel, cleanedHandler, DefaultCredentials, None, None) } } @@ -129,7 +130,8 @@ object KinesisUtils { awsAccessKeyId = awsAccessKeyId, awsSecretKey = awsSecretKey) new KinesisInputDStream[T](ssc, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, + KinesisInitialPositions.fromKinesisInitialPosition(initialPositionInStream), + kinesisAppName, checkpointInterval, storageLevel, cleanedHandler, kinesisCredsProvider, None, None) } } @@ -198,7 +200,8 @@ object KinesisUtils { awsAccessKeyId = awsAccessKeyId, awsSecretKey = awsSecretKey)) new KinesisInputDStream[T](ssc, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, + KinesisInitialPositions.fromKinesisInitialPosition(initialPositionInStream), + kinesisAppName, checkpointInterval, storageLevel, cleanedHandler, 
kinesisCredsProvider, None, None) } } @@ -243,7 +246,8 @@ object KinesisUtils { // Setting scope to override receiver stream's scope of "receiver stream" ssc.withNamedScope("kinesis stream") { new KinesisInputDStream[Array[Byte]](ssc, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, + KinesisInitialPositions.fromKinesisInitialPosition(initialPositionInStream), + kinesisAppName, checkpointInterval, storageLevel, KinesisInputDStream.defaultMessageHandler, DefaultCredentials, None, None) } } @@ -293,7 +297,8 @@ object KinesisUtils { awsAccessKeyId = awsAccessKeyId, awsSecretKey = awsSecretKey) new KinesisInputDStream[Array[Byte]](ssc, streamName, endpointUrl, validateRegion(regionName), - initialPositionInStream, kinesisAppName, checkpointInterval, storageLevel, + KinesisInitialPositions.fromKinesisInitialPosition(initialPositionInStream), + kinesisAppName, checkpointInterval, storageLevel, KinesisInputDStream.defaultMessageHandler, kinesisCredsProvider, None, None) } } diff --git a/external/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java b/external/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java index be6d549b1e429..03becd73d1a06 100644 --- a/external/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java +++ b/external/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java @@ -17,14 +17,13 @@ package org.apache.spark.streaming.kinesis; -import org.junit.Test; - import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream; - +import org.apache.spark.streaming.kinesis.KinesisInitialPositions.TrimHorizon; import org.apache.spark.storage.StorageLevel; import org.apache.spark.streaming.Duration; -import org.apache.spark.streaming.Seconds; import org.apache.spark.streaming.LocalJavaStreamingContext; +import org.apache.spark.streaming.Seconds; +import org.junit.Test; public class JavaKinesisInputDStreamBuilderSuite extends LocalJavaStreamingContext { /** @@ -35,7 +34,41 @@ public void testJavaKinesisDStreamBuilder() { String streamName = "a-very-nice-stream-name"; String endpointUrl = "https://kinesis.us-west-2.amazonaws.com"; String region = "us-west-2"; - InitialPositionInStream initialPosition = InitialPositionInStream.TRIM_HORIZON; + KinesisInitialPosition initialPosition = new TrimHorizon(); + String appName = "a-very-nice-kinesis-app"; + Duration checkpointInterval = Seconds.apply(30); + StorageLevel storageLevel = StorageLevel.MEMORY_ONLY(); + + KinesisInputDStream kinesisDStream = KinesisInputDStream.builder() + .streamingContext(ssc) + .streamName(streamName) + .endpointUrl(endpointUrl) + .regionName(region) + .initialPosition(initialPosition) + .checkpointAppName(appName) + .checkpointInterval(checkpointInterval) + .storageLevel(storageLevel) + .build(); + assert(kinesisDStream.streamName() == streamName); + assert(kinesisDStream.endpointUrl() == endpointUrl); + assert(kinesisDStream.regionName() == region); + assert(kinesisDStream.initialPosition().getPosition() == initialPosition.getPosition()); + assert(kinesisDStream.checkpointAppName() == appName); + assert(kinesisDStream.checkpointInterval() == checkpointInterval); + assert(kinesisDStream._storageLevel() == storageLevel); + ssc.stop(); + } + + /** + * Test to ensure that the old API for InitialPositionInStream + * is supported 
in KinesisDStream.Builder. + * This test would be removed when we deprecate the KinesisUtils. + */ + @Test + public void testJavaKinesisDStreamBuilderOldApi() { + String streamName = "a-very-nice-stream-name"; + String endpointUrl = "https://kinesis.us-west-2.amazonaws.com"; + String region = "us-west-2"; String appName = "a-very-nice-kinesis-app"; Duration checkpointInterval = Seconds.apply(30); StorageLevel storageLevel = StorageLevel.MEMORY_ONLY(); @@ -45,7 +78,7 @@ public void testJavaKinesisDStreamBuilder() { .streamName(streamName) .endpointUrl(endpointUrl) .regionName(region) - .initialPositionInStream(initialPosition) + .initialPositionInStream(InitialPositionInStream.LATEST) .checkpointAppName(appName) .checkpointInterval(checkpointInterval) .storageLevel(storageLevel) @@ -53,7 +86,7 @@ public void testJavaKinesisDStreamBuilder() { assert(kinesisDStream.streamName() == streamName); assert(kinesisDStream.endpointUrl() == endpointUrl); assert(kinesisDStream.regionName() == region); - assert(kinesisDStream.initialPositionInStream() == initialPosition); + assert(kinesisDStream.initialPosition().getPosition() == InitialPositionInStream.LATEST); assert(kinesisDStream.checkpointAppName() == appName); assert(kinesisDStream.checkpointInterval() == checkpointInterval); assert(kinesisDStream._storageLevel() == storageLevel); diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala index afa1a7f8ca663..e0e26847aa0ec 100644 --- a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala +++ b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala @@ -17,12 +17,15 @@ package org.apache.spark.streaming.kinesis +import java.util.Calendar + import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import org.scalatest.BeforeAndAfterEach import org.scalatest.mockito.MockitoSugar import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.{Seconds, StreamingContext, TestSuiteBase} +import org.apache.spark.streaming.{Duration, Seconds, StreamingContext, TestSuiteBase} +import org.apache.spark.streaming.kinesis.KinesisInitialPositions.{AtTimestamp, TrimHorizon} class KinesisInputDStreamBuilderSuite extends TestSuiteBase with BeforeAndAfterEach with MockitoSugar { @@ -69,7 +72,7 @@ class KinesisInputDStreamBuilderSuite extends TestSuiteBase with BeforeAndAfterE val dstream = builder.build() assert(dstream.endpointUrl == DEFAULT_KINESIS_ENDPOINT_URL) assert(dstream.regionName == DEFAULT_KINESIS_REGION_NAME) - assert(dstream.initialPositionInStream == DEFAULT_INITIAL_POSITION_IN_STREAM) + assert(dstream.initialPosition == DEFAULT_INITIAL_POSITION) assert(dstream.checkpointInterval == batchDuration) assert(dstream._storageLevel == DEFAULT_STORAGE_LEVEL) assert(dstream.kinesisCreds == DefaultCredentials) @@ -80,7 +83,7 @@ class KinesisInputDStreamBuilderSuite extends TestSuiteBase with BeforeAndAfterE test("should propagate custom non-auth values to KinesisInputDStream") { val customEndpointUrl = "https://kinesis.us-west-2.amazonaws.com" val customRegion = "us-west-2" - val customInitialPosition = InitialPositionInStream.TRIM_HORIZON + val customInitialPosition = new TrimHorizon() val customAppName = "a-very-nice-kinesis-app" val customCheckpointInterval = Seconds(30) val 
customStorageLevel = StorageLevel.MEMORY_ONLY @@ -91,7 +94,7 @@ class KinesisInputDStreamBuilderSuite extends TestSuiteBase with BeforeAndAfterE val dstream = builder .endpointUrl(customEndpointUrl) .regionName(customRegion) - .initialPositionInStream(customInitialPosition) + .initialPosition(customInitialPosition) .checkpointAppName(customAppName) .checkpointInterval(customCheckpointInterval) .storageLevel(customStorageLevel) @@ -101,12 +104,67 @@ class KinesisInputDStreamBuilderSuite extends TestSuiteBase with BeforeAndAfterE .build() assert(dstream.endpointUrl == customEndpointUrl) assert(dstream.regionName == customRegion) - assert(dstream.initialPositionInStream == customInitialPosition) + assert(dstream.initialPosition == customInitialPosition) assert(dstream.checkpointAppName == customAppName) assert(dstream.checkpointInterval == customCheckpointInterval) assert(dstream._storageLevel == customStorageLevel) assert(dstream.kinesisCreds == customKinesisCreds) assert(dstream.dynamoDBCreds == Option(customDynamoDBCreds)) assert(dstream.cloudWatchCreds == Option(customCloudWatchCreds)) + + // Testing with AtTimestamp + val cal = Calendar.getInstance() + cal.add(Calendar.DATE, -1) + val timestamp = cal.getTime() + val initialPositionAtTimestamp = new AtTimestamp(timestamp) + + val dstreamAtTimestamp = builder + .endpointUrl(customEndpointUrl) + .regionName(customRegion) + .initialPosition(initialPositionAtTimestamp) + .checkpointAppName(customAppName) + .checkpointInterval(customCheckpointInterval) + .storageLevel(customStorageLevel) + .kinesisCredentials(customKinesisCreds) + .dynamoDBCredentials(customDynamoDBCreds) + .cloudWatchCredentials(customCloudWatchCreds) + .build() + assert(dstreamAtTimestamp.endpointUrl == customEndpointUrl) + assert(dstreamAtTimestamp.regionName == customRegion) + assert(dstreamAtTimestamp.initialPosition.getPosition + == initialPositionAtTimestamp.getPosition) + assert( + dstreamAtTimestamp.initialPosition.asInstanceOf[AtTimestamp].getTimestamp.equals(timestamp)) + assert(dstreamAtTimestamp.checkpointAppName == customAppName) + assert(dstreamAtTimestamp.checkpointInterval == customCheckpointInterval) + assert(dstreamAtTimestamp._storageLevel == customStorageLevel) + assert(dstreamAtTimestamp.kinesisCreds == customKinesisCreds) + assert(dstreamAtTimestamp.dynamoDBCreds == Option(customDynamoDBCreds)) + assert(dstreamAtTimestamp.cloudWatchCreds == Option(customCloudWatchCreds)) + } + + test("old Api should throw UnsupportedOperationExceptionexception with AT_TIMESTAMP") { + val streamName: String = "a-very-nice-stream-name" + val endpointUrl: String = "https://kinesis.us-west-2.amazonaws.com" + val region: String = "us-west-2" + val appName: String = "a-very-nice-kinesis-app" + val checkpointInterval: Duration = Seconds.apply(30) + val storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY + + // This should not build. + // InitialPositionInStream.AT_TIMESTAMP is not supported in old Api. + // The builder Api in KinesisInputDStream should be used. 
+ intercept[UnsupportedOperationException] { + val kinesisDStream: KinesisInputDStream[Array[Byte]] = KinesisInputDStream.builder + .streamingContext(ssc) + .streamName(streamName) + .endpointUrl(endpointUrl) + .regionName(region) + .initialPositionInStream(InitialPositionInStream.AT_TIMESTAMP) + .checkpointAppName(appName) + .checkpointInterval(checkpointInterval) + .storageLevel(storageLevel) + .build + } } } diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala index 7e5bda923f63e..a7a68eba910bf 100644 --- a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala +++ b/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala @@ -34,6 +34,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.storage.{StorageLevel, StreamBlockId} import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream.ReceiverInputDStream +import org.apache.spark.streaming.kinesis.KinesisInitialPositions.Latest import org.apache.spark.streaming.kinesis.KinesisReadConfigurations._ import org.apache.spark.streaming.kinesis.KinesisTestUtils._ import org.apache.spark.streaming.receiver.BlockManagerBasedStoreResult @@ -178,7 +179,7 @@ abstract class KinesisStreamTests(aggregateTestData: Boolean) extends KinesisFun .streamName(testUtils.streamName) .endpointUrl(testUtils.endpointUrl) .regionName(testUtils.regionName) - .initialPositionInStream(InitialPositionInStream.LATEST) + .initialPosition(new Latest()) .checkpointInterval(Seconds(10)) .storageLevel(StorageLevel.MEMORY_ONLY) .build() @@ -209,7 +210,7 @@ abstract class KinesisStreamTests(aggregateTestData: Boolean) extends KinesisFun .streamName(testUtils.streamName) .endpointUrl(testUtils.endpointUrl) .regionName(testUtils.regionName) - .initialPositionInStream(InitialPositionInStream.LATEST) + .initialPosition(new Latest()) .checkpointInterval(Seconds(10)) .storageLevel(StorageLevel.MEMORY_ONLY) .buildWithMessageHandler(addFive(_)) @@ -245,7 +246,7 @@ abstract class KinesisStreamTests(aggregateTestData: Boolean) extends KinesisFun .streamName("dummyStream") .endpointUrl(dummyEndpointUrl) .regionName(dummyRegionName) - .initialPositionInStream(InitialPositionInStream.LATEST) + .initialPosition(new Latest()) .checkpointInterval(Seconds(10)) .storageLevel(StorageLevel.MEMORY_ONLY) .build() @@ -293,7 +294,7 @@ abstract class KinesisStreamTests(aggregateTestData: Boolean) extends KinesisFun .streamName(localTestUtils.streamName) .endpointUrl(localTestUtils.endpointUrl) .regionName(localTestUtils.regionName) - .initialPositionInStream(InitialPositionInStream.LATEST) + .initialPosition(new Latest()) .checkpointInterval(Seconds(10)) .storageLevel(StorageLevel.MEMORY_ONLY) .build() @@ -369,7 +370,7 @@ abstract class KinesisStreamTests(aggregateTestData: Boolean) extends KinesisFun .streamName(testUtils.streamName) .endpointUrl(testUtils.endpointUrl) .regionName(testUtils.regionName) - .initialPositionInStream(InitialPositionInStream.LATEST) + .initialPosition(new Latest()) .checkpointInterval(Seconds(10)) .storageLevel(StorageLevel.MEMORY_ONLY) .build() From eb386be1ed383323da6e757f63f3b8a7ced38cc4 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Tue, 26 Dec 2017 21:37:25 +0900 Subject: [PATCH 192/356] [SPARK-21552][SQL] Add DecimalType support to ArrowWriter. ## What changes were proposed in this pull request? 
Decimal type is not yet supported in `ArrowWriter`. This is adding the decimal type support. ## How was this patch tested? Added a test to `ArrowConvertersSuite`. Author: Takuya UESHIN Closes #18754 from ueshin/issues/SPARK-21552. --- python/pyspark/sql/tests.py | 61 +++++++++++------ python/pyspark/sql/types.py | 2 +- .../sql/execution/arrow/ArrowWriter.scala | 21 ++++++ .../arrow/ArrowConvertersSuite.scala | 67 ++++++++++++++++++- .../execution/arrow/ArrowWriterSuite.scala | 2 + 5 files changed, 131 insertions(+), 22 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index b977160af566d..b811a0f8a31d3 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3142,6 +3142,7 @@ class ArrowTests(ReusedSQLTestCase): @classmethod def setUpClass(cls): from datetime import datetime + from decimal import Decimal ReusedSQLTestCase.setUpClass() # Synchronize default timezone between Python and Java @@ -3158,11 +3159,15 @@ def setUpClass(cls): StructField("3_long_t", LongType(), True), StructField("4_float_t", FloatType(), True), StructField("5_double_t", DoubleType(), True), - StructField("6_date_t", DateType(), True), - StructField("7_timestamp_t", TimestampType(), True)]) - cls.data = [(u"a", 1, 10, 0.2, 2.0, datetime(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1)), - (u"b", 2, 20, 0.4, 4.0, datetime(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)), - (u"c", 3, 30, 0.8, 6.0, datetime(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3))] + StructField("6_decimal_t", DecimalType(38, 18), True), + StructField("7_date_t", DateType(), True), + StructField("8_timestamp_t", TimestampType(), True)]) + cls.data = [(u"a", 1, 10, 0.2, 2.0, Decimal("2.0"), + datetime(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1)), + (u"b", 2, 20, 0.4, 4.0, Decimal("4.0"), + datetime(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)), + (u"c", 3, 30, 0.8, 6.0, Decimal("6.0"), + datetime(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3))] @classmethod def tearDownClass(cls): @@ -3190,10 +3195,11 @@ def create_pandas_data_frame(self): return pd.DataFrame(data=data_dict) def test_unsupported_datatype(self): - schema = StructType([StructField("decimal", DecimalType(), True)]) + schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) df = self.spark.createDataFrame([(None,)], schema=schema) with QuietTest(self.sc): - self.assertRaises(Exception, lambda: df.toPandas()) + with self.assertRaisesRegexp(Exception, 'Unsupported data type'): + df.toPandas() def test_null_conversion(self): df_null = self.spark.createDataFrame([tuple([None for _ in range(len(self.data[0]))])] + @@ -3293,7 +3299,7 @@ def test_createDataFrame_respect_session_timezone(self): self.assertNotEqual(result_ny, result_la) # Correct result_la by adjusting 3 hours difference between Los Angeles and New York - result_la_corrected = [Row(**{k: v - timedelta(hours=3) if k == '7_timestamp_t' else v + result_la_corrected = [Row(**{k: v - timedelta(hours=3) if k == '8_timestamp_t' else v for k, v in row.asDict().items()}) for row in result_la] self.assertEqual(result_ny, result_la_corrected) @@ -3317,11 +3323,11 @@ def test_createDataFrame_with_incorrect_schema(self): def test_createDataFrame_with_names(self): pdf = self.create_pandas_data_frame() # Test that schema as a list of column names gets applied - df = self.spark.createDataFrame(pdf, schema=list('abcdefg')) - self.assertEquals(df.schema.fieldNames(), list('abcdefg')) + df = self.spark.createDataFrame(pdf, schema=list('abcdefgh')) + 
self.assertEquals(df.schema.fieldNames(), list('abcdefgh')) # Test that schema as tuple of column names gets applied - df = self.spark.createDataFrame(pdf, schema=tuple('abcdefg')) - self.assertEquals(df.schema.fieldNames(), list('abcdefg')) + df = self.spark.createDataFrame(pdf, schema=tuple('abcdefgh')) + self.assertEquals(df.schema.fieldNames(), list('abcdefgh')) def test_createDataFrame_column_name_encoding(self): import pandas as pd @@ -3344,7 +3350,7 @@ def test_createDataFrame_does_not_modify_input(self): # Some series get converted for Spark to consume, this makes sure input is unchanged pdf = self.create_pandas_data_frame() # Use a nanosecond value to make sure it is not truncated - pdf.ix[0, '7_timestamp_t'] = pd.Timestamp(1) + pdf.ix[0, '8_timestamp_t'] = pd.Timestamp(1) # Integers with nulls will get NaNs filled with 0 and will be casted pdf.ix[1, '2_int_t'] = None pdf_copy = pdf.copy(deep=True) @@ -3514,6 +3520,7 @@ def test_vectorized_udf_basic(self): col('id').alias('long'), col('id').cast('float').alias('float'), col('id').cast('double').alias('double'), + col('id').cast('decimal').alias('decimal'), col('id').cast('boolean').alias('bool')) f = lambda x: x str_f = pandas_udf(f, StringType()) @@ -3521,10 +3528,12 @@ def test_vectorized_udf_basic(self): long_f = pandas_udf(f, LongType()) float_f = pandas_udf(f, FloatType()) double_f = pandas_udf(f, DoubleType()) + decimal_f = pandas_udf(f, DecimalType()) bool_f = pandas_udf(f, BooleanType()) res = df.select(str_f(col('str')), int_f(col('int')), long_f(col('long')), float_f(col('float')), - double_f(col('double')), bool_f(col('bool'))) + double_f(col('double')), decimal_f('decimal'), + bool_f(col('bool'))) self.assertEquals(df.collect(), res.collect()) def test_vectorized_udf_null_boolean(self): @@ -3590,6 +3599,16 @@ def test_vectorized_udf_null_double(self): res = df.select(double_f(col('double'))) self.assertEquals(df.collect(), res.collect()) + def test_vectorized_udf_null_decimal(self): + from decimal import Decimal + from pyspark.sql.functions import pandas_udf, col + data = [(Decimal(3.0),), (Decimal(5.0),), (Decimal(-1.0),), (None,)] + schema = StructType().add("decimal", DecimalType(38, 18)) + df = self.spark.createDataFrame(data, schema) + decimal_f = pandas_udf(lambda x: x, DecimalType(38, 18)) + res = df.select(decimal_f(col('decimal'))) + self.assertEquals(df.collect(), res.collect()) + def test_vectorized_udf_null_string(self): from pyspark.sql.functions import pandas_udf, col data = [("foo",), (None,), ("bar",), ("bar",)] @@ -3607,6 +3626,7 @@ def test_vectorized_udf_datatype_string(self): col('id').alias('long'), col('id').cast('float').alias('float'), col('id').cast('double').alias('double'), + col('id').cast('decimal').alias('decimal'), col('id').cast('boolean').alias('bool')) f = lambda x: x str_f = pandas_udf(f, 'string') @@ -3614,10 +3634,12 @@ def test_vectorized_udf_datatype_string(self): long_f = pandas_udf(f, 'long') float_f = pandas_udf(f, 'float') double_f = pandas_udf(f, 'double') + decimal_f = pandas_udf(f, 'decimal(38, 18)') bool_f = pandas_udf(f, 'boolean') res = df.select(str_f(col('str')), int_f(col('int')), long_f(col('long')), float_f(col('float')), - double_f(col('double')), bool_f(col('bool'))) + double_f(col('double')), decimal_f('decimal'), + bool_f(col('bool'))) self.assertEquals(df.collect(), res.collect()) def test_vectorized_udf_complex(self): @@ -3713,12 +3735,12 @@ def test_vectorized_udf_varargs(self): def test_vectorized_udf_unsupported_types(self): from pyspark.sql.functions 
import pandas_udf, col - schema = StructType([StructField("dt", DecimalType(), True)]) + schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)]) df = self.spark.createDataFrame([(None,)], schema=schema) - f = pandas_udf(lambda x: x, DecimalType()) + f = pandas_udf(lambda x: x, MapType(StringType(), IntegerType())) with QuietTest(self.sc): with self.assertRaisesRegexp(Exception, 'Unsupported data type'): - df.select(f(col('dt'))).collect() + df.select(f(col('map'))).collect() def test_vectorized_udf_null_date(self): from pyspark.sql.functions import pandas_udf, col @@ -4012,7 +4034,8 @@ def test_wrong_args(self): def test_unsupported_types(self): from pyspark.sql.functions import pandas_udf, col, PandasUDFType schema = StructType( - [StructField("id", LongType(), True), StructField("dt", DecimalType(), True)]) + [StructField("id", LongType(), True), + StructField("map", MapType(StringType(), IntegerType()), True)]) df = self.spark.createDataFrame([(1, None,)], schema=schema) f = pandas_udf(lambda x: x, df.schema, PandasUDFType.GROUP_MAP) with QuietTest(self.sc): diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 063264a89379c..02b245713b6ab 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1617,7 +1617,7 @@ def to_arrow_type(dt): elif type(dt) == DoubleType: arrow_type = pa.float64() elif type(dt) == DecimalType: - arrow_type = pa.decimal(dt.precision, dt.scale) + arrow_type = pa.decimal128(dt.precision, dt.scale) elif type(dt) == StringType: arrow_type = pa.string() elif type(dt) == DateType: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala index 0258056d9de49..22b63513548fe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowWriter.scala @@ -53,6 +53,8 @@ object ArrowWriter { case (LongType, vector: BigIntVector) => new LongWriter(vector) case (FloatType, vector: Float4Vector) => new FloatWriter(vector) case (DoubleType, vector: Float8Vector) => new DoubleWriter(vector) + case (DecimalType.Fixed(precision, scale), vector: DecimalVector) => + new DecimalWriter(vector, precision, scale) case (StringType, vector: VarCharVector) => new StringWriter(vector) case (BinaryType, vector: VarBinaryVector) => new BinaryWriter(vector) case (DateType, vector: DateDayVector) => new DateWriter(vector) @@ -214,6 +216,25 @@ private[arrow] class DoubleWriter(val valueVector: Float8Vector) extends ArrowFi } } +private[arrow] class DecimalWriter( + val valueVector: DecimalVector, + precision: Int, + scale: Int) extends ArrowFieldWriter { + + override def setNull(): Unit = { + valueVector.setNull(count) + } + + override def setValue(input: SpecializedGetters, ordinal: Int): Unit = { + val decimal = input.getDecimal(ordinal, precision, scale) + if (decimal.changePrecision(precision, scale)) { + valueVector.setSafe(count, decimal.toJavaBigDecimal) + } else { + setNull() + } + } +} + private[arrow] class StringWriter(val valueVector: VarCharVector) extends ArrowFieldWriter { override def setNull(): Unit = { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala index fd5a3df6abc68..261df06100aef 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowConvertersSuite.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext -import org.apache.spark.sql.types.{BinaryType, IntegerType, StructField, StructType} +import org.apache.spark.sql.types.{BinaryType, Decimal, IntegerType, StructField, StructType} import org.apache.spark.util.Utils @@ -304,6 +304,70 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { collectAndValidate(df, json, "floating_point-double_precision.json") } + test("decimal conversion") { + val json = + s""" + |{ + | "schema" : { + | "fields" : [ { + | "name" : "a_d", + | "type" : { + | "name" : "decimal", + | "precision" : 38, + | "scale" : 18 + | }, + | "nullable" : true, + | "children" : [ ] + | }, { + | "name" : "b_d", + | "type" : { + | "name" : "decimal", + | "precision" : 38, + | "scale" : 18 + | }, + | "nullable" : true, + | "children" : [ ] + | } ] + | }, + | "batches" : [ { + | "count" : 7, + | "columns" : [ { + | "name" : "a_d", + | "count" : 7, + | "VALIDITY" : [ 1, 1, 1, 1, 1, 1, 1 ], + | "DATA" : [ + | "1000000000000000000", + | "2000000000000000000", + | "10000000000000000", + | "200000000000000000000", + | "100000000000000", + | "20000000000000000000000", + | "30000000000000000000" ] + | }, { + | "name" : "b_d", + | "count" : 7, + | "VALIDITY" : [ 1, 0, 0, 1, 0, 1, 0 ], + | "DATA" : [ + | "1100000000000000000", + | "0", + | "0", + | "2200000000000000000", + | "0", + | "3300000000000000000", + | "0" ] + | } ] + | } ] + |} + """.stripMargin + + val a_d = List(1.0, 2.0, 0.01, 200.0, 0.0001, 20000.0, 30.0).map(Decimal(_)) + val b_d = List(Some(Decimal(1.1)), None, None, Some(Decimal(2.2)), None, Some(Decimal(3.3)), + Some(Decimal("123456789012345678901234567890"))) + val df = a_d.zip(b_d).toDF("a_d", "b_d") + + collectAndValidate(df, json, "decimalData.json") + } + test("index conversion") { val data = List[Int](1, 2, 3, 4, 5, 6) val json = @@ -1153,7 +1217,6 @@ class ArrowConvertersSuite extends SharedSQLContext with BeforeAndAfterAll { assert(msg.getCause.getClass === classOf[UnsupportedOperationException]) } - runUnsupported { decimalData.toArrowPayload.collect() } runUnsupported { mapData.toDF().toArrowPayload.collect() } runUnsupported { complexData.toArrowPayload.collect() } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala index a71e30aa3ca96..508c116aae92e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala @@ -49,6 +49,7 @@ class ArrowWriterSuite extends SparkFunSuite { case LongType => reader.getLong(rowId) case FloatType => reader.getFloat(rowId) case DoubleType => reader.getDouble(rowId) + case DecimalType.Fixed(precision, scale) => reader.getDecimal(rowId, precision, scale) case StringType => reader.getUTF8String(rowId) case BinaryType => reader.getBinary(rowId) case DateType => reader.getInt(rowId) @@ -66,6 +67,7 @@ class ArrowWriterSuite extends SparkFunSuite { check(LongType, Seq(1L, 2L, null, 4L)) check(FloatType, Seq(1.0f, 2.0f, null, 4.0f)) check(DoubleType, Seq(1.0d, 2.0d, 
null, 4.0d)) + check(DecimalType.SYSTEM_DEFAULT, Seq(Decimal(1), Decimal(2), null, Decimal(4))) check(StringType, Seq("a", "b", null, "d").map(UTF8String.fromString)) check(BinaryType, Seq("a".getBytes(), "b".getBytes(), null, "d".getBytes())) check(DateType, Seq(0, 1, 2, null, 4)) From ff48b1b338241039a7189e7a3c04333b1256fdb3 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Tue, 26 Dec 2017 06:39:40 -0800 Subject: [PATCH 193/356] [SPARK-22901][PYTHON] Add deterministic flag to pyspark UDF ## What changes were proposed in this pull request? In SPARK-20586 the flag `deterministic` was added to Scala UDF, but it is not available for python UDF. This flag is useful for cases when the UDF's code can return different result with the same input. Due to optimization, duplicate invocations may be eliminated or the function may even be invoked more times than it is present in the query. This can lead to unexpected behavior. This PR adds the deterministic flag, via the `asNondeterministic` method, to let the user mark the function as non-deterministic and therefore avoid the optimizations which might lead to strange behaviors. ## How was this patch tested? Manual tests: ``` >>> from pyspark.sql.functions import * >>> from pyspark.sql.types import * >>> df_br = spark.createDataFrame([{'name': 'hello'}]) >>> import random >>> udf_random_col = udf(lambda: int(100*random.random()), IntegerType()).asNondeterministic() >>> df_br = df_br.withColumn('RAND', udf_random_col()) >>> random.seed(1234) >>> udf_add_ten = udf(lambda rand: rand + 10, IntegerType()) >>> df_br.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).show() +-----+----+-------------+ | name|RAND|RAND_PLUS_TEN| +-----+----+-------------+ |hello| 3| 13| +-----+----+-------------+ ``` Author: Marco Gaido Author: Marco Gaido Closes #19929 from mgaido91/SPARK-22629. --- .../org/apache/spark/api/python/PythonRunner.scala | 7 +++++++ python/pyspark/sql/functions.py | 11 ++++++++--- python/pyspark/sql/tests.py | 9 +++++++++ python/pyspark/sql/udf.py | 13 ++++++++++++- .../org/apache/spark/sql/UDFRegistration.scala | 5 +++-- .../spark/sql/execution/python/PythonUDF.scala | 5 ++++- .../python/UserDefinedPythonFunction.scala | 5 +++-- .../execution/python/BatchEvalPythonExecSuite.scala | 3 ++- 8 files changed, 48 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index 93d508c28ebba..1ec0e717fac29 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -39,6 +39,13 @@ private[spark] object PythonEvalType { val SQL_PANDAS_SCALAR_UDF = 200 val SQL_PANDAS_GROUP_MAP_UDF = 201 + + def toString(pythonEvalType: Int): String = pythonEvalType match { + case NON_UDF => "NON_UDF" + case SQL_BATCHED_UDF => "SQL_BATCHED_UDF" + case SQL_PANDAS_SCALAR_UDF => "SQL_PANDAS_SCALAR_UDF" + case SQL_PANDAS_GROUP_MAP_UDF => "SQL_PANDAS_GROUP_MAP_UDF" + } } /** diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index ddd8df3b15bf6..66ee033bab998 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2093,9 +2093,14 @@ class PandasUDFType(object): def udf(f=None, returnType=StringType()): """Creates a user defined function (UDF). - .. note:: The user-defined functions must be deterministic. 
Due to optimization, - duplicate invocations may be eliminated or the function may even be invoked more times than - it is present in the query. + .. note:: The user-defined functions are considered deterministic by default. Due to + optimization, duplicate invocations may be eliminated or the function may even be invoked + more times than it is present in the query. If your function is not deterministic, call + `asNondeterministic` on the user defined function. E.g.: + + >>> from pyspark.sql.types import IntegerType + >>> import random + >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic() .. note:: The user-defined functions do not support conditional expressions or short curcuiting in boolean expressions and it ends up with being executed all internally. If the functions diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index b811a0f8a31d3..3ef1522887325 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -435,6 +435,15 @@ def test_udf_with_array_type(self): self.assertEqual(list(range(3)), l1) self.assertEqual(1, l2) + def test_nondeterministic_udf(self): + from pyspark.sql.functions import udf + import random + udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() + df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND')) + udf_add_ten = udf(lambda rand: rand + 10, IntegerType()) + [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect() + self.assertEqual(row[0] + 10, row[1]) + def test_broadcast_in_udf(self): bar = {"a": "aa", "b": "bb", "c": "abc"} foo = self.sc.broadcast(bar) diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 123138117fdc3..54b5a8656e1c8 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -92,6 +92,7 @@ def __init__(self, func, func.__name__ if hasattr(func, '__name__') else func.__class__.__name__) self.evalType = evalType + self._deterministic = True @property def returnType(self): @@ -129,7 +130,7 @@ def _create_judf(self): wrapped_func = _wrap_function(sc, self.func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( - self._name, wrapped_func, jdt, self.evalType) + self._name, wrapped_func, jdt, self.evalType, self._deterministic) return judf def __call__(self, *cols): @@ -161,5 +162,15 @@ def wrapper(*args): wrapper.func = self.func wrapper.returnType = self.returnType wrapper.evalType = self.evalType + wrapper.asNondeterministic = self.asNondeterministic return wrapper + + def asNondeterministic(self): + """ + Updates UserDefinedFunction to nondeterministic. + + .. 
versionadded:: 2.3 + """ + self._deterministic = False + return self diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index 3ff476147b8b7..dc2468a721e41 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -23,6 +23,7 @@ import scala.reflect.runtime.universe.TypeTag import scala.util.Try import org.apache.spark.annotation.InterfaceStability +import org.apache.spark.api.python.PythonEvalType import org.apache.spark.internal.Logging import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.{JavaTypeInference, ScalaReflection} @@ -41,8 +42,6 @@ import org.apache.spark.util.Utils * spark.udf * }}} * - * @note The user-defined functions must be deterministic. - * * @since 1.3.0 */ @InterfaceStability.Stable @@ -58,6 +57,8 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends | pythonIncludes: ${udf.func.pythonIncludes} | pythonExec: ${udf.func.pythonExec} | dataType: ${udf.dataType} + | pythonEvalType: ${PythonEvalType.toString(udf.pythonEvalType)} + | udfDeterministic: ${udf.udfDeterministic} """.stripMargin) functionRegistry.createOrReplaceTempFunction(name, udf.builder) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDF.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDF.scala index ef27fbc2db7d9..d3f743d9eb61e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDF.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/PythonUDF.scala @@ -29,9 +29,12 @@ case class PythonUDF( func: PythonFunction, dataType: DataType, children: Seq[Expression], - evalType: Int) + evalType: Int, + udfDeterministic: Boolean) extends Expression with Unevaluable with NonSQLExpression with UserDefinedExpression { + override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic) + override def toString: String = s"$name(${children.mkString(", ")})" override def nullable: Boolean = true diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala index 348e49e473ed3..50dca32cb7861 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/UserDefinedPythonFunction.scala @@ -29,10 +29,11 @@ case class UserDefinedPythonFunction( name: String, func: PythonFunction, dataType: DataType, - pythonEvalType: Int) { + pythonEvalType: Int, + udfDeterministic: Boolean) { def builder(e: Seq[Expression]): PythonUDF = { - PythonUDF(name, func, dataType, e, pythonEvalType) + PythonUDF(name, func, dataType, e, pythonEvalType, udfDeterministic) } /** Returns a [[Column]] that will evaluate to calling this UDF with the given input. 
*/ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala index 53d3f34567518..9e4a2e8776956 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala @@ -109,4 +109,5 @@ class MyDummyPythonUDF extends UserDefinedPythonFunction( name = "dummyUDF", func = new DummyUDF, dataType = BooleanType, - pythonEvalType = PythonEvalType.SQL_BATCHED_UDF) + pythonEvalType = PythonEvalType.SQL_BATCHED_UDF, + udfDeterministic = true) From 9348e684208465a8f75c893bdeaa30fc42c0cb5f Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 26 Dec 2017 09:37:39 -0800 Subject: [PATCH 194/356] [SPARK-22833][EXAMPLE] Improvement SparkHive Scala Examples ## What changes were proposed in this pull request? Some improvements: 1. Point out we are using both Spark SQ native syntax and HQL syntax in the example 2. Avoid using the same table name with temp view, to not confuse users. 3. Create the external hive table with a directory that already has data, which is a more common use case. 4. Remove the usage of `spark.sql.parquet.writeLegacyFormat`. This config was introduced by https://github.com/apache/spark/pull/8566 and has nothing to do with Hive. 5. Remove `repartition` and `coalesce` example. These 2 are not Hive specific, we should put them in a different example file. BTW they can't accurately control the number of output files, `spark.sql.files.maxRecordsPerFile` also controls it. ## How was this patch tested? N/A Author: Wenchen Fan Closes #20081 from cloud-fan/minor. --- .../examples/sql/hive/SparkHiveExample.scala | 75 +++++++++++-------- .../apache/spark/sql/internal/SQLConf.scala | 4 +- 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala index b193bd595127c..70fb5b27366ec 100644 --- a/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala @@ -102,40 +102,53 @@ object SparkHiveExample { // | 5| val_5| 5| val_5| // ... - // Create Hive managed table with Parquet - sql("CREATE TABLE records(key int, value string) STORED AS PARQUET") - // Save DataFrame to Hive managed table as Parquet format - val hiveTableDF = sql("SELECT * FROM records") - hiveTableDF.write.mode(SaveMode.Overwrite).saveAsTable("database_name.records") - // Create External Hive table with Parquet - sql("CREATE EXTERNAL TABLE records(key int, value string) " + - "STORED AS PARQUET LOCATION '/user/hive/warehouse/'") - // to make Hive Parquet format compatible with Spark Parquet format - spark.sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", "true") - - // Multiple Parquet files could be created accordingly to volume of data under directory given. 
- val hiveExternalTableLocation = "/user/hive/warehouse/database_name.db/records" - - // Save DataFrame to Hive External table as compatible Parquet format - hiveTableDF.write.mode(SaveMode.Overwrite).parquet(hiveExternalTableLocation) - - // Turn on flag for Dynamic Partitioning - spark.sqlContext.setConf("hive.exec.dynamic.partition", "true") - spark.sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict") - - // You can create partitions in Hive table, so downstream queries run much faster. - hiveTableDF.write.mode(SaveMode.Overwrite).partitionBy("key") - .parquet(hiveExternalTableLocation) + // Create a Hive managed Parquet table, with HQL syntax instead of the Spark SQL native syntax + // `USING hive` + sql("CREATE TABLE hive_records(key int, value string) STORED AS PARQUET") + // Save DataFrame to the Hive managed table + val df = spark.table("src") + df.write.mode(SaveMode.Overwrite).saveAsTable("hive_records") + // After insertion, the Hive managed table has data now + sql("SELECT * FROM hive_records").show() + // +---+-------+ + // |key| value| + // +---+-------+ + // |238|val_238| + // | 86| val_86| + // |311|val_311| + // ... - // Reduce number of files for each partition by repartition - hiveTableDF.repartition($"key").write.mode(SaveMode.Overwrite) - .partitionBy("key").parquet(hiveExternalTableLocation) + // Prepare a Parquet data directory + val dataDir = "/tmp/parquet_data" + spark.range(10).write.parquet(dataDir) + // Create a Hive external Parquet table + sql(s"CREATE EXTERNAL TABLE hive_ints(key int) STORED AS PARQUET LOCATION '$dataDir'") + // The Hive external table should already have data + sql("SELECT * FROM hive_ints").show() + // +---+ + // |key| + // +---+ + // | 0| + // | 1| + // | 2| + // ... - // Control the number of files in each partition by coalesce - hiveTableDF.coalesce(10).write.mode(SaveMode.Overwrite) - .partitionBy("key").parquet(hiveExternalTableLocation) - // $example off:spark_hive$ + // Turn on flag for Hive Dynamic Partitioning + spark.sqlContext.setConf("hive.exec.dynamic.partition", "true") + spark.sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict") + // Create a Hive partitioned table using DataFrame API + df.write.partitionBy("key").format("hive").saveAsTable("hive_part_tbl") + // Partitioned column `key` will be moved to the end of the schema. + sql("SELECT * FROM hive_part_tbl").show() + // +-------+---+ + // | value|key| + // +-------+---+ + // |val_238|238| + // | val_86| 86| + // |val_311|311| + // ... 
spark.stop() + // $example off:spark_hive$ } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 84fe4bb711a4e..f16972e5427e2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -336,8 +336,8 @@ object SQLConf { .createWithDefault(true) val PARQUET_WRITE_LEGACY_FORMAT = buildConf("spark.sql.parquet.writeLegacyFormat") - .doc("Whether to follow Parquet's format specification when converting Parquet schema to " + - "Spark SQL schema and vice versa.") + .doc("Whether to be compatible with the legacy Parquet format adopted by Spark 1.4 and prior " + + "versions, when converting Parquet schema to Spark SQL schema and vice versa.") .booleanConf .createWithDefault(false) From 91d1b300d467cc91948f71e87b7afe1b9027e60f Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 26 Dec 2017 09:40:41 -0800 Subject: [PATCH 195/356] [SPARK-22894][SQL] DateTimeOperations should accept SQL like string type ## What changes were proposed in this pull request? `DateTimeOperations` accept [`StringType`](https://github.com/apache/spark/blob/ae998ec2b5548b7028d741da4813473dde1ad81e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala#L669), but: ``` spark-sql> SELECT '2017-12-24' + interval 2 months 2 seconds; Error in query: cannot resolve '(CAST('2017-12-24' AS DOUBLE) + interval 2 months 2 seconds)' due to data type mismatch: differing types in '(CAST('2017-12-24' AS DOUBLE) + interval 2 months 2 seconds)' (double and calendarinterval).; line 1 pos 7; 'Project [unresolvedalias((cast(2017-12-24 as double) + interval 2 months 2 seconds), None)] +- OneRowRelation spark-sql> ``` After this PR: ``` spark-sql> SELECT '2017-12-24' + interval 2 months 2 seconds; 2018-02-24 00:00:02 Time taken: 0.2 seconds, Fetched 1 row(s) ``` ## How was this patch tested? unit tests Author: Yuming Wang Closes #20067 from wangyum/SPARK-22894. --- .../spark/sql/catalyst/analysis/TypeCoercion.scala | 6 ++++-- .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 13 ++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index 2f306f58b7b80..1c4be547739dd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -324,9 +324,11 @@ object TypeCoercion { // Skip nodes who's children have not been resolved yet. 
case e if !e.childrenResolved => e - case a @ BinaryArithmetic(left @ StringType(), right) => + case a @ BinaryArithmetic(left @ StringType(), right) + if right.dataType != CalendarIntervalType => a.makeCopy(Array(Cast(left, DoubleType), right)) - case a @ BinaryArithmetic(left, right @ StringType()) => + case a @ BinaryArithmetic(left, right @ StringType()) + if left.dataType != CalendarIntervalType => a.makeCopy(Array(left, Cast(right, DoubleType))) // For equality between string and timestamp we cast the string to a timestamp diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5e077285ade55..1972dec7b86ce 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import java.io.File import java.math.MathContext import java.net.{MalformedURLException, URL} -import java.sql.Timestamp +import java.sql.{Date, Timestamp} import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark.{AccumulatorSuite, SparkException} @@ -2760,6 +2760,17 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } + test("SPARK-22894: DateTimeOperations should accept SQL like string type") { + val date = "2017-12-24" + val str = sql(s"SELECT CAST('$date' as STRING) + interval 2 months 2 seconds") + val dt = sql(s"SELECT CAST('$date' as DATE) + interval 2 months 2 seconds") + val ts = sql(s"SELECT CAST('$date' as TIMESTAMP) + interval 2 months 2 seconds") + + checkAnswer(str, Row("2018-02-24 00:00:02") :: Nil) + checkAnswer(dt, Row(Date.valueOf("2018-02-24")) :: Nil) + checkAnswer(ts, Row(Timestamp.valueOf("2018-02-24 00:00:02")) :: Nil) + } + // Only New OrcFileFormat supports this Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName, "parquet").foreach { format => From 6674acd1edc8c657049113df95af3378b5db8806 Mon Sep 17 00:00:00 2001 From: "xu.wenchun" Date: Wed, 27 Dec 2017 10:08:32 +0800 Subject: [PATCH 196/356] [SPARK-22846][SQL] Fix table owner is null when creating table through spark sql or thriftserver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? fix table owner is null when create new table through spark sql ## How was this patch tested? manual test. 1、first create a table 2、then select the table properties from mysql which connected to hive metastore Please review http://spark.apache.org/contributing.html before opening a pull request. Author: xu.wenchun Closes #20034 from BruceXu1991/SPARK-22846. --- .../scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 7233944dc96dd..7b7f4e0f10210 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -186,7 +186,7 @@ private[hive] class HiveClientImpl( /** Returns the configuration for the current session. 
*/ def conf: HiveConf = state.getConf - private val userName = state.getAuthenticator.getUserName + private val userName = conf.getUser override def getConf(key: String, defaultValue: String): String = { conf.get(key, defaultValue) From b8bfce51abf28c66ba1fc67b0f25fe1617c81025 Mon Sep 17 00:00:00 2001 From: Takuya UESHIN Date: Wed, 27 Dec 2017 20:51:26 +0900 Subject: [PATCH 197/356] [SPARK-22324][SQL][PYTHON][FOLLOW-UP] Update setup.py file. ## What changes were proposed in this pull request? This is a follow-up pr of #19884 updating setup.py file to add pyarrow dependency. ## How was this patch tested? Existing tests. Author: Takuya UESHIN Closes #20089 from ueshin/issues/SPARK-22324/fup1. --- python/README.md | 2 +- python/setup.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/README.md b/python/README.md index 84ec88141cb00..3f17fdb98a081 100644 --- a/python/README.md +++ b/python/README.md @@ -29,4 +29,4 @@ The Python packaging for Spark is not intended to replace all of the other use c ## Python Requirements -At its core PySpark depends on Py4J (currently version 0.10.6), but additional sub-packages have their own requirements (including numpy and pandas). +At its core PySpark depends on Py4J (currently version 0.10.6), but some additional sub-packages have their own extra requirements for some features (including numpy, pandas, and pyarrow). diff --git a/python/setup.py b/python/setup.py index 310670e697a83..251d4526d4dd0 100644 --- a/python/setup.py +++ b/python/setup.py @@ -201,7 +201,7 @@ def _supports_symlinks(): extras_require={ 'ml': ['numpy>=1.7'], 'mllib': ['numpy>=1.7'], - 'sql': ['pandas>=0.19.2'] + 'sql': ['pandas>=0.19.2', 'pyarrow>=0.8.0'] }, classifiers=[ 'Development Status :: 5 - Production/Stable', @@ -210,6 +210,7 @@ def _supports_symlinks(): 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy'] ) From 774715d5c73ab6d410208fa1675cd11166e03165 Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Wed, 27 Dec 2017 23:53:10 +0800 Subject: [PATCH 198/356] [SPARK-22904][SQL] Add tests for decimal operations and string casts ## What changes were proposed in this pull request? Test coverage for arithmetic operations leading to: 1. Precision loss 2. Overflow Moreover, tests for casting bad string to other input types and for using bad string as operators of some functions. ## How was this patch tested? added tests Author: Marco Gaido Closes #20084 from mgaido91/SPARK-22904. 
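For a quick sense of the behavior these tests pin down: decimal arithmetic that overflows or loses precision, and casts of unparsable strings, currently evaluate to NULL rather than raising an error. An illustrative excerpt (queries adapted from the added input files, using literals instead of the temporary view; expected results as recorded in the generated .sql.out files below):

```
-- Decimal overflow and precision loss (decimalArithmeticOperations.sql)
select (5e36 + 0.1) + 5e36;                                   -- NULL
select 123456789123456789.1234567890 * 1.123456789123456789;  -- NULL

-- Unparsable string input (stringCastAndExpressions.sql)
select cast('aa' as int);       -- NULL
select to_timestamp('aa');      -- NULL
```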
--- .../native/decimalArithmeticOperations.sql | 33 +++ .../native/stringCastAndExpressions.sql | 57 ++++ .../decimalArithmeticOperations.sql.out | 82 ++++++ .../native/stringCastAndExpressions.sql.out | 261 ++++++++++++++++++ 4 files changed, 433 insertions(+) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/stringCastAndExpressions.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql new file mode 100644 index 0000000000000..c8e108ac2c45e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/decimalArithmeticOperations.sql @@ -0,0 +1,33 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +CREATE TEMPORARY VIEW t AS SELECT 1.0 as a, 0.0 as b; + +-- division, remainder and pmod by 0 return NULL +select a / b from t; +select a % b from t; +select pmod(a, b) from t; + +-- arithmetic operations causing an overflow return NULL +select (5e36 + 0.1) + 5e36; +select (-4e36 - 0.1) - 7e36; +select 12345678901234567890.0 * 12345678901234567890.0; +select 1e35 / 0.1; + +-- arithmetic operations causing a precision loss return NULL +select 123456789123456789.1234567890 * 1.123456789123456789; +select 0.001 / 9876543210987654321098765432109876543.2 diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/stringCastAndExpressions.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/stringCastAndExpressions.sql new file mode 100644 index 0000000000000..f17adb56dee91 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/stringCastAndExpressions.sql @@ -0,0 +1,57 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. 
You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +CREATE TEMPORARY VIEW t AS SELECT 'aa' as a; + +-- casting to data types which are unable to represent the string input returns NULL +select cast(a as byte) from t; +select cast(a as short) from t; +select cast(a as int) from t; +select cast(a as long) from t; +select cast(a as float) from t; +select cast(a as double) from t; +select cast(a as decimal) from t; +select cast(a as boolean) from t; +select cast(a as timestamp) from t; +select cast(a as date) from t; +-- casting to binary works correctly +select cast(a as binary) from t; +-- casting to array, struct or map throws exception +select cast(a as array) from t; +select cast(a as struct) from t; +select cast(a as map) from t; + +-- all timestamp/date expressions return NULL if bad input strings are provided +select to_timestamp(a) from t; +select to_timestamp('2018-01-01', a) from t; +select to_unix_timestamp(a) from t; +select to_unix_timestamp('2018-01-01', a) from t; +select unix_timestamp(a) from t; +select unix_timestamp('2018-01-01', a) from t; +select from_unixtime(a) from t; +select from_unixtime('2018-01-01', a) from t; +select next_day(a, 'MO') from t; +select next_day('2018-01-01', a) from t; +select trunc(a, 'MM') from t; +select trunc('2018-01-01', a) from t; + +-- some functions return NULL if bad input is provided +select unhex('-123'); +select sha2(a, a) from t; +select get_json_object(a, a) from t; +select json_tuple(a, a) from t; +select from_json(a, 'a INT') from t; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out new file mode 100644 index 0000000000000..ce02f6adc456c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/decimalArithmeticOperations.sql.out @@ -0,0 +1,82 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 10 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1.0 as a, 0.0 as b +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +select a / b from t +-- !query 1 schema +struct<(CAST(a AS DECIMAL(2,1)) / CAST(b AS DECIMAL(2,1))):decimal(8,6)> +-- !query 1 output +NULL + + +-- !query 2 +select a % b from t +-- !query 2 schema +struct<(CAST(a AS DECIMAL(2,1)) % CAST(b AS DECIMAL(2,1))):decimal(1,1)> +-- !query 2 output +NULL + + +-- !query 3 +select pmod(a, b) from t +-- !query 3 schema +struct +-- !query 3 output +NULL + + +-- !query 4 +select (5e36 + 0.1) + 5e36 +-- !query 4 schema +struct<(CAST((CAST(5E+36 AS DECIMAL(38,1)) + CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) + CAST(5E+36 AS DECIMAL(38,1))):decimal(38,1)> +-- !query 4 output +NULL + + +-- !query 5 +select (-4e36 - 0.1) - 7e36 +-- !query 5 schema +struct<(CAST((CAST(-4E+36 AS DECIMAL(38,1)) - CAST(0.1 AS DECIMAL(38,1))) AS DECIMAL(38,1)) - CAST(7E+36 AS DECIMAL(38,1))):decimal(38,1)> +-- !query 5 output +NULL + + +-- !query 6 +select 12345678901234567890.0 * 12345678901234567890.0 +-- !query 6 schema +struct<(12345678901234567890.0 * 12345678901234567890.0):decimal(38,2)> +-- 
!query 6 output +NULL + + +-- !query 7 +select 1e35 / 0.1 +-- !query 7 schema +struct<(CAST(1E+35 AS DECIMAL(37,1)) / CAST(0.1 AS DECIMAL(37,1))):decimal(38,3)> +-- !query 7 output +NULL + + +-- !query 8 +select 123456789123456789.1234567890 * 1.123456789123456789 +-- !query 8 schema +struct<(CAST(123456789123456789.1234567890 AS DECIMAL(36,18)) * CAST(1.123456789123456789 AS DECIMAL(36,18))):decimal(38,28)> +-- !query 8 output +NULL + + +-- !query 9 +select 0.001 / 9876543210987654321098765432109876543.2 +-- !query 9 schema +struct<(CAST(0.001 AS DECIMAL(38,3)) / CAST(9876543210987654321098765432109876543.2 AS DECIMAL(38,3))):decimal(38,37)> +-- !query 9 output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out new file mode 100644 index 0000000000000..8ed2820244412 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/stringCastAndExpressions.sql.out @@ -0,0 +1,261 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 32 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 'aa' as a +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +select cast(a as byte) from t +-- !query 1 schema +struct +-- !query 1 output +NULL + + +-- !query 2 +select cast(a as short) from t +-- !query 2 schema +struct +-- !query 2 output +NULL + + +-- !query 3 +select cast(a as int) from t +-- !query 3 schema +struct +-- !query 3 output +NULL + + +-- !query 4 +select cast(a as long) from t +-- !query 4 schema +struct +-- !query 4 output +NULL + + +-- !query 5 +select cast(a as float) from t +-- !query 5 schema +struct +-- !query 5 output +NULL + + +-- !query 6 +select cast(a as double) from t +-- !query 6 schema +struct +-- !query 6 output +NULL + + +-- !query 7 +select cast(a as decimal) from t +-- !query 7 schema +struct +-- !query 7 output +NULL + + +-- !query 8 +select cast(a as boolean) from t +-- !query 8 schema +struct +-- !query 8 output +NULL + + +-- !query 9 +select cast(a as timestamp) from t +-- !query 9 schema +struct +-- !query 9 output +NULL + + +-- !query 10 +select cast(a as date) from t +-- !query 10 schema +struct +-- !query 10 output +NULL + + +-- !query 11 +select cast(a as binary) from t +-- !query 11 schema +struct +-- !query 11 output +aa + + +-- !query 12 +select cast(a as array) from t +-- !query 12 schema +struct<> +-- !query 12 output +org.apache.spark.sql.AnalysisException +cannot resolve 't.`a`' due to data type mismatch: cannot cast string to array; line 1 pos 7 + + +-- !query 13 +select cast(a as struct) from t +-- !query 13 schema +struct<> +-- !query 13 output +org.apache.spark.sql.AnalysisException +cannot resolve 't.`a`' due to data type mismatch: cannot cast string to struct; line 1 pos 7 + + +-- !query 14 +select cast(a as map) from t +-- !query 14 schema +struct<> +-- !query 14 output +org.apache.spark.sql.AnalysisException +cannot resolve 't.`a`' due to data type mismatch: cannot cast string to map; line 1 pos 7 + + +-- !query 15 +select to_timestamp(a) from t +-- !query 15 schema +struct +-- !query 15 output +NULL + + +-- !query 16 +select to_timestamp('2018-01-01', a) from t +-- !query 16 schema +struct +-- !query 16 output +NULL + + +-- !query 17 +select to_unix_timestamp(a) from t +-- !query 17 schema +struct +-- !query 17 output +NULL + + +-- !query 18 +select to_unix_timestamp('2018-01-01', a) from t +-- !query 18 schema +struct +-- 
!query 18 output +NULL + + +-- !query 19 +select unix_timestamp(a) from t +-- !query 19 schema +struct +-- !query 19 output +NULL + + +-- !query 20 +select unix_timestamp('2018-01-01', a) from t +-- !query 20 schema +struct +-- !query 20 output +NULL + + +-- !query 21 +select from_unixtime(a) from t +-- !query 21 schema +struct +-- !query 21 output +NULL + + +-- !query 22 +select from_unixtime('2018-01-01', a) from t +-- !query 22 schema +struct +-- !query 22 output +NULL + + +-- !query 23 +select next_day(a, 'MO') from t +-- !query 23 schema +struct +-- !query 23 output +NULL + + +-- !query 24 +select next_day('2018-01-01', a) from t +-- !query 24 schema +struct +-- !query 24 output +NULL + + +-- !query 25 +select trunc(a, 'MM') from t +-- !query 25 schema +struct +-- !query 25 output +NULL + + +-- !query 26 +select trunc('2018-01-01', a) from t +-- !query 26 schema +struct +-- !query 26 output +NULL + + +-- !query 27 +select unhex('-123') +-- !query 27 schema +struct +-- !query 27 output +NULL + + +-- !query 28 +select sha2(a, a) from t +-- !query 28 schema +struct +-- !query 28 output +NULL + + +-- !query 29 +select get_json_object(a, a) from t +-- !query 29 schema +struct +-- !query 29 output +NULL + + +-- !query 30 +select json_tuple(a, a) from t +-- !query 30 schema +struct +-- !query 30 output +NULL + + +-- !query 31 +select from_json(a, 'a INT') from t +-- !query 31 schema +struct> +-- !query 31 output +NULL From 753793bc84df805e519cf59f6804ab26bd02d76e Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Wed, 27 Dec 2017 17:31:12 -0800 Subject: [PATCH 199/356] [SPARK-22899][ML][STREAMING] Fix OneVsRestModel transform on streaming data failed. ## What changes were proposed in this pull request? Fix OneVsRestModel transform on streaming data failed. ## How was this patch tested? UT will be added soon, once #19979 merged. (Need a helper test method there) Author: WeichenXu Closes #20077 from WeichenXu123/fix_ovs_model_transform. --- .../scala/org/apache/spark/ml/classification/OneVsRest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index 3ab99b35ece2b..f04fde2cbbca1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -165,7 +165,7 @@ final class OneVsRestModel private[ml] ( val newDataset = dataset.withColumn(accColName, initUDF()) // persist if underlying dataset is not persistent. - val handlePersistence = dataset.storageLevel == StorageLevel.NONE + val handlePersistence = !dataset.isStreaming && dataset.storageLevel == StorageLevel.NONE if (handlePersistence) { newDataset.persist(StorageLevel.MEMORY_AND_DISK) } From 5683984520cfe9e9acf49e47a84a56af155a8ad2 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Thu, 28 Dec 2017 12:28:19 +0800 Subject: [PATCH 200/356] [SPARK-18016][SQL][FOLLOW-UP] Code Generation: Constant Pool Limit - reduce entries for mutable state ## What changes were proposed in this pull request? This PR addresses additional review comments in #19811 ## How was this patch tested? Existing test suites Author: Kazuaki Ishizaki Closes #20036 from kiszk/SPARK-18066-followup. 
--- .../expressions/codegen/CodeGenerator.scala | 4 +- .../expressions/regexpExpressions.scala | 62 +++++++++---------- .../apache/spark/sql/execution/SortExec.scala | 2 +- .../sql/execution/WholeStageCodegenExec.scala | 2 +- .../aggregate/HashAggregateExec.scala | 16 +++-- .../execution/basicPhysicalOperators.scala | 4 +- .../joins/BroadcastHashJoinExec.scala | 2 +- .../execution/joins/SortMergeJoinExec.scala | 7 ++- 8 files changed, 51 insertions(+), 48 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index d6eccadcfb63e..2c714c228e6c9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -190,7 +190,7 @@ class CodegenContext { /** * Returns the reference of next available slot in current compacted array. The size of each - * compacted array is controlled by the config `CodeGenerator.MUTABLESTATEARRAY_SIZE_LIMIT`. + * compacted array is controlled by the constant `CodeGenerator.MUTABLESTATEARRAY_SIZE_LIMIT`. * Once reaching the threshold, new compacted array is created. */ def getNextSlot(): String = { @@ -352,7 +352,7 @@ class CodegenContext { def initMutableStates(): String = { // It's possible that we add same mutable state twice, e.g. the `mergeExpressions` in // `TypedAggregateExpression`, we should call `distinct` here to remove the duplicated ones. - val initCodes = mutableStateInitCode.distinct + val initCodes = mutableStateInitCode.distinct.map(_ + "\n") // The generated initialization code may exceed 64kb function size limit in JVM if there are too // many mutable states, so split it into multiple functions. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index fa5425c77ebba..f3e8f6de58975 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -118,9 +118,8 @@ case class Like(left: Expression, right: Expression) extends StringRegexExpressi if (rVal != null) { val regexStr = StringEscapeUtils.escapeJava(escape(rVal.asInstanceOf[UTF8String].toString())) - // inline mutable state since not many Like operations in a task val pattern = ctx.addMutableState(patternClass, "patternLike", - v => s"""$v = ${patternClass}.compile("$regexStr");""", forceInline = true) + v => s"""$v = $patternClass.compile("$regexStr");""") // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. 
val eval = left.genCode(ctx) @@ -143,9 +142,9 @@ case class Like(left: Expression, right: Expression) extends StringRegexExpressi val rightStr = ctx.freshName("rightStr") nullSafeCodeGen(ctx, ev, (eval1, eval2) => { s""" - String $rightStr = ${eval2}.toString(); - ${patternClass} $pattern = ${patternClass}.compile($escapeFunc($rightStr)); - ${ev.value} = $pattern.matcher(${eval1}.toString()).matches(); + String $rightStr = $eval2.toString(); + $patternClass $pattern = $patternClass.compile($escapeFunc($rightStr)); + ${ev.value} = $pattern.matcher($eval1.toString()).matches(); """ }) } @@ -194,9 +193,8 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress if (rVal != null) { val regexStr = StringEscapeUtils.escapeJava(rVal.asInstanceOf[UTF8String].toString()) - // inline mutable state since not many RLike operations in a task val pattern = ctx.addMutableState(patternClass, "patternRLike", - v => s"""$v = ${patternClass}.compile("$regexStr");""", forceInline = true) + v => s"""$v = $patternClass.compile("$regexStr");""") // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. val eval = left.genCode(ctx) @@ -219,9 +217,9 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress val pattern = ctx.freshName("pattern") nullSafeCodeGen(ctx, ev, (eval1, eval2) => { s""" - String $rightStr = ${eval2}.toString(); - ${patternClass} $pattern = ${patternClass}.compile($rightStr); - ${ev.value} = $pattern.matcher(${eval1}.toString()).find(0); + String $rightStr = $eval2.toString(); + $patternClass $pattern = $patternClass.compile($rightStr); + ${ev.value} = $pattern.matcher($eval1.toString()).find(0); """ }) } @@ -338,25 +336,25 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio nullSafeCodeGen(ctx, ev, (subject, regexp, rep) => { s""" - if (!$regexp.equals(${termLastRegex})) { + if (!$regexp.equals($termLastRegex)) { // regex value changed - ${termLastRegex} = $regexp.clone(); - ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString()); + $termLastRegex = $regexp.clone(); + $termPattern = $classNamePattern.compile($termLastRegex.toString()); } - if (!$rep.equals(${termLastReplacementInUTF8})) { + if (!$rep.equals($termLastReplacementInUTF8)) { // replacement string changed - ${termLastReplacementInUTF8} = $rep.clone(); - ${termLastReplacement} = ${termLastReplacementInUTF8}.toString(); + $termLastReplacementInUTF8 = $rep.clone(); + $termLastReplacement = $termLastReplacementInUTF8.toString(); } - $classNameStringBuffer ${termResult} = new $classNameStringBuffer(); - java.util.regex.Matcher ${matcher} = ${termPattern}.matcher($subject.toString()); + $classNameStringBuffer $termResult = new $classNameStringBuffer(); + java.util.regex.Matcher $matcher = $termPattern.matcher($subject.toString()); - while (${matcher}.find()) { - ${matcher}.appendReplacement(${termResult}, ${termLastReplacement}); + while ($matcher.find()) { + $matcher.appendReplacement($termResult, $termLastReplacement); } - ${matcher}.appendTail(${termResult}); - ${ev.value} = UTF8String.fromString(${termResult}.toString()); - ${termResult} = null; + $matcher.appendTail($termResult); + ${ev.value} = UTF8String.fromString($termResult.toString()); + $termResult = null; $setEvNotNull """ }) @@ -425,19 +423,19 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => { s""" - if (!$regexp.equals(${termLastRegex})) { + if 
(!$regexp.equals($termLastRegex)) { // regex value changed - ${termLastRegex} = $regexp.clone(); - ${termPattern} = ${classNamePattern}.compile(${termLastRegex}.toString()); + $termLastRegex = $regexp.clone(); + $termPattern = $classNamePattern.compile($termLastRegex.toString()); } - java.util.regex.Matcher ${matcher} = - ${termPattern}.matcher($subject.toString()); - if (${matcher}.find()) { - java.util.regex.MatchResult ${matchResult} = ${matcher}.toMatchResult(); - if (${matchResult}.group($idx) == null) { + java.util.regex.Matcher $matcher = + $termPattern.matcher($subject.toString()); + if ($matcher.find()) { + java.util.regex.MatchResult $matchResult = $matcher.toMatchResult(); + if ($matchResult.group($idx) == null) { ${ev.value} = UTF8String.EMPTY_UTF8; } else { - ${ev.value} = UTF8String.fromString(${matchResult}.group($idx)); + ${ev.value} = UTF8String.fromString($matchResult.group($idx)); } $setEvNotNull } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala index daff3c49e7517..ef1bb1c2a4468 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SortExec.scala @@ -138,7 +138,7 @@ case class SortExec( // Initialize the class member variables. This includes the instance of the Sorter and // the iterator to return sorted rows. val thisPlan = ctx.addReferenceObj("plan", this) - // inline mutable state since not many Sort operations in a task + // Inline mutable state since not many Sort operations in a task sorterVariable = ctx.addMutableState(classOf[UnsafeExternalRowSorter].getName, "sorter", v => s"$v = $thisPlan.createSorter();", forceInline = true) val metrics = ctx.addMutableState(classOf[TaskMetrics].getName, "metrics", diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala index 9e7008d1e0c31..065954559e487 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegenExec.scala @@ -283,7 +283,7 @@ case class InputAdapter(child: SparkPlan) extends UnaryExecNode with CodegenSupp override def doProduce(ctx: CodegenContext): String = { // Right now, InputAdapter is only used when there is one input RDD. 
- // inline mutable state since an inputAdaptor in a task + // Inline mutable state since an InputAdapter is used once in a task for WholeStageCodegen val input = ctx.addMutableState("scala.collection.Iterator", "input", v => s"$v = inputs[0];", forceInline = true) val row = ctx.freshName("row") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index b1af360d85095..9a6f1c6dfa6a9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -587,31 +587,35 @@ case class HashAggregateExec( fastHashMapClassName, groupingKeySchema, bufferSchema).generate() ctx.addInnerClass(generatedMap) + // Inline mutable state since not many aggregation operations in a task fastHashMapTerm = ctx.addMutableState(fastHashMapClassName, "vectorizedHastHashMap", - v => s"$v = new $fastHashMapClassName();") - ctx.addMutableState(s"java.util.Iterator", "vectorizedFastHashMapIter") + v => s"$v = new $fastHashMapClassName();", forceInline = true) + ctx.addMutableState(s"java.util.Iterator", "vectorizedFastHashMapIter", + forceInline = true) } else { val generatedMap = new RowBasedHashMapGenerator(ctx, aggregateExpressions, fastHashMapClassName, groupingKeySchema, bufferSchema).generate() ctx.addInnerClass(generatedMap) + // Inline mutable state since not many aggregation operations in a task fastHashMapTerm = ctx.addMutableState(fastHashMapClassName, "fastHashMap", v => s"$v = new $fastHashMapClassName(" + - s"$thisPlan.getTaskMemoryManager(), $thisPlan.getEmptyAggregationBuffer());") + s"$thisPlan.getTaskMemoryManager(), $thisPlan.getEmptyAggregationBuffer());", + forceInline = true) ctx.addMutableState( "org.apache.spark.unsafe.KVIterator", - "fastHashMapIter") + "fastHashMapIter", forceInline = true) } } // Create a name for the iterator from the regular hash map. 
- // inline mutable state since not many aggregation operations in a task + // Inline mutable state since not many aggregation operations in a task val iterTerm = ctx.addMutableState(classOf[KVIterator[UnsafeRow, UnsafeRow]].getName, "mapIter", forceInline = true) // create hashMap val hashMapClassName = classOf[UnsafeFixedWidthAggregationMap].getName hashMapTerm = ctx.addMutableState(hashMapClassName, "hashMap", - v => s"$v = $thisPlan.createHashMap();") + v => s"$v = $thisPlan.createHashMap();", forceInline = true) sorterTerm = ctx.addMutableState(classOf[UnsafeKVExternalSorter].getName, "sorter", forceInline = true) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala index 78137d3f97cfc..a15a8d11aa2a0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicPhysicalOperators.scala @@ -284,7 +284,7 @@ case class SampleExec( val samplerClass = classOf[PoissonSampler[UnsafeRow]].getName val initSampler = ctx.freshName("initSampler") - // inline mutable state since not many Sample operations in a task + // Inline mutable state since not many Sample operations in a task val sampler = ctx.addMutableState(s"$samplerClass", "sampleReplace", v => { val initSamplerFuncName = ctx.addNewFunction(initSampler, @@ -371,7 +371,7 @@ case class RangeExec(range: org.apache.spark.sql.catalyst.plans.logical.Range) val ev = ExprCode("", "false", value) val BigInt = classOf[java.math.BigInteger].getName - // inline mutable state since not many Range operations in a task + // Inline mutable state since not many Range operations in a task val taskContext = ctx.addMutableState("TaskContext", "taskContext", v => s"$v = TaskContext.get();", forceInline = true) val inputMetrics = ctx.addMutableState("InputMetrics", "inputMetrics", diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala index ee763e23415cf..1918fcc5482db 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoinExec.scala @@ -139,7 +139,7 @@ case class BroadcastHashJoinExec( // At the end of the task, we update the avg hash probe. val avgHashProbe = metricTerm(ctx, "avgHashProbe") - // inline mutable state since not many join operations in a task + // Inline mutable state since not many join operations in a task val relationTerm = ctx.addMutableState(clsName, "relation", v => s""" | $v = (($clsName) $broadcast.value()).asReadOnlyCopy(); diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 073730462a75f..94405410cce90 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -422,7 +422,7 @@ case class SortMergeJoinExec( */ private def genScanner(ctx: CodegenContext): (String, String) = { // Create class member for next row from both sides. 
- // inline mutable state since not many join operations in a task + // Inline mutable state since not many join operations in a task val leftRow = ctx.addMutableState("InternalRow", "leftRow", forceInline = true) val rightRow = ctx.addMutableState("InternalRow", "rightRow", forceInline = true) @@ -440,8 +440,9 @@ case class SortMergeJoinExec( val spillThreshold = getSpillThreshold val inMemoryThreshold = getInMemoryThreshold + // Inline mutable state since not many join operations in a task val matches = ctx.addMutableState(clsName, "matches", - v => s"$v = new $clsName($inMemoryThreshold, $spillThreshold);") + v => s"$v = new $clsName($inMemoryThreshold, $spillThreshold);", forceInline = true) // Copy the left keys as class members so they could be used in next function call. val matchedKeyVars = copyKeys(ctx, leftKeyVars) @@ -576,7 +577,7 @@ case class SortMergeJoinExec( override def needCopyResult: Boolean = true override def doProduce(ctx: CodegenContext): String = { - // inline mutable state since not many join operations in a task + // Inline mutable state since not many join operations in a task val leftInput = ctx.addMutableState("scala.collection.Iterator", "leftInput", v => s"$v = inputs[0];", forceInline = true) val rightInput = ctx.addMutableState("scala.collection.Iterator", "rightInput", From 32ec269d08313720aae3b47cce2f5e9c19702811 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Thu, 28 Dec 2017 12:35:17 +0800 Subject: [PATCH 201/356] [SPARK-22909][SS] Move Structured Streaming v2 APIs to streaming folder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? This PR moves Structured Streaming v2 APIs to streaming folder as following: ``` sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming ├── ContinuousReadSupport.java ├── ContinuousWriteSupport.java ├── MicroBatchReadSupport.java ├── MicroBatchWriteSupport.java ├── reader │   ├── ContinuousDataReader.java │   ├── ContinuousReader.java │   ├── MicroBatchReader.java │   ├── Offset.java │   └── PartitionOffset.java └── writer └── ContinuousWriter.java ``` ## How was this patch tested? Jenkins Author: Shixiong Zhu Closes #20093 from zsxwing/move. 
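For third-party v2 sources, the visible effect of this move is the import path. A minimal sketch that compiles only against the relocated packages introduced here (the old `org.apache.spark.sql.sources.v2` and `...v2.reader` locations of these interfaces no longer resolve); the object name is illustrative:

```scala
// After this patch, the streaming-specific v2 interfaces resolve from the new
// ...sources.v2.streaming package rather than the v2 root / v2.reader packages.
import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport
import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, Offset}

object StreamingV2Imports {
  def main(args: Array[String]): Unit = {
    // Merely referencing the types verifies a build against the relocated packages.
    println(classOf[ContinuousReadSupport].getName)
    println(classOf[ContinuousReader].getName)
    println(classOf[Offset].getName)
  }
}
```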
--- .../sources/v2/{ => streaming}/ContinuousReadSupport.java | 7 ++++--- .../sources/v2/{ => streaming}/ContinuousWriteSupport.java | 6 ++++-- .../sources/v2/{ => streaming}/MicroBatchReadSupport.java | 6 ++++-- .../sources/v2/{ => streaming}/MicroBatchWriteSupport.java | 4 +++- .../v2/{ => streaming}/reader/ContinuousDataReader.java | 6 ++---- .../v2/{ => streaming}/reader/ContinuousReader.java | 4 ++-- .../v2/{ => streaming}/reader/MicroBatchReader.java | 4 ++-- .../sql/sources/v2/{ => streaming}/reader/Offset.java | 2 +- .../sources/v2/{ => streaming}/reader/PartitionOffset.java | 2 +- .../v2/{ => streaming}/writer/ContinuousWriter.java | 5 ++++- .../execution/datasources/v2/DataSourceV2ScanExec.scala | 1 + .../sql/execution/datasources/v2/WriteToDataSourceV2.scala | 1 + .../sql/execution/streaming/MicroBatchExecution.scala | 2 +- .../spark/sql/execution/streaming/RateSourceProvider.scala | 3 ++- .../spark/sql/execution/streaming/RateStreamOffset.scala | 2 +- .../spark/sql/execution/streaming/StreamingRelation.scala | 3 ++- .../streaming/continuous/ContinuousDataSourceRDDIter.scala | 1 + .../streaming/continuous/ContinuousExecution.scala | 7 ++++--- .../streaming/continuous/ContinuousRateStreamSource.scala | 3 ++- .../execution/streaming/continuous/EpochCoordinator.scala | 5 +++-- .../execution/streaming/sources/RateStreamSourceV2.scala | 1 + .../spark/sql/execution/streaming/sources/memoryV2.scala | 4 +++- .../org/apache/spark/sql/streaming/DataStreamReader.scala | 3 ++- .../apache/spark/sql/streaming/StreamingQueryManager.scala | 2 +- .../spark/sql/execution/streaming/OffsetSeqLogSuite.scala | 1 - .../spark/sql/execution/streaming/RateSourceSuite.scala | 1 - .../spark/sql/execution/streaming/RateSourceV2Suite.scala | 3 ++- 27 files changed, 54 insertions(+), 35 deletions(-) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{ => streaming}/ContinuousReadSupport.java (88%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{ => streaming}/ContinuousWriteSupport.java (90%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{ => streaming}/MicroBatchReadSupport.java (90%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{ => streaming}/MicroBatchWriteSupport.java (94%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{ => streaming}/reader/ContinuousDataReader.java (90%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{ => streaming}/reader/ContinuousReader.java (95%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{ => streaming}/reader/MicroBatchReader.java (95%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{ => streaming}/reader/Offset.java (97%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{ => streaming}/reader/PartitionOffset.java (94%) rename sql/core/src/main/java/org/apache/spark/sql/sources/v2/{ => streaming}/writer/ContinuousWriter.java (87%) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java similarity index 88% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java index ae4f85820649f..8837bae6156b1 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousReadSupport.java +++ 
b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java @@ -15,12 +15,13 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2; +package org.apache.spark.sql.sources.v2.streaming; import java.util.Optional; -import org.apache.spark.sql.sources.v2.reader.ContinuousReader; -import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; +import org.apache.spark.sql.sources.v2.DataSourceV2; +import org.apache.spark.sql.sources.v2.DataSourceV2Options; +import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader; import org.apache.spark.sql.types.StructType; /** diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousWriteSupport.java similarity index 90% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousWriteSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousWriteSupport.java index 362d5f52b4d00..ec15e436672b4 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/ContinuousWriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousWriteSupport.java @@ -15,13 +15,15 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2; +package org.apache.spark.sql.sources.v2.streaming; import java.util.Optional; import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.streaming.BaseStreamingSink; -import org.apache.spark.sql.sources.v2.writer.ContinuousWriter; +import org.apache.spark.sql.sources.v2.DataSourceV2; +import org.apache.spark.sql.sources.v2.DataSourceV2Options; +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter; import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java similarity index 90% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java index 442cad029d211..3c87a3db68243 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchReadSupport.java @@ -15,12 +15,14 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2; +package org.apache.spark.sql.sources.v2.streaming; import java.util.Optional; import org.apache.spark.annotation.InterfaceStability; -import org.apache.spark.sql.sources.v2.reader.MicroBatchReader; +import org.apache.spark.sql.sources.v2.DataSourceV2; +import org.apache.spark.sql.sources.v2.DataSourceV2Options; +import org.apache.spark.sql.sources.v2.streaming.reader.MicroBatchReader; import org.apache.spark.sql.types.StructType; /** diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java similarity index 94% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchWriteSupport.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java index 63640779b955c..b5e3e44cd6332 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/MicroBatchWriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java @@ -15,12 +15,14 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2; +package org.apache.spark.sql.sources.v2.streaming; import java.util.Optional; import org.apache.spark.annotation.InterfaceStability; import org.apache.spark.sql.execution.streaming.BaseStreamingSink; +import org.apache.spark.sql.sources.v2.DataSourceV2; +import org.apache.spark.sql.sources.v2.DataSourceV2Options; import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.types.StructType; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousDataReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java similarity index 90% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousDataReader.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java index 11b99a93f1494..ca9a290e97a02 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousDataReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousDataReader.java @@ -15,11 +15,9 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.reader; +package org.apache.spark.sql.sources.v2.streaming.reader; -import org.apache.spark.sql.sources.v2.reader.PartitionOffset; - -import java.io.IOException; +import org.apache.spark.sql.sources.v2.reader.DataReader; /** * A variation on {@link DataReader} for use with streaming in continuous processing mode. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java similarity index 95% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java index 34141d6cd85fd..f0b205869ed6c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/ContinuousReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/ContinuousReader.java @@ -15,10 +15,10 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2.reader; +package org.apache.spark.sql.sources.v2.streaming.reader; -import org.apache.spark.sql.sources.v2.reader.PartitionOffset; import org.apache.spark.sql.execution.streaming.BaseStreamingSource; +import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; import java.util.Optional; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java similarity index 95% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java index bd15c07d87f6c..70ff756806032 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/MicroBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/MicroBatchReader.java @@ -15,9 +15,9 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.reader; +package org.apache.spark.sql.sources.v2.streaming.reader; -import org.apache.spark.sql.sources.v2.reader.Offset; +import org.apache.spark.sql.sources.v2.reader.DataSourceV2Reader; import org.apache.spark.sql.execution.streaming.BaseStreamingSource; import java.util.Optional; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java similarity index 97% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java index ce1c489742054..517fdab16684c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Offset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.reader; +package org.apache.spark.sql.sources.v2.streaming.reader; /** * An abstract representation of progress through a [[MicroBatchReader]] or [[ContinuousReader]]. diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/PartitionOffset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java similarity index 94% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/PartitionOffset.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java index 07826b6688476..729a6123034f0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/PartitionOffset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources.v2.reader; +package org.apache.spark.sql.sources.v2.streaming.reader; import java.io.Serializable; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/ContinuousWriter.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/ContinuousWriter.java similarity index 87% rename from sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/ContinuousWriter.java rename to sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/ContinuousWriter.java index 618f47ed79ca5..723395bd1e963 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/ContinuousWriter.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/writer/ContinuousWriter.java @@ -15,9 +15,12 @@ * limitations under the License. */ -package org.apache.spark.sql.sources.v2.writer; +package org.apache.spark.sql.sources.v2.streaming.writer; import org.apache.spark.annotation.InterfaceStability; +import org.apache.spark.sql.sources.v2.writer.DataSourceV2Writer; +import org.apache.spark.sql.sources.v2.writer.DataWriter; +import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage; /** * A {@link DataSourceV2Writer} for use with continuous stream processing. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala index e4fca1b10dfad..49c506bc560cf 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.execution.streaming.continuous.{ContinuousDataSourceRDD, ContinuousExecution, EpochCoordinatorRef, SetReaderPartitions} import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index 1862da8892cb2..f0bdf84bb7a84 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.execution.streaming.continuous.{CommitPartitionEpoch, ContinuousExecution, EpochCoordinatorRef, SetWriterPartitions} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 20f9810faa5c8..9a7a13fcc5806 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.sources.v2.MicroBatchReadSupport +import org.apache.spark.sql.sources.v2.streaming.MicroBatchReadSupport import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.util.{Clock, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala index 3f85fa913f28c..d02cf882b61ac 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala @@ -33,7 +33,8 @@ import org.apache.spark.sql.execution.streaming.continuous.ContinuousRateStreamR import org.apache.spark.sql.execution.streaming.sources.RateStreamV2Reader import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.reader.{ContinuousReader, MicroBatchReader} +import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader import org.apache.spark.sql.types._ import org.apache.spark.util.{ManualClock, SystemClock} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala index 65d6d18936167..261d69bbd9843 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateStreamOffset.scala @@ -23,7 +23,7 @@ import org.json4s.jackson.Serialization import org.apache.spark.sql.sources.v2 case class RateStreamOffset(partitionToValueAndRunTimeMs: Map[Int, ValueRunTimeMsPair]) - extends v2.reader.Offset { + extends v2.streaming.reader.Offset { implicit val defaultFormats: DefaultFormats = DefaultFormats override val json = Serialization.write(partitionToValueAndRunTimeMs) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala index 0ca2e7854d94b..a9d50e3a112e7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala @@ -25,7 +25,8 @@ import org.apache.spark.sql.catalyst.plans.logical.LeafNode import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource -import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2} +import org.apache.spark.sql.sources.v2.DataSourceV2 +import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport object StreamingRelation { def apply(dataSource: DataSource): StreamingRelation = { diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index 89fb2ace20917..d79e4bd65f563 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -32,6 +32,7 @@ import org.apache.spark.sql.execution.datasources.v2.{DataSourceRDDPartition, Ro import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, PartitionOffset} import org.apache.spark.sql.streaming.ProcessingTime import org.apache.spark.util.{SystemClock, ThreadUtils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 1c35b06bd4b85..2843ab13bde2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -29,9 +29,10 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.{ContinuousExecutionRelation, StreamingRelationV2, _} -import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, ContinuousWriteSupport, DataSourceV2Options} -import org.apache.spark.sql.sources.v2.reader.{ContinuousReader, Offset, PartitionOffset} -import org.apache.spark.sql.sources.v2.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, Offset, PartitionOffset} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.sql.types.StructType import org.apache.spark.util.{Clock, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index 89a8562b4b59e..c9aa78a5a2e28 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -27,8 +27,9 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming.{RateSourceProvider, RateStreamOffset, ValueRunTimeMsPair} import org.apache.spark.sql.execution.streaming.sources.RateStreamSourceV2 -import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} import 
org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} case class ContinuousRateStreamPartitionOffset( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index 7f1e8abd79b99..98017c3ac6a33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -26,8 +26,9 @@ import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.streaming.StreamingQueryWrapper -import org.apache.spark.sql.sources.v2.reader.{ContinuousReader, PartitionOffset} -import org.apache.spark.sql.sources.v2.writer.{ContinuousWriter, WriterCommitMessage} +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, PartitionOffset} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage import org.apache.spark.util.RpcUtils private[continuous] sealed trait EpochCoordinatorMessage extends Serializable diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala index 1c66aed8690a7..97bada08bcd2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming.{RateStreamOffset, ValueRunTimeMsPair} import org.apache.spark.sql.sources.v2.DataSourceV2Options import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset} import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} import org.apache.spark.util.SystemClock diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala index 972248d5e4df8..da7c31cf62428 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala @@ -29,7 +29,9 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Complete, Update} import org.apache.spark.sql.execution.streaming.Sink -import org.apache.spark.sql.sources.v2.{ContinuousWriteSupport, DataSourceV2, DataSourceV2Options, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} +import 
org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.sources.v2.writer._ import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index f17935e86f459..acd5ca17dcf76 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -27,7 +27,8 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} -import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2Options, MicroBatchReadSupport} +import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index e808ffaa96410..b508f4406138f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.ContinuousWriteSupport +import org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport import org.apache.spark.util.{Clock, SystemClock, Utils} /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala index 4868ba4e68934..e6cdc063c4e9f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/OffsetSeqLogSuite.scala @@ -22,7 +22,6 @@ import java.io.File import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.util.stringToFile import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.test.SharedSQLContext class OffsetSeqLogSuite extends SparkFunSuite with SharedSQLContext { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceSuite.scala index ceba27b26e578..03d0f63fa4d7f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceSuite.scala @@ -21,7 +21,6 @@ import java.util.concurrent.TimeUnit import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.functions._ -import org.apache.spark.sql.sources.v2.reader.Offset import org.apache.spark.sql.streaming.{StreamingQueryException, StreamTest} import 
org.apache.spark.util.ManualClock diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index dc833b2ccaa22..e11705a227f48 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -25,7 +25,8 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamSourceV2, RateStreamV2Reader} -import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2Options, MicroBatchReadSupport} +import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport import org.apache.spark.sql.streaming.StreamTest class RateSourceV2Suite extends StreamTest { From 171f6ddadc6185ffcc6ad82e5f48952fb49095b2 Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Thu, 28 Dec 2017 13:44:44 +0900 Subject: [PATCH 202/356] [SPARK-22757][KUBERNETES] Enable use of remote dependencies (http, s3, gcs, etc.) in Kubernetes mode ## What changes were proposed in this pull request? This PR expands the Kubernetes mode to be able to use remote dependencies on http/https endpoints, GCS, S3, etc. It adds steps for configuring and appending the Kubernetes init-container into the driver and executor pods for downloading remote dependencies. [Init-containers](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/), as the name suggests, are containers that are run to completion before the main containers start, and are often used to perform initialization tasks prior to starting the main containers. We use init-containers to localize remote application dependencies before the driver/executors start running. The code that the init-container runs is also included. This PR also adds a step to the driver and executors for mounting user-specified secrets that may store credentials for accessing data storage, e.g., S3 and Google Cloud Storage (GCS), into the driver and executors. ## How was this patch tested? * The patch contains unit tests which are passing. * Manual testing: `./build/mvn -Pkubernetes clean package` succeeded. * Manual testing of the following cases: * [x] Running SparkPi using container-local spark-example jar. * [x] Running SparkPi using container-local spark-example jar with user-specific secret mounted. * [x] Running SparkPi using spark-example jar hosted remotely on an https endpoint. cc rxin felixcheung mateiz (shepherd) k8s-big-data SIG members & contributors: mccheah foxish ash211 ssuchter varunkatta kimoonkim erikerlandson tnachen ifilonenko liyinan926 reviewers: vanzin felixcheung jiangxb1987 mridulm Author: Yinan Li Closes #19954 from liyinan926/init-container. 
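The builder chains in the new `InitContainerBootstrap` below can be hard to read at a glance, so here is a deliberately stripped-down sketch of the same idea using the fabric8 calls that appear in this patch. The container and volume names are placeholders rather than the constants from `Constants.scala`, and attaching the init-container directly with `addToInitContainers` is an assumption made for brevity (the real code keeps it detached in `PodWithDetachedInitContainer` and attaches it in a later step):

```scala
import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, EmptyDirVolumeSource, Pod, PodBuilder}

object InitContainerSketch {
  // A bare-bones init-container that would download remote jars into jarsDir.
  def downloadContainer(initImage: String, jarsDir: String): Container =
    new ContainerBuilder()
      .withName("spark-init")
      .withImage(initImage)
      .addNewVolumeMount()
        .withName("download-jars")
        .withMountPath(jarsDir)
      .endVolumeMount()
      .build()

  // Adds a shared emptyDir volume plus the init-container to a pod that already has a spec.
  // The init-container runs to completion before the main containers start, so anything it
  // writes under jarsDir is visible to main containers mounting the same volume.
  def bootstrap(pod: Pod, initImage: String, jarsDir: String): Pod =
    new PodBuilder(pod)
      .editSpec()
        .addNewVolume()
          .withName("download-jars")
          .withEmptyDir(new EmptyDirVolumeSource())
        .endVolume()
        .addToInitContainers(downloadContainer(initImage, jarsDir))
      .endSpec()
      .build()
}
```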
--- .../org/apache/spark/deploy/k8s/Config.scala | 67 +++++++- .../apache/spark/deploy/k8s/Constants.scala | 11 ++ .../deploy/k8s/InitContainerBootstrap.scala | 119 +++++++++++++ ...sFileUtils.scala => KubernetesUtils.scala} | 50 +++++- .../deploy/k8s/MountSecretsBootstrap.scala | 62 +++++++ ...ala => PodWithDetachedInitContainer.scala} | 34 ++-- .../k8s/SparkKubernetesClientFactory.scala | 2 +- ...r.scala => DriverConfigOrchestrator.scala} | 91 +++++++--- .../submit/KubernetesClientApplication.scala | 26 ++- ...ala => BasicDriverConfigurationStep.scala} | 63 ++++--- .../steps/DependencyResolutionStep.scala | 27 +-- .../steps/DriverConfigurationStep.scala | 2 +- .../DriverInitContainerBootstrapStep.scala | 95 +++++++++++ .../submit/steps/DriverMountSecretsStep.scala | 38 +++++ .../steps/DriverServiceBootstrapStep.scala | 17 +- .../BasicInitContainerConfigurationStep.scala | 67 ++++++++ .../InitContainerConfigOrchestrator.scala | 79 +++++++++ .../InitContainerConfigurationStep.scala | 25 +++ .../InitContainerMountSecretsStep.scala | 39 +++++ .../initcontainer/InitContainerSpec.scala | 37 ++++ .../rest/k8s/SparkPodInitContainer.scala | 116 +++++++++++++ .../cluster/k8s/ExecutorPodFactory.scala | 77 ++++++--- .../k8s/KubernetesClusterManager.scala | 62 ++++++- ...la => DriverConfigOrchestratorSuite.scala} | 69 ++++++-- .../deploy/k8s/submit/SecretVolumeUtils.scala | 36 ++++ ...> BasicDriverConfigurationStepSuite.scala} | 4 +- ...riverInitContainerBootstrapStepSuite.scala | 160 ++++++++++++++++++ .../steps/DriverMountSecretsStepSuite.scala | 49 ++++++ ...cInitContainerConfigurationStepSuite.scala | 95 +++++++++++ ...InitContainerConfigOrchestratorSuite.scala | 80 +++++++++ .../InitContainerMountSecretsStepSuite.scala | 57 +++++++ .../rest/k8s/SparkPodInitContainerSuite.scala | 86 ++++++++++ .../cluster/k8s/ExecutorPodFactorySuite.scala | 82 ++++++++- .../src/main/dockerfiles/driver/Dockerfile | 3 +- .../src/main/dockerfiles/executor/Dockerfile | 3 +- .../dockerfiles/init-container/Dockerfile | 24 +++ 36 files changed, 1783 insertions(+), 171 deletions(-) create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala rename resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/{submit/KubernetesFileUtils.scala => KubernetesUtils.scala} (57%) create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala rename resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/{ConfigurationUtils.scala => PodWithDetachedInitContainer.scala} (53%) rename resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/{DriverConfigurationStepsOrchestrator.scala => DriverConfigOrchestrator.scala} (53%) rename resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/{BaseDriverConfigurationStep.scala => BasicDriverConfigurationStep.scala} (70%) create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStep.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStep.scala create mode 100644 
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestrator.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigurationStep.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerSpec.scala create mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala rename resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/{DriverConfigurationStepsOrchestratorSuite.scala => DriverConfigOrchestratorSuite.scala} (51%) create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala rename resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/{BaseDriverConfigurationStepSuite.scala => BasicDriverConfigurationStepSuite.scala} (97%) create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStepSuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStepSuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala create mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala create mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index 45f527959cbe1..e5d79d9a9d9da 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -20,7 +20,6 @@ import java.util.concurrent.TimeUnit import org.apache.spark.internal.Logging import org.apache.spark.internal.config.ConfigBuilder -import org.apache.spark.network.util.ByteUnit private[spark] object Config extends Logging { @@ -132,20 +131,72 @@ private[spark] object Config extends Logging { val JARS_DOWNLOAD_LOCATION = ConfigBuilder("spark.kubernetes.mountDependencies.jarsDownloadDir") - .doc("Location to download jars to in the driver and executors. When using" + - " spark-submit, this directory must be empty and will be mounted as an empty directory" + - " volume on the driver and executor pod.") + .doc("Location to download jars to in the driver and executors. 
When using " + + "spark-submit, this directory must be empty and will be mounted as an empty directory " + + "volume on the driver and executor pod.") .stringConf .createWithDefault("/var/spark-data/spark-jars") val FILES_DOWNLOAD_LOCATION = ConfigBuilder("spark.kubernetes.mountDependencies.filesDownloadDir") - .doc("Location to download files to in the driver and executors. When using" + - " spark-submit, this directory must be empty and will be mounted as an empty directory" + - " volume on the driver and executor pods.") + .doc("Location to download files to in the driver and executors. When using " + + "spark-submit, this directory must be empty and will be mounted as an empty directory " + + "volume on the driver and executor pods.") .stringConf .createWithDefault("/var/spark-data/spark-files") + val INIT_CONTAINER_IMAGE = + ConfigBuilder("spark.kubernetes.initContainer.image") + .doc("Image for the driver and executor's init-container for downloading dependencies.") + .stringConf + .createOptional + + val INIT_CONTAINER_MOUNT_TIMEOUT = + ConfigBuilder("spark.kubernetes.mountDependencies.timeout") + .doc("Timeout before aborting the attempt to download and unpack dependencies from remote " + + "locations into the driver and executor pods.") + .timeConf(TimeUnit.SECONDS) + .createWithDefault(300) + + val INIT_CONTAINER_MAX_THREAD_POOL_SIZE = + ConfigBuilder("spark.kubernetes.mountDependencies.maxSimultaneousDownloads") + .doc("Maximum number of remote dependencies to download simultaneously in a driver or " + + "executor pod.") + .intConf + .createWithDefault(5) + + val INIT_CONTAINER_REMOTE_JARS = + ConfigBuilder("spark.kubernetes.initContainer.remoteJars") + .doc("Comma-separated list of jar URIs to download in the init-container. This is " + + "calculated from spark.jars.") + .internal() + .stringConf + .createOptional + + val INIT_CONTAINER_REMOTE_FILES = + ConfigBuilder("spark.kubernetes.initContainer.remoteFiles") + .doc("Comma-separated list of file URIs to download in the init-container. This is " + + "calculated from spark.files.") + .internal() + .stringConf + .createOptional + + val INIT_CONTAINER_CONFIG_MAP_NAME = + ConfigBuilder("spark.kubernetes.initContainer.configMapName") + .doc("Name of the config map to use in the init-container that retrieves submitted files " + + "for the executor.") + .internal() + .stringConf + .createOptional + + val INIT_CONTAINER_CONFIG_MAP_KEY_CONF = + ConfigBuilder("spark.kubernetes.initContainer.configMapKey") + .doc("Key for the entry in the init container config map for submitted files that " + + "corresponds to the properties for this init-container.") + .internal() + .stringConf + .createOptional + val KUBERNETES_AUTH_SUBMISSION_CONF_PREFIX = "spark.kubernetes.authenticate.submission" @@ -153,9 +204,11 @@ private[spark] object Config extends Logging { val KUBERNETES_DRIVER_LABEL_PREFIX = "spark.kubernetes.driver.label." val KUBERNETES_DRIVER_ANNOTATION_PREFIX = "spark.kubernetes.driver.annotation." + val KUBERNETES_DRIVER_SECRETS_PREFIX = "spark.kubernetes.driver.secrets." val KUBERNETES_EXECUTOR_LABEL_PREFIX = "spark.kubernetes.executor.label." val KUBERNETES_EXECUTOR_ANNOTATION_PREFIX = "spark.kubernetes.executor.annotation." + val KUBERNETES_EXECUTOR_SECRETS_PREFIX = "spark.kubernetes.executor.secrets." val KUBERNETES_DRIVER_ENV_KEY = "spark.kubernetes.driverEnv." 
} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala index 0b91145405d3a..111cb2a3b75e5 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala @@ -69,6 +69,17 @@ private[spark] object Constants { val ENV_DRIVER_JAVA_OPTS = "SPARK_DRIVER_JAVA_OPTS" val ENV_DRIVER_BIND_ADDRESS = "SPARK_DRIVER_BIND_ADDRESS" val ENV_DRIVER_MEMORY = "SPARK_DRIVER_MEMORY" + val ENV_MOUNTED_FILES_DIR = "SPARK_MOUNTED_FILES_DIR" + + // Bootstrapping dependencies with the init-container + val INIT_CONTAINER_DOWNLOAD_JARS_VOLUME_NAME = "download-jars-volume" + val INIT_CONTAINER_DOWNLOAD_FILES_VOLUME_NAME = "download-files-volume" + val INIT_CONTAINER_PROPERTIES_FILE_VOLUME = "spark-init-properties" + val INIT_CONTAINER_PROPERTIES_FILE_DIR = "/etc/spark-init" + val INIT_CONTAINER_PROPERTIES_FILE_NAME = "spark-init.properties" + val INIT_CONTAINER_PROPERTIES_FILE_PATH = + s"$INIT_CONTAINER_PROPERTIES_FILE_DIR/$INIT_CONTAINER_PROPERTIES_FILE_NAME" + val INIT_CONTAINER_SECRET_VOLUME_NAME = "spark-init-secret" // Miscellaneous val KUBERNETES_MASTER_INTERNAL_URL = "https://kubernetes.default.svc" diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala new file mode 100644 index 0000000000000..dfeccf9e2bd1c --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model.{ContainerBuilder, EmptyDirVolumeSource, EnvVarBuilder, PodBuilder, VolumeMount, VolumeMountBuilder} + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ + +/** + * Bootstraps an init-container for downloading remote dependencies. This is separated out from + * the init-container steps API because this component can be used to bootstrap init-containers + * for both the driver and executors. 
+ */ +private[spark] class InitContainerBootstrap( + initContainerImage: String, + imagePullPolicy: String, + jarsDownloadPath: String, + filesDownloadPath: String, + configMapName: String, + configMapKey: String, + sparkRole: String, + sparkConf: SparkConf) { + + /** + * Bootstraps an init-container that downloads dependencies to be used by a main container. + */ + def bootstrapInitContainer( + original: PodWithDetachedInitContainer): PodWithDetachedInitContainer = { + val sharedVolumeMounts = Seq[VolumeMount]( + new VolumeMountBuilder() + .withName(INIT_CONTAINER_DOWNLOAD_JARS_VOLUME_NAME) + .withMountPath(jarsDownloadPath) + .build(), + new VolumeMountBuilder() + .withName(INIT_CONTAINER_DOWNLOAD_FILES_VOLUME_NAME) + .withMountPath(filesDownloadPath) + .build()) + + val customEnvVarKeyPrefix = sparkRole match { + case SPARK_POD_DRIVER_ROLE => KUBERNETES_DRIVER_ENV_KEY + case SPARK_POD_EXECUTOR_ROLE => "spark.executorEnv." + case _ => throw new SparkException(s"$sparkRole is not a valid Spark pod role") + } + val customEnvVars = sparkConf.getAllWithPrefix(customEnvVarKeyPrefix).toSeq.map { + case (key, value) => + new EnvVarBuilder() + .withName(key) + .withValue(value) + .build() + } + + val initContainer = new ContainerBuilder(original.initContainer) + .withName("spark-init") + .withImage(initContainerImage) + .withImagePullPolicy(imagePullPolicy) + .addAllToEnv(customEnvVars.asJava) + .addNewVolumeMount() + .withName(INIT_CONTAINER_PROPERTIES_FILE_VOLUME) + .withMountPath(INIT_CONTAINER_PROPERTIES_FILE_DIR) + .endVolumeMount() + .addToVolumeMounts(sharedVolumeMounts: _*) + .addToArgs(INIT_CONTAINER_PROPERTIES_FILE_PATH) + .build() + + val podWithBasicVolumes = new PodBuilder(original.pod) + .editSpec() + .addNewVolume() + .withName(INIT_CONTAINER_PROPERTIES_FILE_VOLUME) + .withNewConfigMap() + .withName(configMapName) + .addNewItem() + .withKey(configMapKey) + .withPath(INIT_CONTAINER_PROPERTIES_FILE_NAME) + .endItem() + .endConfigMap() + .endVolume() + .addNewVolume() + .withName(INIT_CONTAINER_DOWNLOAD_JARS_VOLUME_NAME) + .withEmptyDir(new EmptyDirVolumeSource()) + .endVolume() + .addNewVolume() + .withName(INIT_CONTAINER_DOWNLOAD_FILES_VOLUME_NAME) + .withEmptyDir(new EmptyDirVolumeSource()) + .endVolume() + .endSpec() + .build() + + val mainContainer = new ContainerBuilder(original.mainContainer) + .addToVolumeMounts(sharedVolumeMounts: _*) + .addNewEnv() + .withName(ENV_MOUNTED_FILES_DIR) + .withValue(filesDownloadPath) + .endEnv() + .build() + + PodWithDetachedInitContainer( + podWithBasicVolumes, + initContainer, + mainContainer) + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesFileUtils.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala similarity index 57% rename from resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesFileUtils.scala rename to resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala index a38cf55fc3d58..37331d8bbf9b7 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesFileUtils.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala @@ -14,13 +14,49 @@ * See the License for the specific language governing permissions and * limitations under the License. 
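InitContainerBootstrap above is a plain builder over fabric8 model objects. A minimal usage sketch for the driver role, assuming a `sparkConf` is in scope and with made-up image, config-map, and pod values:

```
// Sketch only: all concrete values here are made up for illustration.
import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder}
import org.apache.spark.deploy.k8s.Constants._

val bootstrap = new InitContainerBootstrap(
  initContainerImage = "my-registry/spark-init:latest",
  imagePullPolicy = "IfNotPresent",
  jarsDownloadPath = "/var/spark-data/spark-jars",
  filesDownloadPath = "/var/spark-data/spark-files",
  configMapName = "my-app-init-config",
  configMapKey = "spark-init.properties",
  sparkRole = SPARK_POD_DRIVER_ROLE,
  sparkConf = sparkConf)

val bootstrapped = bootstrap.bootstrapInitContainer(PodWithDetachedInitContainer(
  pod = new PodBuilder().withNewMetadata().withName("driver").endMetadata().build(),
  initContainer = new ContainerBuilder().build(),
  mainContainer = new ContainerBuilder().withName("main").build()))
// bootstrapped.pod now carries the config-map and emptyDir volumes, bootstrapped.initContainer
// is the configured "spark-init" container, and bootstrapped.mainContainer mounts the shared
// download volumes and sees SPARK_MOUNTED_FILES_DIR.
```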
*/ -package org.apache.spark.deploy.k8s.submit +package org.apache.spark.deploy.k8s import java.io.File +import io.fabric8.kubernetes.api.model.{Container, Pod, PodBuilder} + +import org.apache.spark.SparkConf import org.apache.spark.util.Utils -private[spark] object KubernetesFileUtils { +private[spark] object KubernetesUtils { + + /** + * Extract and parse Spark configuration properties with a given name prefix and + * return the result as a Map. Keys must not have more than one value. + * + * @param sparkConf Spark configuration + * @param prefix the given property name prefix + * @return a Map storing the configuration property keys and values + */ + def parsePrefixedKeyValuePairs( + sparkConf: SparkConf, + prefix: String): Map[String, String] = { + sparkConf.getAllWithPrefix(prefix).toMap + } + + def requireNandDefined(opt1: Option[_], opt2: Option[_], errMessage: String): Unit = { + opt1.foreach { _ => require(opt2.isEmpty, errMessage) } + } + + /** + * Append the given init-container to a pod's list of init-containers. + * + * @param originalPodSpec original specification of the pod + * @param initContainer the init-container to add to the pod + * @return the pod with the init-container added to the list of InitContainers + */ + def appendInitContainer(originalPodSpec: Pod, initContainer: Container): Pod = { + new PodBuilder(originalPodSpec) + .editOrNewSpec() + .addToInitContainers(initContainer) + .endSpec() + .build() + } /** * For the given collection of file URIs, resolves them as follows: @@ -47,6 +83,16 @@ private[spark] object KubernetesFileUtils { } } + /** + * Get from a given collection of file URIs the ones that represent remote files. + */ + def getOnlyRemoteFiles(uris: Iterable[String]): Iterable[String] = { + uris.filter { uri => + val scheme = Utils.resolveURI(uri).getScheme + scheme != "file" && scheme != "local" + } + } + private def resolveFileUri( uri: String, fileDownloadPath: String, diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala new file mode 100644 index 0000000000000..8286546ce0641 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s + +import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, Pod, PodBuilder} + +/** + * Bootstraps a driver or executor container or an init-container with needed secrets mounted. 
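The KubernetesUtils helpers above are small and mostly pure, so their behavior is easy to pin down. A sketch on made-up inputs:

```
// Sketch only: illustrating the helpers above on made-up configuration and URIs.
import org.apache.spark.SparkConf

val conf = new SparkConf(false)
  .set("spark.kubernetes.driver.secrets.db-creds", "/etc/secrets/db")
  .set("spark.kubernetes.driver.secrets.oauth-token", "/etc/secrets/oauth")

KubernetesUtils.parsePrefixedKeyValuePairs(conf, "spark.kubernetes.driver.secrets.")
// => Map("db-creds" -> "/etc/secrets/db", "oauth-token" -> "/etc/secrets/oauth")

KubernetesUtils.getOnlyRemoteFiles(Seq(
  "local:///opt/spark/examples/app.jar",  // baked into the image, not remote
  "file:///tmp/dep.jar",                  // local to the submission client, not remote
  "https://example.com/dep.jar"))         // the only remote URI
// => Iterable("https://example.com/dep.jar")
```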
+ */ +private[spark] class MountSecretsBootstrap(secretNamesToMountPaths: Map[String, String]) { + + /** + * Mounts Kubernetes secrets as secret volumes into the given container in the given pod. + * + * @param pod the pod into which the secret volumes are being added. + * @param container the container into which the secret volumes are being mounted. + * @return the updated pod and container with the secrets mounted. + */ + def mountSecrets(pod: Pod, container: Container): (Pod, Container) = { + var podBuilder = new PodBuilder(pod) + secretNamesToMountPaths.keys.foreach { name => + podBuilder = podBuilder + .editOrNewSpec() + .addNewVolume() + .withName(secretVolumeName(name)) + .withNewSecret() + .withSecretName(name) + .endSecret() + .endVolume() + .endSpec() + } + + var containerBuilder = new ContainerBuilder(container) + secretNamesToMountPaths.foreach { case (name, path) => + containerBuilder = containerBuilder + .addNewVolumeMount() + .withName(secretVolumeName(name)) + .withMountPath(path) + .endVolumeMount() + } + + (podBuilder.build(), containerBuilder.build()) + } + + private def secretVolumeName(secretName: String): String = { + secretName + "-volume" + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/ConfigurationUtils.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/PodWithDetachedInitContainer.scala similarity index 53% rename from resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/ConfigurationUtils.scala rename to resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/PodWithDetachedInitContainer.scala index 01717479fddd9..0b79f8b12e806 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/ConfigurationUtils.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/PodWithDetachedInitContainer.scala @@ -14,28 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.spark.deploy.k8s -import org.apache.spark.SparkConf - -private[spark] object ConfigurationUtils { - - /** - * Extract and parse Spark configuration properties with a given name prefix and - * return the result as a Map. Keys must not have more than one value. - * - * @param sparkConf Spark configuration - * @param prefix the given property name prefix - * @return a Map storing the configuration property keys and values - */ - def parsePrefixedKeyValuePairs( - sparkConf: SparkConf, - prefix: String): Map[String, String] = { - sparkConf.getAllWithPrefix(prefix).toMap - } +import io.fabric8.kubernetes.api.model.{Container, Pod} - def requireNandDefined(opt1: Option[_], opt2: Option[_], errMessage: String): Unit = { - opt1.foreach { _ => require(opt2.isEmpty, errMessage) } - } -} +/** + * Represents a pod with a detached init-container (not yet added to the pod). 
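MountSecretsBootstrap is shared by the driver container, the executor containers, and the init-containers. A usage sketch with hypothetical secret names, mount paths, and input objects:

```
// Sketch only: mounting two made-up secrets into an existing pod/container pair.
import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder}

val bootstrap = new MountSecretsBootstrap(Map(
  "db-creds" -> "/etc/secrets/db",
  "oauth-token" -> "/etc/secrets/oauth"))

val basePod = new PodBuilder().withNewMetadata().withName("driver").endMetadata().build()
val baseContainer = new ContainerBuilder().withName("main").build()

val (podWithVolumes, containerWithMounts) = bootstrap.mountSecrets(basePod, baseContainer)
// podWithVolumes gains secret volumes named "db-creds-volume" and "oauth-token-volume";
// containerWithMounts mounts them at /etc/secrets/db and /etc/secrets/oauth respectively.
```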
+ * + * @param pod the pod + * @param initContainer the init-container in the pod + * @param mainContainer the main container in the pod + */ +private[spark] case class PodWithDetachedInitContainer( + pod: Pod, + initContainer: Container, + mainContainer: Container) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala index 1e3f055e05766..c47e78cbf19e3 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala @@ -48,7 +48,7 @@ private[spark] object SparkKubernetesClientFactory { .map(new File(_)) .orElse(defaultServiceAccountToken) val oauthTokenValue = sparkConf.getOption(oauthTokenConf) - ConfigurationUtils.requireNandDefined( + KubernetesUtils.requireNandDefined( oauthTokenFile, oauthTokenValue, s"Cannot specify OAuth token through both a file $oauthTokenFileConf and a " + diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala similarity index 53% rename from resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala rename to resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala index 1411e6f40b468..00c9c4ee49177 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestrator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala @@ -21,25 +21,31 @@ import java.util.UUID import com.google.common.primitives.Longs import org.apache.spark.SparkConf +import org.apache.spark.deploy.k8s.{KubernetesUtils, MountSecretsBootstrap} import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.ConfigurationUtils import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.submit.steps._ +import org.apache.spark.deploy.k8s.submit.steps.initcontainer.InitContainerConfigOrchestrator import org.apache.spark.launcher.SparkLauncher import org.apache.spark.util.SystemClock +import org.apache.spark.util.Utils /** - * Constructs the complete list of driver configuration steps to run to deploy the Spark driver. + * Figures out and returns the complete ordered list of needed DriverConfigurationSteps to + * configure the Spark driver pod. The returned steps will be applied one by one in the given + * order to produce a final KubernetesDriverSpec that is used in KubernetesClientApplication + * to construct and create the driver pod. It uses the InitContainerConfigOrchestrator to + * configure the driver init-container if one is needed, i.e., when there are remote dependencies + * to localize. 
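The orchestrator only builds the list of steps; they are folded over an initial KubernetesDriverSpec by the submission client (see Client.run further down). A minimal sketch of that consumption, assuming the constructor arguments and a `sparkConf` are in scope:

```
// Sketch only: how the orchestrator's output is consumed downstream.
val orchestrator = new DriverConfigOrchestrator(
  kubernetesAppId, launchTime, mainAppResource, appName, mainClass, appArgs, sparkConf)

var spec = KubernetesDriverSpec.initialSpec(sparkConf)
for (step <- orchestrator.getAllConfigurationSteps) {
  spec = step.configureDriver(spec)
}
// spec now holds the fully configured driver pod, driver container, resolved Spark conf,
// and any extra Kubernetes resources (service, config map, secrets) to create alongside it.
```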
*/ -private[spark] class DriverConfigurationStepsOrchestrator( - namespace: String, +private[spark] class DriverConfigOrchestrator( kubernetesAppId: String, launchTime: Long, mainAppResource: Option[MainAppResource], appName: String, mainClass: String, appArgs: Array[String], - submissionSparkConf: SparkConf) { + sparkConf: SparkConf) { // The resource name prefix is derived from the Spark application name, making it easy to connect // the names of the Kubernetes resources from e.g. kubectl or the Kubernetes dashboard to the @@ -49,13 +55,14 @@ private[spark] class DriverConfigurationStepsOrchestrator( s"$appName-$uuid".toLowerCase.replaceAll("\\.", "-") } - private val imagePullPolicy = submissionSparkConf.get(CONTAINER_IMAGE_PULL_POLICY) - private val jarsDownloadPath = submissionSparkConf.get(JARS_DOWNLOAD_LOCATION) - private val filesDownloadPath = submissionSparkConf.get(FILES_DOWNLOAD_LOCATION) + private val imagePullPolicy = sparkConf.get(CONTAINER_IMAGE_PULL_POLICY) + private val initContainerConfigMapName = s"$kubernetesResourceNamePrefix-init-config" + private val jarsDownloadPath = sparkConf.get(JARS_DOWNLOAD_LOCATION) + private val filesDownloadPath = sparkConf.get(FILES_DOWNLOAD_LOCATION) - def getAllConfigurationSteps(): Seq[DriverConfigurationStep] = { - val driverCustomLabels = ConfigurationUtils.parsePrefixedKeyValuePairs( - submissionSparkConf, + def getAllConfigurationSteps: Seq[DriverConfigurationStep] = { + val driverCustomLabels = KubernetesUtils.parsePrefixedKeyValuePairs( + sparkConf, KUBERNETES_DRIVER_LABEL_PREFIX) require(!driverCustomLabels.contains(SPARK_APP_ID_LABEL), "Label with key " + s"$SPARK_APP_ID_LABEL is not allowed as it is reserved for Spark bookkeeping " + @@ -64,11 +71,15 @@ private[spark] class DriverConfigurationStepsOrchestrator( s"$SPARK_ROLE_LABEL is not allowed as it is reserved for Spark bookkeeping " + "operations.") + val secretNamesToMountPaths = KubernetesUtils.parsePrefixedKeyValuePairs( + sparkConf, + KUBERNETES_DRIVER_SECRETS_PREFIX) + val allDriverLabels = driverCustomLabels ++ Map( SPARK_APP_ID_LABEL -> kubernetesAppId, SPARK_ROLE_LABEL -> SPARK_POD_DRIVER_ROLE) - val initialSubmissionStep = new BaseDriverConfigurationStep( + val initialSubmissionStep = new BasicDriverConfigurationStep( kubernetesAppId, kubernetesResourceNamePrefix, allDriverLabels, @@ -76,16 +87,16 @@ private[spark] class DriverConfigurationStepsOrchestrator( appName, mainClass, appArgs, - submissionSparkConf) + sparkConf) - val driverAddressStep = new DriverServiceBootstrapStep( + val serviceBootstrapStep = new DriverServiceBootstrapStep( kubernetesResourceNamePrefix, allDriverLabels, - submissionSparkConf, + sparkConf, new SystemClock) val kubernetesCredentialsStep = new DriverKubernetesCredentialsStep( - submissionSparkConf, kubernetesResourceNamePrefix) + sparkConf, kubernetesResourceNamePrefix) val additionalMainAppJar = if (mainAppResource.nonEmpty) { val mayBeResource = mainAppResource.get match { @@ -98,28 +109,62 @@ private[spark] class DriverConfigurationStepsOrchestrator( None } - val sparkJars = submissionSparkConf.getOption("spark.jars") + val sparkJars = sparkConf.getOption("spark.jars") .map(_.split(",")) .getOrElse(Array.empty[String]) ++ additionalMainAppJar.toSeq - val sparkFiles = submissionSparkConf.getOption("spark.files") + val sparkFiles = sparkConf.getOption("spark.files") .map(_.split(",")) .getOrElse(Array.empty[String]) - val maybeDependencyResolutionStep = if (sparkJars.nonEmpty || sparkFiles.nonEmpty) { - Some(new 
DependencyResolutionStep( + val dependencyResolutionStep = if (sparkJars.nonEmpty || sparkFiles.nonEmpty) { + Seq(new DependencyResolutionStep( sparkJars, sparkFiles, jarsDownloadPath, filesDownloadPath)) } else { - None + Nil + } + + val initContainerBootstrapStep = if (existNonContainerLocalFiles(sparkJars ++ sparkFiles)) { + val orchestrator = new InitContainerConfigOrchestrator( + sparkJars, + sparkFiles, + jarsDownloadPath, + filesDownloadPath, + imagePullPolicy, + initContainerConfigMapName, + INIT_CONTAINER_PROPERTIES_FILE_NAME, + sparkConf) + val bootstrapStep = new DriverInitContainerBootstrapStep( + orchestrator.getAllConfigurationSteps, + initContainerConfigMapName, + INIT_CONTAINER_PROPERTIES_FILE_NAME) + + Seq(bootstrapStep) + } else { + Nil + } + + val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { + Seq(new DriverMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) + } else { + Nil } Seq( initialSubmissionStep, - driverAddressStep, + serviceBootstrapStep, kubernetesCredentialsStep) ++ - maybeDependencyResolutionStep.toSeq + dependencyResolutionStep ++ + initContainerBootstrapStep ++ + mountSecretsStep + } + + private def existNonContainerLocalFiles(files: Seq[String]): Boolean = { + files.exists { uri => + Utils.resolveURI(uri).getScheme != "local" + } } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala index 240a1144577b0..5884348cb3e41 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala @@ -80,22 +80,22 @@ private[spark] object ClientArguments { * spark.kubernetes.submission.waitAppCompletion is true. 
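Whether the init-container bootstrap step is added at all hinges on the private helper existNonContainerLocalFiles above. Roughly, with made-up URIs:

```
// Sketch only: the scheme check that decides whether an init-container is needed.
existNonContainerLocalFiles(Seq("local:///opt/spark/examples/app.jar"))
// => false: everything already lives in the container image, so no init-container step.

existNonContainerLocalFiles(Seq("local:///opt/app.jar", "https://example.com/dep.jar"))
// => true: at least one dependency must be localized, so the
//    InitContainerConfigOrchestrator / DriverInitContainerBootstrapStep pair is wired in.
```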
* * @param submissionSteps steps that collectively configure the driver - * @param submissionSparkConf the submission client Spark configuration + * @param sparkConf the submission client Spark configuration * @param kubernetesClient the client to talk to the Kubernetes API server * @param waitForAppCompletion a flag indicating whether the client should wait for the application * to complete * @param appName the application name - * @param loggingPodStatusWatcher a watcher that monitors and logs the application status + * @param watcher a watcher that monitors and logs the application status */ private[spark] class Client( submissionSteps: Seq[DriverConfigurationStep], - submissionSparkConf: SparkConf, + sparkConf: SparkConf, kubernetesClient: KubernetesClient, waitForAppCompletion: Boolean, appName: String, - loggingPodStatusWatcher: LoggingPodStatusWatcher) extends Logging { + watcher: LoggingPodStatusWatcher) extends Logging { - private val driverJavaOptions = submissionSparkConf.get( + private val driverJavaOptions = sparkConf.get( org.apache.spark.internal.config.DRIVER_JAVA_OPTIONS) /** @@ -104,7 +104,7 @@ private[spark] class Client( * will be used to build the Driver Container, Driver Pod, and Kubernetes Resources */ def run(): Unit = { - var currentDriverSpec = KubernetesDriverSpec.initialSpec(submissionSparkConf) + var currentDriverSpec = KubernetesDriverSpec.initialSpec(sparkConf) // submissionSteps contain steps necessary to take, to resolve varying // client arguments that are passed in, created by orchestrator for (nextStep <- submissionSteps) { @@ -141,7 +141,7 @@ private[spark] class Client( kubernetesClient .pods() .withName(resolvedDriverPod.getMetadata.getName) - .watch(loggingPodStatusWatcher)) { _ => + .watch(watcher)) { _ => val createdDriverPod = kubernetesClient.pods().create(resolvedDriverPod) try { if (currentDriverSpec.otherKubernetesResources.nonEmpty) { @@ -157,7 +157,7 @@ private[spark] class Client( if (waitForAppCompletion) { logInfo(s"Waiting for application $appName to finish...") - loggingPodStatusWatcher.awaitCompletion() + watcher.awaitCompletion() logInfo(s"Application $appName finished.") } else { logInfo(s"Deployed Spark application $appName into Kubernetes.") @@ -207,11 +207,9 @@ private[spark] class KubernetesClientApplication extends SparkApplication { val master = sparkConf.get("spark.master").substring("k8s://".length) val loggingInterval = if (waitForAppCompletion) Some(sparkConf.get(REPORT_INTERVAL)) else None - val loggingPodStatusWatcher = new LoggingPodStatusWatcherImpl( - kubernetesAppId, loggingInterval) + val watcher = new LoggingPodStatusWatcherImpl(kubernetesAppId, loggingInterval) - val configurationStepsOrchestrator = new DriverConfigurationStepsOrchestrator( - namespace, + val orchestrator = new DriverConfigOrchestrator( kubernetesAppId, launchTime, clientArguments.mainAppResource, @@ -228,12 +226,12 @@ private[spark] class KubernetesClientApplication extends SparkApplication { None, None)) { kubernetesClient => val client = new Client( - configurationStepsOrchestrator.getAllConfigurationSteps(), + orchestrator.getAllConfigurationSteps, sparkConf, kubernetesClient, waitForAppCompletion, appName, - loggingPodStatusWatcher) + watcher) client.run() } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala similarity index 70% 
rename from resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala rename to resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala index c335fcce4036e..b7a69a7dfd472 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala @@ -22,49 +22,46 @@ import io.fabric8.kubernetes.api.model.{ContainerBuilder, EnvVarBuilder, EnvVarS import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.ConfigurationUtils import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.KubernetesUtils import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec import org.apache.spark.internal.config.{DRIVER_CLASS_PATH, DRIVER_MEMORY, DRIVER_MEMORY_OVERHEAD} /** - * Represents the initial setup required for the driver. + * Performs basic configuration for the driver pod. */ -private[spark] class BaseDriverConfigurationStep( +private[spark] class BasicDriverConfigurationStep( kubernetesAppId: String, - kubernetesResourceNamePrefix: String, + resourceNamePrefix: String, driverLabels: Map[String, String], imagePullPolicy: String, appName: String, mainClass: String, appArgs: Array[String], - submissionSparkConf: SparkConf) extends DriverConfigurationStep { + sparkConf: SparkConf) extends DriverConfigurationStep { - private val kubernetesDriverPodName = submissionSparkConf.get(KUBERNETES_DRIVER_POD_NAME) - .getOrElse(s"$kubernetesResourceNamePrefix-driver") + private val driverPodName = sparkConf + .get(KUBERNETES_DRIVER_POD_NAME) + .getOrElse(s"$resourceNamePrefix-driver") - private val driverExtraClasspath = submissionSparkConf.get( - DRIVER_CLASS_PATH) + private val driverExtraClasspath = sparkConf.get(DRIVER_CLASS_PATH) - private val driverContainerImage = submissionSparkConf + private val driverContainerImage = sparkConf .get(DRIVER_CONTAINER_IMAGE) .getOrElse(throw new SparkException("Must specify the driver container image")) // CPU settings - private val driverCpuCores = submissionSparkConf.getOption("spark.driver.cores").getOrElse("1") - private val driverLimitCores = submissionSparkConf.get(KUBERNETES_DRIVER_LIMIT_CORES) + private val driverCpuCores = sparkConf.getOption("spark.driver.cores").getOrElse("1") + private val driverLimitCores = sparkConf.get(KUBERNETES_DRIVER_LIMIT_CORES) // Memory settings - private val driverMemoryMiB = submissionSparkConf.get( - DRIVER_MEMORY) - private val driverMemoryString = submissionSparkConf.get( - DRIVER_MEMORY.key, - DRIVER_MEMORY.defaultValueString) - private val memoryOverheadMiB = submissionSparkConf + private val driverMemoryMiB = sparkConf.get(DRIVER_MEMORY) + private val driverMemoryString = sparkConf.get( + DRIVER_MEMORY.key, DRIVER_MEMORY.defaultValueString) + private val memoryOverheadMiB = sparkConf .get(DRIVER_MEMORY_OVERHEAD) - .getOrElse(math.max((MEMORY_OVERHEAD_FACTOR * driverMemoryMiB).toInt, - MEMORY_OVERHEAD_MIN_MIB)) - private val driverContainerMemoryWithOverheadMiB = driverMemoryMiB + memoryOverheadMiB + .getOrElse(math.max((MEMORY_OVERHEAD_FACTOR * driverMemoryMiB).toInt, MEMORY_OVERHEAD_MIN_MIB)) + private val driverMemoryWithOverheadMiB = driverMemoryMiB + memoryOverheadMiB override def 
configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { val driverExtraClasspathEnv = driverExtraClasspath.map { classPath => @@ -74,15 +71,13 @@ private[spark] class BaseDriverConfigurationStep( .build() } - val driverCustomAnnotations = ConfigurationUtils - .parsePrefixedKeyValuePairs( - submissionSparkConf, - KUBERNETES_DRIVER_ANNOTATION_PREFIX) + val driverCustomAnnotations = KubernetesUtils.parsePrefixedKeyValuePairs( + sparkConf, KUBERNETES_DRIVER_ANNOTATION_PREFIX) require(!driverCustomAnnotations.contains(SPARK_APP_NAME_ANNOTATION), s"Annotation with key $SPARK_APP_NAME_ANNOTATION is not allowed as it is reserved for" + " Spark bookkeeping operations.") - val driverCustomEnvs = submissionSparkConf.getAllWithPrefix(KUBERNETES_DRIVER_ENV_KEY).toSeq + val driverCustomEnvs = sparkConf.getAllWithPrefix(KUBERNETES_DRIVER_ENV_KEY).toSeq .map { env => new EnvVarBuilder() .withName(env._1) @@ -90,10 +85,10 @@ private[spark] class BaseDriverConfigurationStep( .build() } - val allDriverAnnotations = driverCustomAnnotations ++ Map(SPARK_APP_NAME_ANNOTATION -> appName) + val driverAnnotations = driverCustomAnnotations ++ Map(SPARK_APP_NAME_ANNOTATION -> appName) - val nodeSelector = ConfigurationUtils.parsePrefixedKeyValuePairs( - submissionSparkConf, KUBERNETES_NODE_SELECTOR_PREFIX) + val nodeSelector = KubernetesUtils.parsePrefixedKeyValuePairs( + sparkConf, KUBERNETES_NODE_SELECTOR_PREFIX) val driverCpuQuantity = new QuantityBuilder(false) .withAmount(driverCpuCores) @@ -102,7 +97,7 @@ private[spark] class BaseDriverConfigurationStep( .withAmount(s"${driverMemoryMiB}Mi") .build() val driverMemoryLimitQuantity = new QuantityBuilder(false) - .withAmount(s"${driverContainerMemoryWithOverheadMiB}Mi") + .withAmount(s"${driverMemoryWithOverheadMiB}Mi") .build() val maybeCpuLimitQuantity = driverLimitCores.map { limitCores => ("cpu", new QuantityBuilder(false).withAmount(limitCores).build()) @@ -142,9 +137,9 @@ private[spark] class BaseDriverConfigurationStep( val baseDriverPod = new PodBuilder(driverSpec.driverPod) .editOrNewMetadata() - .withName(kubernetesDriverPodName) + .withName(driverPodName) .addToLabels(driverLabels.asJava) - .addToAnnotations(allDriverAnnotations.asJava) + .addToAnnotations(driverAnnotations.asJava) .endMetadata() .withNewSpec() .withRestartPolicy("Never") @@ -153,9 +148,9 @@ private[spark] class BaseDriverConfigurationStep( .build() val resolvedSparkConf = driverSpec.driverSparkConf.clone() - .setIfMissing(KUBERNETES_DRIVER_POD_NAME, kubernetesDriverPodName) + .setIfMissing(KUBERNETES_DRIVER_POD_NAME, driverPodName) .set("spark.app.id", kubernetesAppId) - .set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, kubernetesResourceNamePrefix) + .set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, resourceNamePrefix) driverSpec.copy( driverPod = baseDriverPod, diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala index 44e0ecffc0e93..d4b83235b4e3b 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala @@ -21,7 +21,8 @@ import java.io.File import io.fabric8.kubernetes.api.model.ContainerBuilder import org.apache.spark.deploy.k8s.Constants._ -import 
org.apache.spark.deploy.k8s.submit.{KubernetesDriverSpec, KubernetesFileUtils} +import org.apache.spark.deploy.k8s.KubernetesUtils +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec /** * Step that configures the classpath, spark.jars, and spark.files for the driver given that the @@ -31,21 +32,22 @@ private[spark] class DependencyResolutionStep( sparkJars: Seq[String], sparkFiles: Seq[String], jarsDownloadPath: String, - localFilesDownloadPath: String) extends DriverConfigurationStep { + filesDownloadPath: String) extends DriverConfigurationStep { override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - val resolvedSparkJars = KubernetesFileUtils.resolveFileUris(sparkJars, jarsDownloadPath) - val resolvedSparkFiles = KubernetesFileUtils.resolveFileUris( - sparkFiles, localFilesDownloadPath) - val sparkConfResolvedSparkDependencies = driverSpec.driverSparkConf.clone() + val resolvedSparkJars = KubernetesUtils.resolveFileUris(sparkJars, jarsDownloadPath) + val resolvedSparkFiles = KubernetesUtils.resolveFileUris(sparkFiles, filesDownloadPath) + + val sparkConf = driverSpec.driverSparkConf.clone() if (resolvedSparkJars.nonEmpty) { - sparkConfResolvedSparkDependencies.set("spark.jars", resolvedSparkJars.mkString(",")) + sparkConf.set("spark.jars", resolvedSparkJars.mkString(",")) } if (resolvedSparkFiles.nonEmpty) { - sparkConfResolvedSparkDependencies.set("spark.files", resolvedSparkFiles.mkString(",")) + sparkConf.set("spark.files", resolvedSparkFiles.mkString(",")) } - val resolvedClasspath = KubernetesFileUtils.resolveFilePaths(sparkJars, jarsDownloadPath) - val driverContainerWithResolvedClasspath = if (resolvedClasspath.nonEmpty) { + + val resolvedClasspath = KubernetesUtils.resolveFilePaths(sparkJars, jarsDownloadPath) + val resolvedDriverContainer = if (resolvedClasspath.nonEmpty) { new ContainerBuilder(driverSpec.driverContainer) .addNewEnv() .withName(ENV_MOUNTED_CLASSPATH) @@ -55,8 +57,9 @@ private[spark] class DependencyResolutionStep( } else { driverSpec.driverContainer } + driverSpec.copy( - driverContainer = driverContainerWithResolvedClasspath, - driverSparkConf = sparkConfResolvedSparkDependencies) + driverContainer = resolvedDriverContainer, + driverSparkConf = sparkConf) } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala index c99c0436cf25f..17614e040e587 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala @@ -19,7 +19,7 @@ package org.apache.spark.deploy.k8s.submit.steps import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec /** - * Represents a step in preparing the Kubernetes driver. + * Represents a step in configuring the Spark driver pod. 
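Every configuration concern is expressed as one such step with a single configureDriver method that maps a KubernetesDriverSpec to a new one. As a sketch of the contract, a hypothetical step (not part of this change) that only pins an extra label on the driver pod could look like:

```
// Sketch only: a made-up step illustrating the DriverConfigurationStep contract.
import io.fabric8.kubernetes.api.model.PodBuilder

private[spark] class AddTeamLabelStep(team: String) extends DriverConfigurationStep {
  override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = {
    val labeledPod = new PodBuilder(driverSpec.driverPod)
      .editOrNewMetadata()
        .addToLabels("team", team)
      .endMetadata()
      .build()
    driverSpec.copy(driverPod = labeledPod)
  }
}
```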
*/ private[spark] trait DriverConfigurationStep { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStep.scala new file mode 100644 index 0000000000000..9fb3dafdda540 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStep.scala @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import java.io.StringWriter +import java.util.Properties + +import io.fabric8.kubernetes.api.model.{ConfigMap, ConfigMapBuilder, ContainerBuilder, HasMetadata} + +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.KubernetesUtils +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec +import org.apache.spark.deploy.k8s.submit.steps.initcontainer.{InitContainerConfigurationStep, InitContainerSpec} + +/** + * Configures the driver init-container that localizes remote dependencies into the driver pod. + * It applies the given InitContainerConfigurationSteps in the given order to produce a final + * InitContainerSpec that is then used to configure the driver pod with the init-container attached. + * It also builds a ConfigMap that will be mounted into the init-container. The ConfigMap carries + * configuration properties for the init-container. 
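The "carries configuration properties" part is literal: the accumulated property map is serialized as a Java properties file and stored under a single config-map key (see the buildConfigMap helper below). A sketch of that serialization with a made-up property:

```
// Sketch only: how a property map becomes the config-map payload (the value is made up).
import java.io.StringWriter
import java.util.Properties

val props = new Properties()
props.setProperty("spark.kubernetes.initContainer.remoteJars", "https://example.com/dep.jar")

val writer = new StringWriter()
props.store(writer, "spark-init properties")
// writer.toString is what ends up under the "spark-init.properties" key of the ConfigMap,
// which the init-container later reads from /etc/spark-init/spark-init.properties.
```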
+ */ +private[spark] class DriverInitContainerBootstrapStep( + steps: Seq[InitContainerConfigurationStep], + configMapName: String, + configMapKey: String) + extends DriverConfigurationStep { + + override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { + var initContainerSpec = InitContainerSpec( + properties = Map.empty[String, String], + driverSparkConf = Map.empty[String, String], + initContainer = new ContainerBuilder().build(), + driverContainer = driverSpec.driverContainer, + driverPod = driverSpec.driverPod, + dependentResources = Seq.empty[HasMetadata]) + for (nextStep <- steps) { + initContainerSpec = nextStep.configureInitContainer(initContainerSpec) + } + + val configMap = buildConfigMap( + configMapName, + configMapKey, + initContainerSpec.properties) + val resolvedDriverSparkConf = driverSpec.driverSparkConf + .clone() + .set(INIT_CONTAINER_CONFIG_MAP_NAME, configMapName) + .set(INIT_CONTAINER_CONFIG_MAP_KEY_CONF, configMapKey) + .setAll(initContainerSpec.driverSparkConf) + val resolvedDriverPod = KubernetesUtils.appendInitContainer( + initContainerSpec.driverPod, initContainerSpec.initContainer) + + driverSpec.copy( + driverPod = resolvedDriverPod, + driverContainer = initContainerSpec.driverContainer, + driverSparkConf = resolvedDriverSparkConf, + otherKubernetesResources = + driverSpec.otherKubernetesResources ++ + initContainerSpec.dependentResources ++ + Seq(configMap)) + } + + private def buildConfigMap( + configMapName: String, + configMapKey: String, + config: Map[String, String]): ConfigMap = { + val properties = new Properties() + config.foreach { entry => + properties.setProperty(entry._1, entry._2) + } + val propertiesWriter = new StringWriter() + properties.store(propertiesWriter, + s"Java properties built from Kubernetes config map with name: $configMapName " + + s"and config map key: $configMapKey") + new ConfigMapBuilder() + .withNewMetadata() + .withName(configMapName) + .endMetadata() + .addToData(configMapKey, propertiesWriter.toString) + .build() + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala new file mode 100644 index 0000000000000..f872e0f4b65d1 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.deploy.k8s.submit.steps + +import org.apache.spark.deploy.k8s.MountSecretsBootstrap +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec + +/** + * A driver configuration step for mounting user-specified secrets onto user-specified paths. + * + * @param bootstrap a utility actually handling mounting of the secrets. + */ +private[spark] class DriverMountSecretsStep( + bootstrap: MountSecretsBootstrap) extends DriverConfigurationStep { + + override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { + val (pod, container) = bootstrap.mountSecrets( + driverSpec.driverPod, driverSpec.driverContainer) + driverSpec.copy( + driverPod = pod, + driverContainer = container + ) + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala index 696d11f15ed95..eb594e4f16ec0 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala @@ -32,21 +32,22 @@ import org.apache.spark.util.Clock * ports should correspond to the ports that the executor will reach the pod at for RPC. */ private[spark] class DriverServiceBootstrapStep( - kubernetesResourceNamePrefix: String, + resourceNamePrefix: String, driverLabels: Map[String, String], - submissionSparkConf: SparkConf, + sparkConf: SparkConf, clock: Clock) extends DriverConfigurationStep with Logging { + import DriverServiceBootstrapStep._ override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - require(submissionSparkConf.getOption(DRIVER_BIND_ADDRESS_KEY).isEmpty, + require(sparkConf.getOption(DRIVER_BIND_ADDRESS_KEY).isEmpty, s"$DRIVER_BIND_ADDRESS_KEY is not supported in Kubernetes mode, as the driver's bind " + "address is managed and set to the driver pod's IP address.") - require(submissionSparkConf.getOption(DRIVER_HOST_KEY).isEmpty, + require(sparkConf.getOption(DRIVER_HOST_KEY).isEmpty, s"$DRIVER_HOST_KEY is not supported in Kubernetes mode, as the driver's hostname will be " + "managed via a Kubernetes service.") - val preferredServiceName = s"$kubernetesResourceNamePrefix$DRIVER_SVC_POSTFIX" + val preferredServiceName = s"$resourceNamePrefix$DRIVER_SVC_POSTFIX" val resolvedServiceName = if (preferredServiceName.length <= MAX_SERVICE_NAME_LENGTH) { preferredServiceName } else { @@ -58,8 +59,8 @@ private[spark] class DriverServiceBootstrapStep( shorterServiceName } - val driverPort = submissionSparkConf.getInt("spark.driver.port", DEFAULT_DRIVER_PORT) - val driverBlockManagerPort = submissionSparkConf.getInt( + val driverPort = sparkConf.getInt("spark.driver.port", DEFAULT_DRIVER_PORT) + val driverBlockManagerPort = sparkConf.getInt( org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT.key, DEFAULT_BLOCKMANAGER_PORT) val driverService = new ServiceBuilder() .withNewMetadata() @@ -81,7 +82,7 @@ private[spark] class DriverServiceBootstrapStep( .endSpec() .build() - val namespace = submissionSparkConf.get(KUBERNETES_NAMESPACE) + val namespace = sparkConf.get(KUBERNETES_NAMESPACE) val driverHostname = s"${driverService.getMetadata.getName}.$namespace.svc.cluster.local" val resolvedSparkConf = driverSpec.driverSparkConf.clone() 
.set(DRIVER_HOST_KEY, driverHostname) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStep.scala new file mode 100644 index 0000000000000..01469853dacc2 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStep.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps.initcontainer + +import org.apache.spark.deploy.k8s.{InitContainerBootstrap, PodWithDetachedInitContainer} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.KubernetesUtils + +/** + * Performs basic configuration for the driver init-container with most of the work delegated to + * the given InitContainerBootstrap. 
+ */ +private[spark] class BasicInitContainerConfigurationStep( + sparkJars: Seq[String], + sparkFiles: Seq[String], + jarsDownloadPath: String, + filesDownloadPath: String, + bootstrap: InitContainerBootstrap) + extends InitContainerConfigurationStep { + + override def configureInitContainer(spec: InitContainerSpec): InitContainerSpec = { + val remoteJarsToDownload = KubernetesUtils.getOnlyRemoteFiles(sparkJars) + val remoteFilesToDownload = KubernetesUtils.getOnlyRemoteFiles(sparkFiles) + val remoteJarsConf = if (remoteJarsToDownload.nonEmpty) { + Map(INIT_CONTAINER_REMOTE_JARS.key -> remoteJarsToDownload.mkString(",")) + } else { + Map() + } + val remoteFilesConf = if (remoteFilesToDownload.nonEmpty) { + Map(INIT_CONTAINER_REMOTE_FILES.key -> remoteFilesToDownload.mkString(",")) + } else { + Map() + } + + val baseInitContainerConfig = Map( + JARS_DOWNLOAD_LOCATION.key -> jarsDownloadPath, + FILES_DOWNLOAD_LOCATION.key -> filesDownloadPath) ++ + remoteJarsConf ++ + remoteFilesConf + + val bootstrapped = bootstrap.bootstrapInitContainer( + PodWithDetachedInitContainer( + spec.driverPod, + spec.initContainer, + spec.driverContainer)) + + spec.copy( + initContainer = bootstrapped.initContainer, + driverContainer = bootstrapped.mainContainer, + driverPod = bootstrapped.pod, + properties = spec.properties ++ baseInitContainerConfig) + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestrator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestrator.scala new file mode 100644 index 0000000000000..f2c29c7ce1076 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestrator.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps.initcontainer + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.deploy.k8s.{InitContainerBootstrap, KubernetesUtils, MountSecretsBootstrap} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ + +/** + * Figures out and returns the complete ordered list of InitContainerConfigurationSteps required to + * configure the driver init-container. The returned steps will be applied in the given order to + * produce a final InitContainerSpec that is used to construct the driver init-container in + * DriverInitContainerBootstrapStep. This class is only used when an init-container is needed, i.e., + * when there are remote application dependencies to localize. 
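For a typical submission with one remote jar and no remote files, the basic step above contributes roughly the following entries to InitContainerSpec.properties (the jar URI is hypothetical; the paths are the defaults from Config above):

```
// Sketch only: the property map produced for one made-up remote jar.
Map(
  "spark.kubernetes.mountDependencies.jarsDownloadDir" -> "/var/spark-data/spark-jars",
  "spark.kubernetes.mountDependencies.filesDownloadDir" -> "/var/spark-data/spark-files",
  "spark.kubernetes.initContainer.remoteJars" -> "https://example.com/dep.jar")
// DriverInitContainerBootstrapStep later serializes this map into the init-container
// ConfigMap that the spark-init process reads at startup.
```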
+ */ +private[spark] class InitContainerConfigOrchestrator( + sparkJars: Seq[String], + sparkFiles: Seq[String], + jarsDownloadPath: String, + filesDownloadPath: String, + imagePullPolicy: String, + configMapName: String, + configMapKey: String, + sparkConf: SparkConf) { + + private val initContainerImage = sparkConf + .get(INIT_CONTAINER_IMAGE) + .getOrElse(throw new SparkException( + "Must specify the init-container image when there are remote dependencies")) + + def getAllConfigurationSteps: Seq[InitContainerConfigurationStep] = { + val initContainerBootstrap = new InitContainerBootstrap( + initContainerImage, + imagePullPolicy, + jarsDownloadPath, + filesDownloadPath, + configMapName, + configMapKey, + SPARK_POD_DRIVER_ROLE, + sparkConf) + val baseStep = new BasicInitContainerConfigurationStep( + sparkJars, + sparkFiles, + jarsDownloadPath, + filesDownloadPath, + initContainerBootstrap) + + val secretNamesToMountPaths = KubernetesUtils.parsePrefixedKeyValuePairs( + sparkConf, + KUBERNETES_DRIVER_SECRETS_PREFIX) + // Mount user-specified driver secrets also into the driver's init-container. The + // init-container may need credentials in the secrets to be able to download remote + // dependencies. The driver's main container and its init-container share the secrets + // because the init-container is sort of an implementation details and this sharing + // avoids introducing a dedicated configuration property just for the init-container. + val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { + Seq(new InitContainerMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) + } else { + Nil + } + + Seq(baseStep) ++ mountSecretsStep + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigurationStep.scala new file mode 100644 index 0000000000000..0372ad5270951 --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigurationStep.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps.initcontainer + +/** + * Represents a step in configuring the driver init-container. 
+ */ +private[spark] trait InitContainerConfigurationStep { + + def configureInitContainer(spec: InitContainerSpec): InitContainerSpec +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala new file mode 100644 index 0000000000000..c0e7bb20cce8c --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps.initcontainer + +import org.apache.spark.deploy.k8s.MountSecretsBootstrap + +/** + * An init-container configuration step for mounting user-specified secrets onto user-specified + * paths. + * + * @param bootstrap a utility actually handling mounting of the secrets + */ +private[spark] class InitContainerMountSecretsStep( + bootstrap: MountSecretsBootstrap) extends InitContainerConfigurationStep { + + override def configureInitContainer(spec: InitContainerSpec) : InitContainerSpec = { + val (driverPod, initContainer) = bootstrap.mountSecrets( + spec.driverPod, + spec.initContainer) + spec.copy( + driverPod = driverPod, + initContainer = initContainer + ) + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerSpec.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerSpec.scala new file mode 100644 index 0000000000000..b52c343f0c0ed --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerSpec.scala @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.deploy.k8s.submit.steps.initcontainer + +import io.fabric8.kubernetes.api.model.{Container, HasMetadata, Pod} + +/** + * Represents a specification of the init-container for the driver pod. + * + * @param properties properties that should be set on the init-container + * @param driverSparkConf Spark configuration properties that will be carried back to the driver + * @param initContainer the init-container object + * @param driverContainer the driver container object + * @param driverPod the driver pod object + * @param dependentResources resources the init-container depends on to work + */ +private[spark] case class InitContainerSpec( + properties: Map[String, String], + driverSparkConf: Map[String, String], + initContainer: Container, + driverContainer: Container, + driverPod: Pod, + dependentResources: Seq[HasMetadata]) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala new file mode 100644 index 0000000000000..4a4b628aedbbf --- /dev/null +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.rest.k8s + +import java.io.File +import java.util.concurrent.TimeUnit + +import scala.concurrent.{ExecutionContext, Future} + +import org.apache.spark.{SecurityManager => SparkSecurityManager, SparkConf} +import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.internal.Logging +import org.apache.spark.util.{ThreadUtils, Utils} + +/** + * Process that fetches files from a resource staging server and/or arbitrary remote locations. + * + * The init-container can handle fetching files from any of those sources, but not all of the + * sources need to be specified. This allows for composing multiple instances of this container + * with different configurations for different download sources, or using the same container to + * download everything at once. 
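Inside the pod, the init-container entry point is handed the mounted properties file as its single argument. A sketch of the startup sequence it performs, using the default paths defined above:

```
// Sketch only: what SparkPodInitContainer.main effectively does inside the pod.
import org.apache.spark.SparkConf
import org.apache.spark.util.Utils

val sparkConf = new SparkConf(true)
Utils.loadDefaultSparkProperties(sparkConf, "/etc/spark-init/spark-init.properties")
// sparkConf now carries spark.kubernetes.initContainer.remoteJars / remoteFiles plus the
// download directories; run() then fetches every remote URI in parallel (bounded by
// spark.kubernetes.mountDependencies.maxSimultaneousDownloads) into
// /var/spark-data/spark-jars and /var/spark-data/spark-files before the main containers start.
```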
+ */ +private[spark] class SparkPodInitContainer( + sparkConf: SparkConf, + fileFetcher: FileFetcher) extends Logging { + + private val maxThreadPoolSize = sparkConf.get(INIT_CONTAINER_MAX_THREAD_POOL_SIZE) + private implicit val downloadExecutor = ExecutionContext.fromExecutorService( + ThreadUtils.newDaemonCachedThreadPool("download-executor", maxThreadPoolSize)) + + private val jarsDownloadDir = new File(sparkConf.get(JARS_DOWNLOAD_LOCATION)) + private val filesDownloadDir = new File(sparkConf.get(FILES_DOWNLOAD_LOCATION)) + + private val remoteJars = sparkConf.get(INIT_CONTAINER_REMOTE_JARS) + private val remoteFiles = sparkConf.get(INIT_CONTAINER_REMOTE_FILES) + + private val downloadTimeoutMinutes = sparkConf.get(INIT_CONTAINER_MOUNT_TIMEOUT) + + def run(): Unit = { + logInfo(s"Downloading remote jars: $remoteJars") + downloadFiles( + remoteJars, + jarsDownloadDir, + s"Remote jars download directory specified at $jarsDownloadDir does not exist " + + "or is not a directory.") + + logInfo(s"Downloading remote files: $remoteFiles") + downloadFiles( + remoteFiles, + filesDownloadDir, + s"Remote files download directory specified at $filesDownloadDir does not exist " + + "or is not a directory.") + + downloadExecutor.shutdown() + downloadExecutor.awaitTermination(downloadTimeoutMinutes, TimeUnit.MINUTES) + } + + private def downloadFiles( + filesCommaSeparated: Option[String], + downloadDir: File, + errMessage: String): Unit = { + filesCommaSeparated.foreach { files => + require(downloadDir.isDirectory, errMessage) + Utils.stringToSeq(files).foreach { file => + Future[Unit] { + fileFetcher.fetchFile(file, downloadDir) + } + } + } + } +} + +private class FileFetcher(sparkConf: SparkConf, securityManager: SparkSecurityManager) { + + def fetchFile(uri: String, targetDir: File): Unit = { + Utils.fetchFile( + url = uri, + targetDir = targetDir, + conf = sparkConf, + securityMgr = securityManager, + hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf), + timestamp = System.currentTimeMillis(), + useCache = false) + } +} + +object SparkPodInitContainer extends Logging { + + def main(args: Array[String]): Unit = { + logInfo("Starting init-container to download Spark application dependencies.") + val sparkConf = new SparkConf(true) + if (args.nonEmpty) { + Utils.loadDefaultSparkProperties(sparkConf, args(0)) + } + + val securityManager = new SparkSecurityManager(sparkConf) + val fileFetcher = new FileFetcher(sparkConf, securityManager) + new SparkPodInitContainer(sparkConf, fileFetcher).run() + logInfo("Finished downloading application dependencies.") + } +} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index 70226157dd68b..ba5d891f4c77e 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -21,35 +21,35 @@ import scala.collection.JavaConverters._ import io.fabric8.kubernetes.api.model._ import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.deploy.k8s.{InitContainerBootstrap, KubernetesUtils, MountSecretsBootstrap, PodWithDetachedInitContainer} import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.ConfigurationUtils import 
org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.internal.config.{EXECUTOR_CLASS_PATH, EXECUTOR_JAVA_OPTIONS, EXECUTOR_MEMORY, EXECUTOR_MEMORY_OVERHEAD} import org.apache.spark.util.Utils /** - * A factory class for configuring and creating executor pods. + * A factory class for bootstrapping and creating executor pods with the given bootstrapping + * components. + * + * @param sparkConf Spark configuration + * @param mountSecretsBootstrap an optional component for mounting user-specified secrets onto + * user-specified paths into the executor container + * @param initContainerBootstrap an optional component for bootstrapping the executor init-container + * if one is needed, i.e., when there are remote dependencies to + * localize + * @param initContainerMountSecretsBootstrap an optional component for mounting user-specified + * secrets onto user-specified paths into the executor + * init-container */ -private[spark] trait ExecutorPodFactory { - - /** - * Configure and construct an executor pod with the given parameters. - */ - def createExecutorPod( - executorId: String, - applicationId: String, - driverUrl: String, - executorEnvs: Seq[(String, String)], - driverPod: Pod, - nodeToLocalTaskCount: Map[String, Int]): Pod -} - -private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) - extends ExecutorPodFactory { +private[spark] class ExecutorPodFactory( + sparkConf: SparkConf, + mountSecretsBootstrap: Option[MountSecretsBootstrap], + initContainerBootstrap: Option[InitContainerBootstrap], + initContainerMountSecretsBootstrap: Option[MountSecretsBootstrap]) { private val executorExtraClasspath = sparkConf.get(EXECUTOR_CLASS_PATH) - private val executorLabels = ConfigurationUtils.parsePrefixedKeyValuePairs( + private val executorLabels = KubernetesUtils.parsePrefixedKeyValuePairs( sparkConf, KUBERNETES_EXECUTOR_LABEL_PREFIX) require( @@ -64,11 +64,11 @@ private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) s"Custom executor labels cannot contain $SPARK_ROLE_LABEL as it is reserved for Spark.") private val executorAnnotations = - ConfigurationUtils.parsePrefixedKeyValuePairs( + KubernetesUtils.parsePrefixedKeyValuePairs( sparkConf, KUBERNETES_EXECUTOR_ANNOTATION_PREFIX) private val nodeSelector = - ConfigurationUtils.parsePrefixedKeyValuePairs( + KubernetesUtils.parsePrefixedKeyValuePairs( sparkConf, KUBERNETES_NODE_SELECTOR_PREFIX) @@ -94,7 +94,10 @@ private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) private val executorCores = sparkConf.getDouble("spark.executor.cores", 1) private val executorLimitCores = sparkConf.get(KUBERNETES_EXECUTOR_LIMIT_CORES) - override def createExecutorPod( + /** + * Configure and construct an executor pod with the given parameters. 
+ */ + def createExecutorPod( executorId: String, applicationId: String, driverUrl: String, @@ -198,7 +201,7 @@ private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) .endSpec() .build() - val containerWithExecutorLimitCores = executorLimitCores.map { limitCores => + val containerWithLimitCores = executorLimitCores.map { limitCores => val executorCpuLimitQuantity = new QuantityBuilder(false) .withAmount(limitCores) .build() @@ -209,9 +212,33 @@ private[spark] class ExecutorPodFactoryImpl(sparkConf: SparkConf) .build() }.getOrElse(executorContainer) - new PodBuilder(executorPod) + val (maybeSecretsMountedPod, maybeSecretsMountedContainer) = + mountSecretsBootstrap.map { bootstrap => + bootstrap.mountSecrets(executorPod, containerWithLimitCores) + }.getOrElse((executorPod, containerWithLimitCores)) + + val (bootstrappedPod, bootstrappedContainer) = + initContainerBootstrap.map { bootstrap => + val podWithInitContainer = bootstrap.bootstrapInitContainer( + PodWithDetachedInitContainer( + maybeSecretsMountedPod, + new ContainerBuilder().build(), + maybeSecretsMountedContainer)) + + val (pod, mayBeSecretsMountedInitContainer) = + initContainerMountSecretsBootstrap.map { bootstrap => + bootstrap.mountSecrets(podWithInitContainer.pod, podWithInitContainer.initContainer) + }.getOrElse((podWithInitContainer.pod, podWithInitContainer.initContainer)) + + val bootstrappedPod = KubernetesUtils.appendInitContainer( + pod, mayBeSecretsMountedInitContainer) + + (bootstrappedPod, podWithInitContainer.mainContainer) + }.getOrElse((maybeSecretsMountedPod, maybeSecretsMountedContainer)) + + new PodBuilder(bootstrappedPod) .editSpec() - .addToContainers(containerWithExecutorLimitCores) + .addToContainers(bootstrappedContainer) .endSpec() .build() } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala index b8bb152d17910..a942db6ae02db 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala @@ -21,9 +21,9 @@ import java.io.File import io.fabric8.kubernetes.client.Config import org.apache.spark.{SparkContext, SparkException} +import org.apache.spark.deploy.k8s.{InitContainerBootstrap, KubernetesUtils, MountSecretsBootstrap, SparkKubernetesClientFactory} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.SparkKubernetesClientFactory import org.apache.spark.internal.Logging import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} import org.apache.spark.util.ThreadUtils @@ -45,6 +45,59 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager wit masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { val sparkConf = sc.getConf + val initContainerConfigMap = sparkConf.get(INIT_CONTAINER_CONFIG_MAP_NAME) + val initContainerConfigMapKey = sparkConf.get(INIT_CONTAINER_CONFIG_MAP_KEY_CONF) + + if (initContainerConfigMap.isEmpty) { + logWarning("The executor's init-container config map is not specified. 
Executors will " + + "therefore not attempt to fetch remote or submitted dependencies.") + } + + if (initContainerConfigMapKey.isEmpty) { + logWarning("The executor's init-container config map key is not specified. Executors will " + + "therefore not attempt to fetch remote or submitted dependencies.") + } + + // Only set up the bootstrap if the user has provided both the config map key and the config map + // name. The config map might not be provided if init-containers aren't being used to + // bootstrap dependencies. + val initContainerBootstrap = for { + configMap <- initContainerConfigMap + configMapKey <- initContainerConfigMapKey + } yield { + val initContainerImage = sparkConf + .get(INIT_CONTAINER_IMAGE) + .getOrElse(throw new SparkException( + "Must specify the init-container image when there are remote dependencies")) + new InitContainerBootstrap( + initContainerImage, + sparkConf.get(CONTAINER_IMAGE_PULL_POLICY), + sparkConf.get(JARS_DOWNLOAD_LOCATION), + sparkConf.get(FILES_DOWNLOAD_LOCATION), + configMap, + configMapKey, + SPARK_POD_EXECUTOR_ROLE, + sparkConf) + } + + val executorSecretNamesToMountPaths = KubernetesUtils.parsePrefixedKeyValuePairs( + sparkConf, KUBERNETES_EXECUTOR_SECRETS_PREFIX) + val mountSecretBootstrap = if (executorSecretNamesToMountPaths.nonEmpty) { + Some(new MountSecretsBootstrap(executorSecretNamesToMountPaths)) + } else { + None + } + // Mount user-specified executor secrets also into the executor's init-container. The + // init-container may need credentials in the secrets to be able to download remote + // dependencies. The executor's main container and its init-container share the secrets + // because the init-container is essentially an implementation detail and this sharing + // avoids introducing a dedicated configuration property just for the init-container.
+ val initContainerMountSecretsBootstrap = if (initContainerBootstrap.nonEmpty && + executorSecretNamesToMountPaths.nonEmpty) { + Some(new MountSecretsBootstrap(executorSecretNamesToMountPaths)) + } else { + None + } val kubernetesClient = SparkKubernetesClientFactory.createKubernetesClient( KUBERNETES_MASTER_INTERNAL_URL, @@ -54,7 +107,12 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager wit Some(new File(Config.KUBERNETES_SERVICE_ACCOUNT_TOKEN_PATH)), Some(new File(Config.KUBERNETES_SERVICE_ACCOUNT_CA_CRT_PATH))) - val executorPodFactory = new ExecutorPodFactoryImpl(sparkConf) + val executorPodFactory = new ExecutorPodFactory( + sparkConf, + mountSecretBootstrap, + initContainerBootstrap, + initContainerMountSecretsBootstrap) + val allocatorExecutor = ThreadUtils .newDaemonSingleThreadScheduledExecutor("kubernetes-pod-allocator") val requestExecutorsService = ThreadUtils.newDaemonCachedThreadPool( diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala similarity index 51% rename from resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala rename to resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala index 98f9f27da5cde..f193b1f4d3664 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigurationStepsOrchestratorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala @@ -17,25 +17,27 @@ package org.apache.spark.deploy.k8s.submit import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Config.DRIVER_CONTAINER_IMAGE +import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.submit.steps._ -class DriverConfigurationStepsOrchestratorSuite extends SparkFunSuite { +class DriverConfigOrchestratorSuite extends SparkFunSuite { - private val NAMESPACE = "default" private val DRIVER_IMAGE = "driver-image" + private val IC_IMAGE = "init-container-image" private val APP_ID = "spark-app-id" private val LAUNCH_TIME = 975256L private val APP_NAME = "spark" private val MAIN_CLASS = "org.apache.spark.examples.SparkPi" private val APP_ARGS = Array("arg1", "arg2") + private val SECRET_FOO = "foo" + private val SECRET_BAR = "bar" + private val SECRET_MOUNT_PATH = "/etc/secrets/driver" test("Base submission steps with a main app resource.") { val sparkConf = new SparkConf(false) .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") - val orchestrator = new DriverConfigurationStepsOrchestrator( - NAMESPACE, + val orchestrator = new DriverConfigOrchestrator( APP_ID, LAUNCH_TIME, Some(mainAppResource), @@ -45,7 +47,7 @@ class DriverConfigurationStepsOrchestratorSuite extends SparkFunSuite { sparkConf) validateStepTypes( orchestrator, - classOf[BaseDriverConfigurationStep], + classOf[BasicDriverConfigurationStep], classOf[DriverServiceBootstrapStep], classOf[DriverKubernetesCredentialsStep], classOf[DependencyResolutionStep] @@ -55,8 +57,7 @@ class DriverConfigurationStepsOrchestratorSuite extends SparkFunSuite { test("Base submission steps without a main app resource.") { val sparkConf 
= new SparkConf(false) .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) - val orchestrator = new DriverConfigurationStepsOrchestrator( - NAMESPACE, + val orchestrator = new DriverConfigOrchestrator( APP_ID, LAUNCH_TIME, Option.empty, @@ -66,16 +67,62 @@ class DriverConfigurationStepsOrchestratorSuite extends SparkFunSuite { sparkConf) validateStepTypes( orchestrator, - classOf[BaseDriverConfigurationStep], + classOf[BasicDriverConfigurationStep], classOf[DriverServiceBootstrapStep], classOf[DriverKubernetesCredentialsStep] ) } + test("Submission steps with an init-container.") { + val sparkConf = new SparkConf(false) + .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + .set(INIT_CONTAINER_IMAGE, IC_IMAGE) + .set("spark.jars", "hdfs://localhost:9000/var/apps/jars/jar1.jar") + val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") + val orchestrator = new DriverConfigOrchestrator( + APP_ID, + LAUNCH_TIME, + Some(mainAppResource), + APP_NAME, + MAIN_CLASS, + APP_ARGS, + sparkConf) + validateStepTypes( + orchestrator, + classOf[BasicDriverConfigurationStep], + classOf[DriverServiceBootstrapStep], + classOf[DriverKubernetesCredentialsStep], + classOf[DependencyResolutionStep], + classOf[DriverInitContainerBootstrapStep]) + } + + test("Submission steps with driver secrets to mount") { + val sparkConf = new SparkConf(false) + .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_FOO", SECRET_MOUNT_PATH) + .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_BAR", SECRET_MOUNT_PATH) + val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") + val orchestrator = new DriverConfigOrchestrator( + APP_ID, + LAUNCH_TIME, + Some(mainAppResource), + APP_NAME, + MAIN_CLASS, + APP_ARGS, + sparkConf) + validateStepTypes( + orchestrator, + classOf[BasicDriverConfigurationStep], + classOf[DriverServiceBootstrapStep], + classOf[DriverKubernetesCredentialsStep], + classOf[DependencyResolutionStep], + classOf[DriverMountSecretsStep]) + } + private def validateStepTypes( - orchestrator: DriverConfigurationStepsOrchestrator, + orchestrator: DriverConfigOrchestrator, types: Class[_ <: DriverConfigurationStep]*): Unit = { - val steps = orchestrator.getAllConfigurationSteps() + val steps = orchestrator.getAllConfigurationSteps assert(steps.size === types.size) assert(steps.map(_.getClass) === types) } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala new file mode 100644 index 0000000000000..8388c16ded268 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model.{Container, Pod} + +private[spark] object SecretVolumeUtils { + + def podHasVolume(driverPod: Pod, volumeName: String): Boolean = { + driverPod.getSpec.getVolumes.asScala.exists(volume => volume.getName == volumeName) + } + + def containerHasVolume( + driverContainer: Container, + volumeName: String, + mountPath: String): Boolean = { + driverContainer.getVolumeMounts.asScala.exists(volumeMount => + volumeMount.getName == volumeName && volumeMount.getMountPath == mountPath) + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala similarity index 97% rename from resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala rename to resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala index f7c1b3142cf71..e864c6a16eeb1 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BaseDriverConfigurationStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala @@ -25,7 +25,7 @@ import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec -class BaseDriverConfigurationStepSuite extends SparkFunSuite { +class BasicDriverConfigurationStepSuite extends SparkFunSuite { private val APP_ID = "spark-app-id" private val RESOURCE_NAME_PREFIX = "spark" @@ -52,7 +52,7 @@ class BaseDriverConfigurationStepSuite extends SparkFunSuite { .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY1", "customDriverEnv1") .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY2", "customDriverEnv2") - val submissionStep = new BaseDriverConfigurationStep( + val submissionStep = new BasicDriverConfigurationStep( APP_ID, RESOURCE_NAME_PREFIX, DRIVER_LABELS, diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStepSuite.scala new file mode 100644 index 0000000000000..758871e2ba356 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStepSuite.scala @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import java.io.StringReader +import java.util.Properties + +import scala.collection.JavaConverters._ + +import com.google.common.collect.Maps +import io.fabric8.kubernetes.api.model.{ConfigMap, ContainerBuilder, HasMetadata, PodBuilder, SecretBuilder} + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec +import org.apache.spark.deploy.k8s.submit.steps.initcontainer.{InitContainerConfigurationStep, InitContainerSpec} +import org.apache.spark.util.Utils + +class DriverInitContainerBootstrapStepSuite extends SparkFunSuite { + + private val CONFIG_MAP_NAME = "spark-init-config-map" + private val CONFIG_MAP_KEY = "spark-init-config-map-key" + + test("The init container bootstrap step should use all of the init container steps") { + val baseDriverSpec = KubernetesDriverSpec( + driverPod = new PodBuilder().build(), + driverContainer = new ContainerBuilder().build(), + driverSparkConf = new SparkConf(false), + otherKubernetesResources = Seq.empty[HasMetadata]) + val initContainerSteps = Seq( + FirstTestInitContainerConfigurationStep, + SecondTestInitContainerConfigurationStep) + val bootstrapStep = new DriverInitContainerBootstrapStep( + initContainerSteps, + CONFIG_MAP_NAME, + CONFIG_MAP_KEY) + + val preparedDriverSpec = bootstrapStep.configureDriver(baseDriverSpec) + + assert(preparedDriverSpec.driverPod.getMetadata.getLabels.asScala === + FirstTestInitContainerConfigurationStep.additionalLabels) + val additionalDriverEnv = preparedDriverSpec.driverContainer.getEnv.asScala + assert(additionalDriverEnv.size === 1) + assert(additionalDriverEnv.head.getName === + FirstTestInitContainerConfigurationStep.additionalMainContainerEnvKey) + assert(additionalDriverEnv.head.getValue === + FirstTestInitContainerConfigurationStep.additionalMainContainerEnvValue) + + assert(preparedDriverSpec.otherKubernetesResources.size === 2) + assert(preparedDriverSpec.otherKubernetesResources.contains( + FirstTestInitContainerConfigurationStep.additionalKubernetesResource)) + assert(preparedDriverSpec.otherKubernetesResources.exists { + case configMap: ConfigMap => + val hasMatchingName = configMap.getMetadata.getName == CONFIG_MAP_NAME + val configMapData = configMap.getData.asScala + val hasCorrectNumberOfEntries = configMapData.size == 1 + val initContainerPropertiesRaw = configMapData(CONFIG_MAP_KEY) + val initContainerProperties = new Properties() + Utils.tryWithResource(new StringReader(initContainerPropertiesRaw)) { + initContainerProperties.load(_) + } + val initContainerPropertiesMap = Maps.fromProperties(initContainerProperties).asScala + val expectedInitContainerProperties = Map( + SecondTestInitContainerConfigurationStep.additionalInitContainerPropertyKey -> + SecondTestInitContainerConfigurationStep.additionalInitContainerPropertyValue) + val hasMatchingProperties = initContainerPropertiesMap == expectedInitContainerProperties + hasMatchingName && hasCorrectNumberOfEntries && hasMatchingProperties + + case _ => false + }) + + val 
initContainers = preparedDriverSpec.driverPod.getSpec.getInitContainers + assert(initContainers.size() === 1) + val initContainerEnv = initContainers.get(0).getEnv.asScala + assert(initContainerEnv.size === 1) + assert(initContainerEnv.head.getName === + SecondTestInitContainerConfigurationStep.additionalInitContainerEnvKey) + assert(initContainerEnv.head.getValue === + SecondTestInitContainerConfigurationStep.additionalInitContainerEnvValue) + + val expectedSparkConf = Map( + INIT_CONTAINER_CONFIG_MAP_NAME.key -> CONFIG_MAP_NAME, + INIT_CONTAINER_CONFIG_MAP_KEY_CONF.key -> CONFIG_MAP_KEY, + SecondTestInitContainerConfigurationStep.additionalDriverSparkConfKey -> + SecondTestInitContainerConfigurationStep.additionalDriverSparkConfValue) + assert(preparedDriverSpec.driverSparkConf.getAll.toMap === expectedSparkConf) + } +} + +private object FirstTestInitContainerConfigurationStep extends InitContainerConfigurationStep { + + val additionalLabels = Map("additionalLabelkey" -> "additionalLabelValue") + val additionalMainContainerEnvKey = "TEST_ENV_MAIN_KEY" + val additionalMainContainerEnvValue = "TEST_ENV_MAIN_VALUE" + val additionalKubernetesResource = new SecretBuilder() + .withNewMetadata() + .withName("test-secret") + .endMetadata() + .addToData("secret-key", "secret-value") + .build() + + override def configureInitContainer(initContainerSpec: InitContainerSpec): InitContainerSpec = { + val driverPod = new PodBuilder(initContainerSpec.driverPod) + .editOrNewMetadata() + .addToLabels(additionalLabels.asJava) + .endMetadata() + .build() + val mainContainer = new ContainerBuilder(initContainerSpec.driverContainer) + .addNewEnv() + .withName(additionalMainContainerEnvKey) + .withValue(additionalMainContainerEnvValue) + .endEnv() + .build() + initContainerSpec.copy( + driverPod = driverPod, + driverContainer = mainContainer, + dependentResources = initContainerSpec.dependentResources ++ + Seq(additionalKubernetesResource)) + } +} + +private object SecondTestInitContainerConfigurationStep extends InitContainerConfigurationStep { + val additionalInitContainerEnvKey = "TEST_ENV_INIT_KEY" + val additionalInitContainerEnvValue = "TEST_ENV_INIT_VALUE" + val additionalInitContainerPropertyKey = "spark.initcontainer.testkey" + val additionalInitContainerPropertyValue = "testvalue" + val additionalDriverSparkConfKey = "spark.driver.testkey" + val additionalDriverSparkConfValue = "spark.driver.testvalue" + + override def configureInitContainer(initContainerSpec: InitContainerSpec): InitContainerSpec = { + val initContainer = new ContainerBuilder(initContainerSpec.initContainer) + .addNewEnv() + .withName(additionalInitContainerEnvKey) + .withValue(additionalInitContainerEnvValue) + .endEnv() + .build() + val initContainerProperties = initContainerSpec.properties ++ + Map(additionalInitContainerPropertyKey -> additionalInitContainerPropertyValue) + val driverSparkConf = initContainerSpec.driverSparkConf ++ + Map(additionalDriverSparkConfKey -> additionalDriverSparkConfValue) + initContainerSpec.copy( + initContainer = initContainer, + properties = initContainerProperties, + driverSparkConf = driverSparkConf) + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala new file mode 100644 index 0000000000000..9ec0cb55de5aa --- /dev/null +++ 
b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.MountSecretsBootstrap +import org.apache.spark.deploy.k8s.submit.{KubernetesDriverSpec, SecretVolumeUtils} + +class DriverMountSecretsStepSuite extends SparkFunSuite { + + private val SECRET_FOO = "foo" + private val SECRET_BAR = "bar" + private val SECRET_MOUNT_PATH = "/etc/secrets/driver" + + test("mounts all given secrets") { + val baseDriverSpec = KubernetesDriverSpec.initialSpec(new SparkConf(false)) + val secretNamesToMountPaths = Map( + SECRET_FOO -> SECRET_MOUNT_PATH, + SECRET_BAR -> SECRET_MOUNT_PATH) + + val mountSecretsBootstrap = new MountSecretsBootstrap(secretNamesToMountPaths) + val mountSecretsStep = new DriverMountSecretsStep(mountSecretsBootstrap) + val configuredDriverSpec = mountSecretsStep.configureDriver(baseDriverSpec) + val driverPodWithSecretsMounted = configuredDriverSpec.driverPod + val driverContainerWithSecretsMounted = configuredDriverSpec.driverContainer + + Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach { volumeName => + assert(SecretVolumeUtils.podHasVolume(driverPodWithSecretsMounted, volumeName)) + } + Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach { volumeName => + assert(SecretVolumeUtils.containerHasVolume( + driverContainerWithSecretsMounted, volumeName, SECRET_MOUNT_PATH)) + } + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStepSuite.scala new file mode 100644 index 0000000000000..4553f9f6b1d45 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStepSuite.scala @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps.initcontainer + +import scala.collection.JavaConverters._ + +import io.fabric8.kubernetes.api.model._ +import org.mockito.{Mock, MockitoAnnotations} +import org.mockito.Matchers.any +import org.mockito.Mockito.when +import org.mockito.invocation.InvocationOnMock +import org.mockito.stubbing.Answer +import org.scalatest.BeforeAndAfter + +import org.apache.spark.SparkFunSuite +import org.apache.spark.deploy.k8s.{InitContainerBootstrap, PodWithDetachedInitContainer} +import org.apache.spark.deploy.k8s.Config._ + +class BasicInitContainerConfigurationStepSuite extends SparkFunSuite with BeforeAndAfter { + + private val SPARK_JARS = Seq( + "hdfs://localhost:9000/app/jars/jar1.jar", "file:///app/jars/jar2.jar") + private val SPARK_FILES = Seq( + "hdfs://localhost:9000/app/files/file1.txt", "file:///app/files/file2.txt") + private val JARS_DOWNLOAD_PATH = "/var/data/jars" + private val FILES_DOWNLOAD_PATH = "/var/data/files" + private val POD_LABEL = Map("bootstrap" -> "true") + private val INIT_CONTAINER_NAME = "init-container" + private val DRIVER_CONTAINER_NAME = "driver-container" + + @Mock + private var podAndInitContainerBootstrap : InitContainerBootstrap = _ + + before { + MockitoAnnotations.initMocks(this) + when(podAndInitContainerBootstrap.bootstrapInitContainer( + any[PodWithDetachedInitContainer])).thenAnswer(new Answer[PodWithDetachedInitContainer] { + override def answer(invocation: InvocationOnMock) : PodWithDetachedInitContainer = { + val pod = invocation.getArgumentAt(0, classOf[PodWithDetachedInitContainer]) + pod.copy( + pod = new PodBuilder(pod.pod) + .withNewMetadata() + .addToLabels("bootstrap", "true") + .endMetadata() + .withNewSpec().endSpec() + .build(), + initContainer = new ContainerBuilder() + .withName(INIT_CONTAINER_NAME) + .build(), + mainContainer = new ContainerBuilder() + .withName(DRIVER_CONTAINER_NAME) + .build() + )}}) + } + + test("additionalDriverSparkConf with mix of remote files and jars") { + val baseInitStep = new BasicInitContainerConfigurationStep( + SPARK_JARS, + SPARK_FILES, + JARS_DOWNLOAD_PATH, + FILES_DOWNLOAD_PATH, + podAndInitContainerBootstrap) + val expectedDriverSparkConf = Map( + JARS_DOWNLOAD_LOCATION.key -> JARS_DOWNLOAD_PATH, + FILES_DOWNLOAD_LOCATION.key -> FILES_DOWNLOAD_PATH, + INIT_CONTAINER_REMOTE_JARS.key -> "hdfs://localhost:9000/app/jars/jar1.jar", + INIT_CONTAINER_REMOTE_FILES.key -> "hdfs://localhost:9000/app/files/file1.txt") + val initContainerSpec = InitContainerSpec( + Map.empty[String, String], + Map.empty[String, String], + new Container(), + new Container(), + new Pod, + Seq.empty[HasMetadata]) + val returnContainerSpec = baseInitStep.configureInitContainer(initContainerSpec) + assert(expectedDriverSparkConf === returnContainerSpec.properties) + assert(returnContainerSpec.initContainer.getName === INIT_CONTAINER_NAME) + assert(returnContainerSpec.driverContainer.getName === DRIVER_CONTAINER_NAME) + assert(returnContainerSpec.driverPod.getMetadata.getLabels.asScala === POD_LABEL) + } +} diff --git 
a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala new file mode 100644 index 0000000000000..20f2e5bc15df3 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps.initcontainer + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.deploy.k8s.Constants._ + +class InitContainerConfigOrchestratorSuite extends SparkFunSuite { + + private val DOCKER_IMAGE = "init-container" + private val SPARK_JARS = Seq( + "hdfs://localhost:9000/app/jars/jar1.jar", "file:///app/jars/jar2.jar") + private val SPARK_FILES = Seq( + "hdfs://localhost:9000/app/files/file1.txt", "file:///app/files/file2.txt") + private val JARS_DOWNLOAD_PATH = "/var/data/jars" + private val FILES_DOWNLOAD_PATH = "/var/data/files" + private val DOCKER_IMAGE_PULL_POLICY: String = "IfNotPresent" + private val CUSTOM_LABEL_KEY = "customLabel" + private val CUSTOM_LABEL_VALUE = "customLabelValue" + private val INIT_CONTAINER_CONFIG_MAP_NAME = "spark-init-config-map" + private val INIT_CONTAINER_CONFIG_MAP_KEY = "spark-init-config-map-key" + private val SECRET_FOO = "foo" + private val SECRET_BAR = "bar" + private val SECRET_MOUNT_PATH = "/etc/secrets/init-container" + + test("including basic configuration step") { + val sparkConf = new SparkConf(true) + .set(INIT_CONTAINER_IMAGE, DOCKER_IMAGE) + .set(s"$KUBERNETES_DRIVER_LABEL_PREFIX$CUSTOM_LABEL_KEY", CUSTOM_LABEL_VALUE) + + val orchestrator = new InitContainerConfigOrchestrator( + SPARK_JARS.take(1), + SPARK_FILES, + JARS_DOWNLOAD_PATH, + FILES_DOWNLOAD_PATH, + DOCKER_IMAGE_PULL_POLICY, + INIT_CONTAINER_CONFIG_MAP_NAME, + INIT_CONTAINER_CONFIG_MAP_KEY, + sparkConf) + val initSteps = orchestrator.getAllConfigurationSteps + assert(initSteps.lengthCompare(1) == 0) + assert(initSteps.head.isInstanceOf[BasicInitContainerConfigurationStep]) + } + + test("including step to mount user-specified secrets") { + val sparkConf = new SparkConf(false) + .set(INIT_CONTAINER_IMAGE, DOCKER_IMAGE) + .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_FOO", SECRET_MOUNT_PATH) + .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_BAR", SECRET_MOUNT_PATH) + + val orchestrator = new InitContainerConfigOrchestrator( + SPARK_JARS.take(1), + SPARK_FILES, + JARS_DOWNLOAD_PATH, + FILES_DOWNLOAD_PATH, + DOCKER_IMAGE_PULL_POLICY, + 
INIT_CONTAINER_CONFIG_MAP_NAME, + INIT_CONTAINER_CONFIG_MAP_KEY, + sparkConf) + val initSteps = orchestrator.getAllConfigurationSteps + assert(initSteps.length === 2) + assert(initSteps.head.isInstanceOf[BasicInitContainerConfigurationStep]) + assert(initSteps(1).isInstanceOf[InitContainerMountSecretsStep]) + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala new file mode 100644 index 0000000000000..eab4e17659456 --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.k8s.submit.steps.initcontainer + +import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder} + +import org.apache.spark.SparkFunSuite +import org.apache.spark.deploy.k8s.MountSecretsBootstrap +import org.apache.spark.deploy.k8s.submit.SecretVolumeUtils + +class InitContainerMountSecretsStepSuite extends SparkFunSuite { + + private val SECRET_FOO = "foo" + private val SECRET_BAR = "bar" + private val SECRET_MOUNT_PATH = "/etc/secrets/init-container" + + test("mounts all given secrets") { + val baseInitContainerSpec = InitContainerSpec( + Map.empty, + Map.empty, + new ContainerBuilder().build(), + new ContainerBuilder().build(), + new PodBuilder().withNewMetadata().endMetadata().withNewSpec().endSpec().build(), + Seq.empty) + val secretNamesToMountPaths = Map( + SECRET_FOO -> SECRET_MOUNT_PATH, + SECRET_BAR -> SECRET_MOUNT_PATH) + + val mountSecretsBootstrap = new MountSecretsBootstrap(secretNamesToMountPaths) + val initContainerMountSecretsStep = new InitContainerMountSecretsStep(mountSecretsBootstrap) + val configuredInitContainerSpec = initContainerMountSecretsStep.configureInitContainer( + baseInitContainerSpec) + + val podWithSecretsMounted = configuredInitContainerSpec.driverPod + val initContainerWithSecretsMounted = configuredInitContainerSpec.initContainer + + Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach(volumeName => + assert(SecretVolumeUtils.podHasVolume(podWithSecretsMounted, volumeName))) + Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach(volumeName => + assert(SecretVolumeUtils.containerHasVolume( + initContainerWithSecretsMounted, volumeName, SECRET_MOUNT_PATH))) + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala 
b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala new file mode 100644 index 0000000000000..6c557ec4a7c9a --- /dev/null +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.deploy.rest.k8s + +import java.io.File +import java.util.UUID + +import com.google.common.base.Charsets +import com.google.common.io.Files +import org.mockito.Mockito +import org.scalatest.BeforeAndAfter +import org.scalatest.mockito.MockitoSugar._ + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.Config._ +import org.apache.spark.util.Utils + +class SparkPodInitContainerSuite extends SparkFunSuite with BeforeAndAfter { + + private val DOWNLOAD_JARS_SECRET_LOCATION = createTempFile("txt") + private val DOWNLOAD_FILES_SECRET_LOCATION = createTempFile("txt") + + private var downloadJarsDir: File = _ + private var downloadFilesDir: File = _ + private var downloadJarsSecretValue: String = _ + private var downloadFilesSecretValue: String = _ + private var fileFetcher: FileFetcher = _ + + override def beforeAll(): Unit = { + downloadJarsSecretValue = Files.toString( + new File(DOWNLOAD_JARS_SECRET_LOCATION), Charsets.UTF_8) + downloadFilesSecretValue = Files.toString( + new File(DOWNLOAD_FILES_SECRET_LOCATION), Charsets.UTF_8) + } + + before { + downloadJarsDir = Utils.createTempDir() + downloadFilesDir = Utils.createTempDir() + fileFetcher = mock[FileFetcher] + } + + after { + downloadJarsDir.delete() + downloadFilesDir.delete() + } + + test("Downloads from remote server should invoke the file fetcher") { + val sparkConf = getSparkConfForRemoteFileDownloads + val initContainerUnderTest = new SparkPodInitContainer(sparkConf, fileFetcher) + initContainerUnderTest.run() + Mockito.verify(fileFetcher).fetchFile("http://localhost:9000/jar1.jar", downloadJarsDir) + Mockito.verify(fileFetcher).fetchFile("hdfs://localhost:9000/jar2.jar", downloadJarsDir) + Mockito.verify(fileFetcher).fetchFile("http://localhost:9000/file.txt", downloadFilesDir) + } + + private def getSparkConfForRemoteFileDownloads: SparkConf = { + new SparkConf(true) + .set(INIT_CONTAINER_REMOTE_JARS, + "http://localhost:9000/jar1.jar,hdfs://localhost:9000/jar2.jar") + .set(INIT_CONTAINER_REMOTE_FILES, + "http://localhost:9000/file.txt") + .set(JARS_DOWNLOAD_LOCATION, downloadJarsDir.getAbsolutePath) + .set(FILES_DOWNLOAD_LOCATION, downloadFilesDir.getAbsolutePath) + } + + private def createTempFile(extension: String): String = { + val dir = Utils.createTempDir() + val file = new File(dir, s"${UUID.randomUUID().toString}.$extension") + 
Files.write(UUID.randomUUID().toString, file, Charsets.UTF_8) + file.getAbsolutePath + } +} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala index 3a55d7cb37b1f..7121a802c69c1 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -18,15 +18,19 @@ package org.apache.spark.scheduler.cluster.k8s import scala.collection.JavaConverters._ -import io.fabric8.kubernetes.api.model.{Pod, _} -import org.mockito.MockitoAnnotations +import io.fabric8.kubernetes.api.model._ +import org.mockito.{AdditionalAnswers, MockitoAnnotations} +import org.mockito.Matchers.any +import org.mockito.Mockito._ import org.scalatest.{BeforeAndAfter, BeforeAndAfterEach} import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.deploy.k8s.{InitContainerBootstrap, MountSecretsBootstrap, PodWithDetachedInitContainer} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with BeforeAndAfterEach { + private val driverPodName: String = "driver-pod" private val driverPodUid: String = "driver-uid" private val executorPrefix: String = "base" @@ -54,7 +58,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef } test("basic executor pod has reasonable defaults") { - val factory = new ExecutorPodFactoryImpl(baseConf) + val factory = new ExecutorPodFactory(baseConf, None, None, None) val executor = factory.createExecutorPod( "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) @@ -85,7 +89,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef conf.set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, "loremipsumdolorsitametvimatelitrefficiendisuscipianturvixlegeresple") - val factory = new ExecutorPodFactoryImpl(conf) + val factory = new ExecutorPodFactory(conf, None, None, None) val executor = factory.createExecutorPod( "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) @@ -97,7 +101,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef conf.set(org.apache.spark.internal.config.EXECUTOR_JAVA_OPTIONS, "foo=bar") conf.set(org.apache.spark.internal.config.EXECUTOR_CLASS_PATH, "bar=baz") - val factory = new ExecutorPodFactoryImpl(conf) + val factory = new ExecutorPodFactory(conf, None, None, None) val executor = factory.createExecutorPod( "1", "dummy", "dummy", Seq[(String, String)]("qux" -> "quux"), driverPod, Map[String, Int]()) @@ -108,6 +112,74 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef checkOwnerReferences(executor, driverPodUid) } + test("executor secrets get mounted") { + val conf = baseConf.clone() + + val secretsBootstrap = new MountSecretsBootstrap(Map("secret1" -> "/var/secret1")) + val factory = new ExecutorPodFactory( + conf, + Some(secretsBootstrap), + None, + None) + val executor = factory.createExecutorPod( + "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) + + assert(executor.getSpec.getContainers.size() === 1) + assert(executor.getSpec.getContainers.get(0).getVolumeMounts.size() === 1) + 
assert(executor.getSpec.getContainers.get(0).getVolumeMounts.get(0).getName + === "secret1-volume") + assert(executor.getSpec.getContainers.get(0).getVolumeMounts.get(0) + .getMountPath === "/var/secret1") + + // check volume mounted. + assert(executor.getSpec.getVolumes.size() === 1) + assert(executor.getSpec.getVolumes.get(0).getSecret.getSecretName === "secret1") + + checkOwnerReferences(executor, driverPodUid) + } + + test("init-container bootstrap step adds an init container") { + val conf = baseConf.clone() + val initContainerBootstrap = mock(classOf[InitContainerBootstrap]) + when(initContainerBootstrap.bootstrapInitContainer( + any(classOf[PodWithDetachedInitContainer]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) + + val factory = new ExecutorPodFactory( + conf, + None, + Some(initContainerBootstrap), + None) + val executor = factory.createExecutorPod( + "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) + + assert(executor.getSpec.getInitContainers.size() === 1) + checkOwnerReferences(executor, driverPodUid) + } + + test("init-container with secrets mount bootstrap") { + val conf = baseConf.clone() + val initContainerBootstrap = mock(classOf[InitContainerBootstrap]) + when(initContainerBootstrap.bootstrapInitContainer( + any(classOf[PodWithDetachedInitContainer]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) + val secretsBootstrap = new MountSecretsBootstrap(Map("secret1" -> "/var/secret1")) + + val factory = new ExecutorPodFactory( + conf, + None, + Some(initContainerBootstrap), + Some(secretsBootstrap)) + val executor = factory.createExecutorPod( + "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) + + assert(executor.getSpec.getInitContainers.size() === 1) + assert(executor.getSpec.getInitContainers.get(0).getVolumeMounts.get(0).getName + === "secret1-volume") + assert(executor.getSpec.getInitContainers.get(0).getVolumeMounts.get(0) + .getMountPath === "/var/secret1") + + checkOwnerReferences(executor, driverPodUid) + } + // There is always exactly one controller reference, and it points to the driver pod. private def checkOwnerReferences(executor: Pod, driverPodUid: String): Unit = { assert(executor.getMetadata.getOwnerReferences.size() === 1) diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile index 9b682f8673c69..45fbcd9cd0deb 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile @@ -22,7 +22,7 @@ FROM spark-base # If this docker file is being used in the context of building your images from a Spark # distribution, the docker build command should be invoked from the top level directory # of the Spark distribution. E.g.: -# docker build -t spark-driver:latest -f kubernetes/dockerfiles/spark-base/Dockerfile . +# docker build -t spark-driver:latest -f kubernetes/dockerfiles/driver/Dockerfile . COPY examples /opt/spark/examples @@ -31,4 +31,5 @@ CMD SPARK_CLASSPATH="${SPARK_HOME}/jars/*" && \ readarray -t SPARK_DRIVER_JAVA_OPTS < /tmp/java_opts.txt && \ if ! [ -z ${SPARK_MOUNTED_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_MOUNTED_CLASSPATH:$SPARK_CLASSPATH"; fi && \ if ! [ -z ${SPARK_SUBMIT_EXTRA_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_SUBMIT_EXTRA_CLASSPATH:$SPARK_CLASSPATH"; fi && \ + if ! [ -z ${SPARK_MOUNTED_FILES_DIR+x} ]; then cp -R "$SPARK_MOUNTED_FILES_DIR/." 
.; fi && \ ${JAVA_HOME}/bin/java "${SPARK_DRIVER_JAVA_OPTS[@]}" -cp "$SPARK_CLASSPATH" -Xms$SPARK_DRIVER_MEMORY -Xmx$SPARK_DRIVER_MEMORY -Dspark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS $SPARK_DRIVER_CLASS $SPARK_DRIVER_ARGS diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile index 168cd4cb6c57a..0f806cf7e148e 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile @@ -22,7 +22,7 @@ FROM spark-base # If this docker file is being used in the context of building your images from a Spark # distribution, the docker build command should be invoked from the top level directory # of the Spark distribution. E.g.: -# docker build -t spark-executor:latest -f kubernetes/dockerfiles/spark-base/Dockerfile . +# docker build -t spark-executor:latest -f kubernetes/dockerfiles/executor/Dockerfile . COPY examples /opt/spark/examples @@ -31,4 +31,5 @@ CMD SPARK_CLASSPATH="${SPARK_HOME}/jars/*" && \ readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt && \ if ! [ -z ${SPARK_MOUNTED_CLASSPATH}+x} ]; then SPARK_CLASSPATH="$SPARK_MOUNTED_CLASSPATH:$SPARK_CLASSPATH"; fi && \ if ! [ -z ${SPARK_EXECUTOR_EXTRA_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_EXECUTOR_EXTRA_CLASSPATH:$SPARK_CLASSPATH"; fi && \ + if ! [ -z ${SPARK_MOUNTED_FILES_DIR+x} ]; then cp -R "$SPARK_MOUNTED_FILES_DIR/." .; fi && \ ${JAVA_HOME}/bin/java "${SPARK_EXECUTOR_JAVA_OPTS[@]}" -Xms$SPARK_EXECUTOR_MEMORY -Xmx$SPARK_EXECUTOR_MEMORY -cp "$SPARK_CLASSPATH" org.apache.spark.executor.CoarseGrainedExecutorBackend --driver-url $SPARK_DRIVER_URL --executor-id $SPARK_EXECUTOR_ID --cores $SPARK_EXECUTOR_CORES --app-id $SPARK_APPLICATION_ID --hostname $SPARK_EXECUTOR_POD_IP diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile new file mode 100644 index 0000000000000..055493188fcb7 --- /dev/null +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile @@ -0,0 +1,24 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM spark-base + +# If this docker file is being used in the context of building your images from a Spark distribution, the docker build +# command should be invoked from the top level directory of the Spark distribution. E.g.: +# docker build -t spark-init:latest -f kubernetes/dockerfiles/init-container/Dockerfile . 
+ +ENTRYPOINT [ "/opt/entrypoint.sh", "/opt/spark/bin/spark-class", "org.apache.spark.deploy.rest.k8s.SparkPodInitContainer" ] From ded6d27e4eb02e4530015a95794e6ed0586faaa7 Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Thu, 28 Dec 2017 13:53:04 +0900 Subject: [PATCH 203/356] [SPARK-22648][K8S] Add documentation covering init containers and secrets ## What changes were proposed in this pull request? This PR updates the Kubernetes documentation corresponding to the following features/changes in #19954. * Ability to use remote dependencies through the init-container. * Ability to mount user-specified secrets into the driver and executor pods. vanzin jiangxb1987 foxish Author: Yinan Li Closes #20059 from liyinan926/doc-update. --- docs/running-on-kubernetes.md | 194 ++++++++++++++++++++++--------- sbin/build-push-docker-images.sh | 3 +- 2 files changed, 143 insertions(+), 54 deletions(-) diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index 0048bd90b48ae..e491329136a3c 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -69,17 +69,17 @@ building using the supplied script, or manually. To launch Spark Pi in cluster mode, -{% highlight bash %} +```bash $ bin/spark-submit \ --master k8s://https://: \ --deploy-mode cluster \ --name spark-pi \ --class org.apache.spark.examples.SparkPi \ --conf spark.executor.instances=5 \ - --conf spark.kubernetes.driver.docker.image= \ - --conf spark.kubernetes.executor.docker.image= \ + --conf spark.kubernetes.driver.container.image= \ + --conf spark.kubernetes.executor.container.image= \ local:///path/to/examples.jar -{% endhighlight %} +``` The Spark master, specified either via passing the `--master` command line argument to `spark-submit` or by setting `spark.master` in the application's configuration, must be a URL with the format `k8s://`. Prefixing the @@ -120,6 +120,54 @@ by their appropriate remote URIs. Also, application dependencies can be pre-moun Those dependencies can be added to the classpath by referencing them with `local://` URIs and/or setting the `SPARK_EXTRA_CLASSPATH` environment variable in your Dockerfiles. +### Using Remote Dependencies +When there are application dependencies hosted in remote locations like HDFS or HTTP servers, the driver and executor pods +need a Kubernetes [init-container](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/) for downloading +the dependencies so the driver and executor containers can use them locally. This requires users to specify the container +image for the init-container using the configuration property `spark.kubernetes.initContainer.image`. For example, users +simply add the following option to the `spark-submit` command to specify the init-container image: + +``` +--conf spark.kubernetes.initContainer.image= +``` + +The init-container handles remote dependencies specified in `spark.jars` (or the `--jars` option of `spark-submit`) and +`spark.files` (or the `--files` option of `spark-submit`). It also handles remotely hosted main application resources, e.g., +the main application jar. 
The following shows an example of using remote dependencies with the `spark-submit` command: + +```bash +$ bin/spark-submit \ + --master k8s://https://: \ + --deploy-mode cluster \ + --name spark-pi \ + --class org.apache.spark.examples.SparkPi \ + --jars https://path/to/dependency1.jar,https://path/to/dependency2.jar + --files hdfs://host:port/path/to/file1,hdfs://host:port/path/to/file2 + --conf spark.executor.instances=5 \ + --conf spark.kubernetes.driver.container.image= \ + --conf spark.kubernetes.executor.container.image= \ + --conf spark.kubernetes.initContainer.image= + https://path/to/examples.jar +``` + +## Secret Management +Kubernetes [Secrets](https://kubernetes.io/docs/concepts/configuration/secret/) can be used to provide credentials for a +Spark application to access secured services. To mount a user-specified secret into the driver container, users can use +the configuration property of the form `spark.kubernetes.driver.secrets.[SecretName]=`. Similarly, the +configuration property of the form `spark.kubernetes.executor.secrets.[SecretName]=` can be used to mount a +user-specified secret into the executor containers. Note that it is assumed that the secret to be mounted is in the same +namespace as that of the driver and executor pods. For example, to mount a secret named `spark-secret` onto the path +`/etc/secrets` in both the driver and executor containers, add the following options to the `spark-submit` command: + +``` +--conf spark.kubernetes.driver.secrets.spark-secret=/etc/secrets +--conf spark.kubernetes.executor.secrets.spark-secret=/etc/secrets +``` + +Note that if an init-container is used, any secret mounted into the driver container will also be mounted into the +init-container of the driver. Similarly, any secret mounted into an executor container will also be mounted into the +init-container of the executor. + ## Introspection and Debugging These are the different ways in which you can investigate a running/completed Spark application, monitor progress, and @@ -275,7 +323,7 @@ specific to Spark on Kubernetes. (none) Container image to use for the driver. - This is usually of the form `example.com/repo/spark-driver:v1.0.0`. + This is usually of the form example.com/repo/spark-driver:v1.0.0. This configuration is required and must be provided by the user. @@ -284,7 +332,7 @@ specific to Spark on Kubernetes. (none) Container image to use for the executors. - This is usually of the form `example.com/repo/spark-executor:v1.0.0`. + This is usually of the form example.com/repo/spark-executor:v1.0.0. This configuration is required and must be provided by the user. @@ -528,51 +576,91 @@ specific to Spark on Kubernetes. - spark.kubernetes.driver.limit.cores - (none) - - Specify the hard CPU [limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) for the driver pod. - - - - spark.kubernetes.executor.limit.cores - (none) - - Specify the hard CPU [limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) for each executor pod launched for the Spark Application. - - - - spark.kubernetes.node.selector.[labelKey] - (none) - - Adds to the node selector of the driver pod and executor pods, with key labelKey and the value as the - configuration's value. 
For example, setting spark.kubernetes.node.selector.identifier to myIdentifier - will result in the driver pod and executors having a node selector with key identifier and value - myIdentifier. Multiple node selector keys can be added by setting multiple configurations with this prefix. - - - - spark.kubernetes.driverEnv.[EnvironmentVariableName] - (none) - - Add the environment variable specified by EnvironmentVariableName to - the Driver process. The user can specify multiple of these to set multiple environment variables. - - - - spark.kubernetes.mountDependencies.jarsDownloadDir - /var/spark-data/spark-jars - - Location to download jars to in the driver and executors. - This directory must be empty and will be mounted as an empty directory volume on the driver and executor pods. - - - - spark.kubernetes.mountDependencies.filesDownloadDir - /var/spark-data/spark-files - - Location to download jars to in the driver and executors. - This directory must be empty and will be mounted as an empty directory volume on the driver and executor pods. - - + spark.kubernetes.driver.limit.cores + (none) + + Specify the hard CPU [limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) for the driver pod. + + + + spark.kubernetes.executor.limit.cores + (none) + + Specify the hard CPU [limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) for each executor pod launched for the Spark Application. + + + + spark.kubernetes.node.selector.[labelKey] + (none) + + Adds to the node selector of the driver pod and executor pods, with key labelKey and the value as the + configuration's value. For example, setting spark.kubernetes.node.selector.identifier to myIdentifier + will result in the driver pod and executors having a node selector with key identifier and value + myIdentifier. Multiple node selector keys can be added by setting multiple configurations with this prefix. + + + + spark.kubernetes.driverEnv.[EnvironmentVariableName] + (none) + + Add the environment variable specified by EnvironmentVariableName to + the Driver process. The user can specify multiple of these to set multiple environment variables. + + + + spark.kubernetes.mountDependencies.jarsDownloadDir + /var/spark-data/spark-jars + + Location to download jars to in the driver and executors. + This directory must be empty and will be mounted as an empty directory volume on the driver and executor pods. + + + + spark.kubernetes.mountDependencies.filesDownloadDir + /var/spark-data/spark-files + + Location to download jars to in the driver and executors. + This directory must be empty and will be mounted as an empty directory volume on the driver and executor pods. + + + + spark.kubernetes.mountDependencies.timeout + 300s + + Timeout in seconds before aborting the attempt to download and unpack dependencies from remote locations into + the driver and executor pods. + + + + spark.kubernetes.mountDependencies.maxSimultaneousDownloads + 5 + + Maximum number of remote dependencies to download simultaneously in a driver or executor pod. + + + + spark.kubernetes.initContainer.image + (none) + + Container image for the init-container of the driver and executors for downloading dependencies. This is usually of the form example.com/repo/spark-init:v1.0.0. 
This configuration is optional and must be provided by the user if any non-container local dependency is used and must be downloaded remotely. + + + + spark.kubernetes.driver.secrets.[SecretName] + (none) + + Add the Kubernetes Secret named SecretName to the driver pod on the path specified in the value. For example, + spark.kubernetes.driver.secrets.spark-secret=/etc/secrets. Note that if an init-container is used, + the secret will also be added to the init-container in the driver pod. + + + + spark.kubernetes.executor.secrets.[SecretName] + (none) + + Add the Kubernetes Secret named SecretName to the executor pod on the path specified in the value. For example, + spark.kubernetes.executor.secrets.spark-secret=/etc/secrets. Note that if an init-container is used, + the secret will also be added to the init-container in the executor pod. + + \ No newline at end of file diff --git a/sbin/build-push-docker-images.sh b/sbin/build-push-docker-images.sh index 4546e98dc2074..b3137598692d8 100755 --- a/sbin/build-push-docker-images.sh +++ b/sbin/build-push-docker-images.sh @@ -20,7 +20,8 @@ # with Kubernetes support. declare -A path=( [spark-driver]=kubernetes/dockerfiles/driver/Dockerfile \ - [spark-executor]=kubernetes/dockerfiles/executor/Dockerfile ) + [spark-executor]=kubernetes/dockerfiles/executor/Dockerfile \ + [spark-init]=kubernetes/dockerfiles/init-container/Dockerfile ) function build { docker build -t spark-base -f kubernetes/dockerfiles/spark-base/Dockerfile . From 76e8a1d7e2619c1e6bd75c399314d2583a86b93b Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 28 Dec 2017 20:17:26 +0900 Subject: [PATCH 204/356] [SPARK-22843][R] Adds localCheckpoint in R ## What changes were proposed in this pull request? This PR proposes to add `localCheckpoint(..)` in R API. ```r df <- localCheckpoint(createDataFrame(iris)) ``` ## How was this patch tested? Unit tests added in `R/pkg/tests/fulltests/test_sparkSQL.R` Author: hyukjinkwon Closes #20073 from HyukjinKwon/SPARK-22843. --- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 27 +++++++++++++++++++++++++++ R/pkg/R/generics.R | 4 ++++ R/pkg/tests/fulltests/test_sparkSQL.R | 22 ++++++++++++++++++++++ 4 files changed, 54 insertions(+) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index dce64e1e607c8..4b699de558b4f 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -133,6 +133,7 @@ exportMethods("arrange", "isStreaming", "join", "limit", + "localCheckpoint", "merge", "mutate", "na.omit", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index b8d732a485862..ace49daf9cd83 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3782,6 +3782,33 @@ setMethod("checkpoint", dataFrame(df) }) +#' localCheckpoint +#' +#' Returns a locally checkpointed version of this SparkDataFrame. Checkpointing can be used to +#' truncate the logical plan, which is especially useful in iterative algorithms where the plan +#' may grow exponentially. Local checkpoints are stored in the executors using the caching +#' subsystem and therefore they are not reliable. 
+#' +#' @param x A SparkDataFrame +#' @param eager whether to locally checkpoint this SparkDataFrame immediately +#' @return a new locally checkpointed SparkDataFrame +#' @family SparkDataFrame functions +#' @aliases localCheckpoint,SparkDataFrame-method +#' @rdname localCheckpoint +#' @name localCheckpoint +#' @export +#' @examples +#'\dontrun{ +#' df <- localCheckpoint(df) +#' } +#' @note localCheckpoint since 2.3.0 +setMethod("localCheckpoint", + signature(x = "SparkDataFrame"), + function(x, eager = TRUE) { + df <- callJMethod(x@sdf, "localCheckpoint", as.logical(eager)) + dataFrame(df) + }) + #' cube #' #' Create a multi-dimensional cube for the SparkDataFrame using the specified columns. diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 5ddaa669f9205..d5d0bc9d8a97d 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -611,6 +611,10 @@ setGeneric("isStreaming", function(x) { standardGeneric("isStreaming") }) #' @export setGeneric("limit", function(x, num) {standardGeneric("limit") }) +#' @rdname localCheckpoint +#' @export +setGeneric("localCheckpoint", function(x, eager = TRUE) { standardGeneric("localCheckpoint") }) + #' @rdname merge #' @export setGeneric("merge") diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 6cc0188dae95f..650e7c05f46e8 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -957,6 +957,28 @@ test_that("setCheckpointDir(), checkpoint() on a DataFrame", { } }) +test_that("localCheckpoint() on a DataFrame", { + if (windows_with_hadoop()) { + # Checkpoint directory shouldn't matter in localCheckpoint. + checkpointDir <- file.path(tempdir(), "lcproot") + expect_true(length(list.files(path = checkpointDir, all.files = TRUE, recursive = TRUE)) == 0) + setCheckpointDir(checkpointDir) + + textPath <- tempfile(pattern = "textPath", fileext = ".txt") + writeLines(mockLines, textPath) + # Read it lazily and then locally checkpoint eagerly. + df <- read.df(textPath, "text") + df <- localCheckpoint(df, eager = TRUE) + # Here, we remove the source path to check eagerness. + unlink(textPath) + expect_is(df, "SparkDataFrame") + expect_equal(colnames(df), c("value")) + expect_equal(count(df), 3) + + expect_true(length(list.files(path = checkpointDir, all.files = TRUE, recursive = TRUE)) == 0) + } +}) + test_that("schema(), dtypes(), columns(), names() return the correct values/format", { df <- read.json(jsonPath) testSchema <- schema(df) From 1eebfbe192060af3c81cd086bc5d5a7e80d09e77 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Thu, 28 Dec 2017 20:18:47 +0900 Subject: [PATCH 205/356] [SPARK-21208][R] Adds setLocalProperty and getLocalProperty in R ## What changes were proposed in this pull request? This PR adds `setLocalProperty` and `getLocalProperty`in R. ```R > df <- createDataFrame(iris) > setLocalProperty("spark.job.description", "Hello world!") > count(df) > setLocalProperty("spark.job.description", "Hi !!") > count(df) ``` 2017-12-25 4 18 07 ```R > print(getLocalProperty("spark.job.description")) NULL > setLocalProperty("spark.job.description", "Hello world!") > print(getLocalProperty("spark.job.description")) [1] "Hello world!" > setLocalProperty("spark.job.description", "Hi !!") > print(getLocalProperty("spark.job.description")) [1] "Hi !!" ``` ## How was this patch tested? Manually tested and a test in `R/pkg/tests/fulltests/test_context.R`. Author: hyukjinkwon Closes #20075 from HyukjinKwon/SPARK-21208. 
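For reference, these R functions are thin wrappers over the JVM `SparkContext` methods of the same name (the implementation below reaches them through `callJMethod`). A minimal Scala sketch of the equivalent calls, not part of this patch and assuming only a plain local `SparkSession` (application name is illustrative):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("local-properties").getOrCreate()
val sc = spark.sparkContext

// Local properties are per-thread and are picked up by jobs submitted from that thread.
sc.setLocalProperty("spark.job.description", "Hello world!")
assert(sc.getLocalProperty("spark.job.description") == "Hello world!")

// Passing null clears the property, which is why the R wrappers map NULL/NA to null.
sc.setLocalProperty("spark.job.description", null)
assert(sc.getLocalProperty("spark.job.description") == null)
```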
--- R/pkg/NAMESPACE | 4 ++- R/pkg/R/sparkR.R | 45 ++++++++++++++++++++++++++++ R/pkg/tests/fulltests/test_context.R | 33 +++++++++++++++++++- 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 4b699de558b4f..ce3eec069efe5 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -76,7 +76,9 @@ exportMethods("glm", export("setJobGroup", "clearJobGroup", "cancelJobGroup", - "setJobDescription") + "setJobDescription", + "setLocalProperty", + "getLocalProperty") # Export Utility methods export("setLogLevel") diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index fb5f1d21fc723..965471f3b07a0 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -560,10 +560,55 @@ cancelJobGroup <- function(sc, groupId) { #'} #' @note setJobDescription since 2.3.0 setJobDescription <- function(value) { + if (!is.null(value)) { + value <- as.character(value) + } sc <- getSparkContext() invisible(callJMethod(sc, "setJobDescription", value)) } +#' Set a local property that affects jobs submitted from this thread, such as the +#' Spark fair scheduler pool. +#' +#' @param key The key for a local property. +#' @param value The value for a local property. +#' @rdname setLocalProperty +#' @name setLocalProperty +#' @examples +#'\dontrun{ +#' setLocalProperty("spark.scheduler.pool", "poolA") +#'} +#' @note setLocalProperty since 2.3.0 +setLocalProperty <- function(key, value) { + if (is.null(key) || is.na(key)) { + stop("key should not be NULL or NA.") + } + if (!is.null(value)) { + value <- as.character(value) + } + sc <- getSparkContext() + invisible(callJMethod(sc, "setLocalProperty", as.character(key), value)) +} + +#' Get a local property set in this thread, or \code{NULL} if it is missing. See +#' \code{setLocalProperty}. +#' +#' @param key The key for a local property. 
+#' @rdname getLocalProperty +#' @name getLocalProperty +#' @examples +#'\dontrun{ +#' getLocalProperty("spark.scheduler.pool") +#'} +#' @note getLocalProperty since 2.3.0 +getLocalProperty <- function(key) { + if (is.null(key) || is.na(key)) { + stop("key should not be NULL or NA.") + } + sc <- getSparkContext() + callJMethod(sc, "getLocalProperty", as.character(key)) +} + sparkConfToSubmitOps <- new.env() sparkConfToSubmitOps[["spark.driver.memory"]] <- "--driver-memory" sparkConfToSubmitOps[["spark.driver.extraClassPath"]] <- "--driver-class-path" diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R index 77635c5a256b9..f0d0a5114f89f 100644 --- a/R/pkg/tests/fulltests/test_context.R +++ b/R/pkg/tests/fulltests/test_context.R @@ -100,7 +100,6 @@ test_that("job group functions can be called", { setJobGroup("groupId", "job description", TRUE) cancelJobGroup("groupId") clearJobGroup() - setJobDescription("job description") suppressWarnings(setJobGroup(sc, "groupId", "job description", TRUE)) suppressWarnings(cancelJobGroup(sc, "groupId")) @@ -108,6 +107,38 @@ test_that("job group functions can be called", { sparkR.session.stop() }) +test_that("job description and local properties can be set and got", { + sc <- sparkR.sparkContext(master = sparkRTestMaster) + setJobDescription("job description") + expect_equal(getLocalProperty("spark.job.description"), "job description") + setJobDescription(1234) + expect_equal(getLocalProperty("spark.job.description"), "1234") + setJobDescription(NULL) + expect_equal(getLocalProperty("spark.job.description"), NULL) + setJobDescription(NA) + expect_equal(getLocalProperty("spark.job.description"), NULL) + + setLocalProperty("spark.scheduler.pool", "poolA") + expect_equal(getLocalProperty("spark.scheduler.pool"), "poolA") + setLocalProperty("spark.scheduler.pool", NULL) + expect_equal(getLocalProperty("spark.scheduler.pool"), NULL) + setLocalProperty("spark.scheduler.pool", NA) + expect_equal(getLocalProperty("spark.scheduler.pool"), NULL) + + setLocalProperty(4321, 1234) + expect_equal(getLocalProperty(4321), "1234") + setLocalProperty(4321, NULL) + expect_equal(getLocalProperty(4321), NULL) + setLocalProperty(4321, NA) + expect_equal(getLocalProperty(4321), NULL) + + expect_error(setLocalProperty(NULL, "should fail"), "key should not be NULL or NA") + expect_error(getLocalProperty(NULL), "key should not be NULL or NA") + expect_error(setLocalProperty(NA, "should fail"), "key should not be NULL or NA") + expect_error(getLocalProperty(NA), "key should not be NULL or NA") + sparkR.session.stop() +}) + test_that("utility function can be called", { sparkR.sparkContext(master = sparkRTestMaster) setLogLevel("ERROR") From 755f2f5189a08597fddc90b7f9df4ad0ec6bd2ef Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 28 Dec 2017 21:33:03 +0800 Subject: [PATCH 206/356] [SPARK-20392][SQL][FOLLOWUP] should not add extra AnalysisBarrier ## What changes were proposed in this pull request? I found this problem while auditing the analyzer code. It's dangerous to introduce extra `AnalysisBarrer` during analysis, as the plan inside it will bypass all analysis afterward, which may not be expected. We should only preserve `AnalysisBarrer` but not introduce new ones. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #20094 from cloud-fan/barrier. 
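The de-duplication logic touched here is easiest to see with a self-join, where both sides of the join initially carry the same attribute IDs. A minimal illustration, not taken from this patch and assuming only a local `SparkSession` (names are illustrative):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("self-join-dedup").getOrCreate()
import spark.implicits._

val df = Seq((1, "a"), (2, "b")).toDF("id", "name")

// Both sides of a self-join start out with identical attribute IDs, so the analyzer must
// re-alias the right-hand plan (dedupRight) before the join condition can be resolved.
val joined = df.as("l").join(df.as("r"), $"l.id" === $"r.id")
joined.explain(true)
```

The point of the change is that this re-aliasing now recurses into any `AnalysisBarrier` it finds and preserves it, rather than wrapping the rewritten right-hand plan in a new barrier.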
--- .../sql/catalyst/analysis/Analyzer.scala | 191 ++++++++---------- .../sql/hive/execution/SQLQuerySuite.scala | 2 +- 2 files changed, 84 insertions(+), 109 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 10b237fb22b96..7f2128ed36a08 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.catalyst.analysis -import scala.annotation.tailrec import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.AnalysisException @@ -665,14 +664,18 @@ class Analyzer( * Generate a new logical plan for the right child with different expression IDs * for all conflicting attributes. */ - private def dedupRight (left: LogicalPlan, originalRight: LogicalPlan): LogicalPlan = { - // Remove analysis barrier if any. - val right = EliminateBarriers(originalRight) + private def dedupRight (left: LogicalPlan, right: LogicalPlan): LogicalPlan = { val conflictingAttributes = left.outputSet.intersect(right.outputSet) logDebug(s"Conflicting attributes ${conflictingAttributes.mkString(",")} " + s"between $left and $right") right.collect { + // For `AnalysisBarrier`, recursively de-duplicate its child. + case oldVersion: AnalysisBarrier + if oldVersion.outputSet.intersect(conflictingAttributes).nonEmpty => + val newVersion = dedupRight(left, oldVersion.child) + (oldVersion, AnalysisBarrier(newVersion)) + // Handle base relations that might appear more than once. case oldVersion: MultiInstanceRelation if oldVersion.outputSet.intersect(conflictingAttributes).nonEmpty => @@ -710,10 +713,10 @@ class Analyzer( * that this rule cannot handle. When that is the case, there must be another rule * that resolves these conflicts. Otherwise, the analysis will fail. */ - originalRight + right case Some((oldRelation, newRelation)) => val attributeRewrites = AttributeMap(oldRelation.output.zip(newRelation.output)) - val newRight = right transformUp { + right transformUp { case r if r == oldRelation => newRelation } transformUp { case other => other transformExpressions { @@ -723,7 +726,6 @@ class Analyzer( s.withNewPlan(dedupOuterReferencesInSubquery(s.plan, attributeRewrites)) } } - AnalysisBarrier(newRight) } } @@ -958,7 +960,8 @@ class Analyzer( protected[sql] def resolveExpression( expr: Expression, plan: LogicalPlan, - throws: Boolean = false) = { + throws: Boolean = false): Expression = { + if (expr.resolved) return expr // Resolve expression in one round. // If throws == false or the desired attribute doesn't exist // (like try to resolve `a.b` but `a` doesn't exist), fail and return the origin one. @@ -1079,100 +1082,74 @@ class Analyzer( case sa @ Sort(_, _, AnalysisBarrier(child: Aggregate)) => sa case sa @ Sort(_, _, child: Aggregate) => sa - case s @ Sort(order, _, originalChild) if !s.resolved && originalChild.resolved => - val child = EliminateBarriers(originalChild) - try { - val newOrder = order.map(resolveExpressionRecursively(_, child).asInstanceOf[SortOrder]) - val requiredAttrs = AttributeSet(newOrder).filter(_.resolved) - val missingAttrs = requiredAttrs -- child.outputSet - if (missingAttrs.nonEmpty) { - // Add missing attributes and then project them away after the sort. 
- Project(child.output, - Sort(newOrder, s.global, addMissingAttr(child, missingAttrs))) - } else if (newOrder != order) { - s.copy(order = newOrder) - } else { - s - } - } catch { - // Attempting to resolve it might fail. When this happens, return the original plan. - // Users will see an AnalysisException for resolution failure of missing attributes - // in Sort - case ae: AnalysisException => s + case s @ Sort(order, _, child) if !s.resolved && child.resolved => + val (newOrder, newChild) = resolveExprsAndAddMissingAttrs(order, child) + val ordering = newOrder.map(_.asInstanceOf[SortOrder]) + if (child.output == newChild.output) { + s.copy(order = ordering) + } else { + // Add missing attributes and then project them away. + val newSort = s.copy(order = ordering, child = newChild) + Project(child.output, newSort) } - case f @ Filter(cond, originalChild) if !f.resolved && originalChild.resolved => - val child = EliminateBarriers(originalChild) - try { - val newCond = resolveExpressionRecursively(cond, child) - val requiredAttrs = newCond.references.filter(_.resolved) - val missingAttrs = requiredAttrs -- child.outputSet - if (missingAttrs.nonEmpty) { - // Add missing attributes and then project them away. - Project(child.output, - Filter(newCond, addMissingAttr(child, missingAttrs))) - } else if (newCond != cond) { - f.copy(condition = newCond) - } else { - f - } - } catch { - // Attempting to resolve it might fail. When this happens, return the original plan. - // Users will see an AnalysisException for resolution failure of missing attributes - case ae: AnalysisException => f + case f @ Filter(cond, child) if !f.resolved && child.resolved => + val (newCond, newChild) = resolveExprsAndAddMissingAttrs(Seq(cond), child) + if (child.output == newChild.output) { + f.copy(condition = newCond.head) + } else { + // Add missing attributes and then project them away. + val newFilter = Filter(newCond.head, newChild) + Project(child.output, newFilter) } } - /** - * Add the missing attributes into projectList of Project/Window or aggregateExpressions of - * Aggregate. - */ - private def addMissingAttr(plan: LogicalPlan, missingAttrs: AttributeSet): LogicalPlan = { - if (missingAttrs.isEmpty) { - return AnalysisBarrier(plan) - } - plan match { - case p: Project => - val missing = missingAttrs -- p.child.outputSet - Project(p.projectList ++ missingAttrs, addMissingAttr(p.child, missing)) - case a: Aggregate => - // all the missing attributes should be grouping expressions - // TODO: push down AggregateExpression - missingAttrs.foreach { attr => - if (!a.groupingExpressions.exists(_.semanticEquals(attr))) { - throw new AnalysisException(s"Can't add $attr to ${a.simpleString}") - } - } - val newAggregateExpressions = a.aggregateExpressions ++ missingAttrs - a.copy(aggregateExpressions = newAggregateExpressions) - case g: Generate => - // If join is false, we will convert it to true for getting from the child the missing - // attributes that its child might have or could have. 
- val missing = missingAttrs -- g.child.outputSet - g.copy(join = true, child = addMissingAttr(g.child, missing)) - case d: Distinct => - throw new AnalysisException(s"Can't add $missingAttrs to $d") - case u: UnaryNode => - u.withNewChildren(addMissingAttr(u.child, missingAttrs) :: Nil) - case other => - throw new AnalysisException(s"Can't add $missingAttrs to $other") - } - } - - /** - * Resolve the expression on a specified logical plan and it's child (recursively), until - * the expression is resolved or meet a non-unary node or Subquery. - */ - @tailrec - private def resolveExpressionRecursively(expr: Expression, plan: LogicalPlan): Expression = { - val resolved = resolveExpression(expr, plan) - if (resolved.resolved) { - resolved + private def resolveExprsAndAddMissingAttrs( + exprs: Seq[Expression], plan: LogicalPlan): (Seq[Expression], LogicalPlan) = { + if (exprs.forall(_.resolved)) { + // All given expressions are resolved, no need to continue anymore. + (exprs, plan) } else { plan match { - case u: UnaryNode if !u.isInstanceOf[SubqueryAlias] => - resolveExpressionRecursively(resolved, u.child) - case other => resolved + // For `AnalysisBarrier`, recursively resolve expressions and add missing attributes via + // its child. + case barrier: AnalysisBarrier => + val (newExprs, newChild) = resolveExprsAndAddMissingAttrs(exprs, barrier.child) + (newExprs, AnalysisBarrier(newChild)) + + case p: Project => + val maybeResolvedExprs = exprs.map(resolveExpression(_, p)) + val (newExprs, newChild) = resolveExprsAndAddMissingAttrs(maybeResolvedExprs, p.child) + val missingAttrs = AttributeSet(newExprs) -- AttributeSet(maybeResolvedExprs) + (newExprs, Project(p.projectList ++ missingAttrs, newChild)) + + case a @ Aggregate(groupExprs, aggExprs, child) => + val maybeResolvedExprs = exprs.map(resolveExpression(_, a)) + val (newExprs, newChild) = resolveExprsAndAddMissingAttrs(maybeResolvedExprs, child) + val missingAttrs = AttributeSet(newExprs) -- AttributeSet(maybeResolvedExprs) + if (missingAttrs.forall(attr => groupExprs.exists(_.semanticEquals(attr)))) { + // All the missing attributes are grouping expressions, valid case. + (newExprs, a.copy(aggregateExpressions = aggExprs ++ missingAttrs, child = newChild)) + } else { + // Need to add non-grouping attributes, invalid case. + (exprs, a) + } + + case g: Generate => + val maybeResolvedExprs = exprs.map(resolveExpression(_, g)) + val (newExprs, newChild) = resolveExprsAndAddMissingAttrs(maybeResolvedExprs, g.child) + (newExprs, g.copy(join = true, child = newChild)) + + // For `Distinct` and `SubqueryAlias`, we can't recursively resolve and add attributes + // via its children. + case u: UnaryNode if !u.isInstanceOf[Distinct] && !u.isInstanceOf[SubqueryAlias] => + val maybeResolvedExprs = exprs.map(resolveExpression(_, u)) + val (newExprs, newChild) = resolveExprsAndAddMissingAttrs(maybeResolvedExprs, u.child) + (newExprs, u.withNewChildren(Seq(newChild))) + + // For other operators, we can't recursively resolve and add attributes via its children. 
+ case other => + (exprs.map(resolveExpression(_, other)), other) } } } @@ -1404,18 +1381,16 @@ class Analyzer( */ object ResolveAggregateFunctions extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan.transformUp { - case filter @ Filter(havingCondition, AnalysisBarrier(aggregate: Aggregate)) => - apply(Filter(havingCondition, aggregate)).mapChildren(AnalysisBarrier) - case filter @ Filter(havingCondition, - aggregate @ Aggregate(grouping, originalAggExprs, child)) - if aggregate.resolved => + case Filter(cond, AnalysisBarrier(agg: Aggregate)) => + apply(Filter(cond, agg)).mapChildren(AnalysisBarrier) + case f @ Filter(cond, agg @ Aggregate(grouping, originalAggExprs, child)) if agg.resolved => // Try resolving the condition of the filter as though it is in the aggregate clause try { val aggregatedCondition = Aggregate( grouping, - Alias(havingCondition, "havingCondition")() :: Nil, + Alias(cond, "havingCondition")() :: Nil, child) val resolvedOperator = execute(aggregatedCondition) def resolvedAggregateFilter = @@ -1436,7 +1411,7 @@ class Analyzer( // Grouping functions are handled in the rule [[ResolveGroupingAnalytics]]. case e: Expression if grouping.exists(_.semanticEquals(e)) && !ResolveGroupingAnalytics.hasGroupingFunction(e) && - !aggregate.output.exists(_.semanticEquals(e)) => + !agg.output.exists(_.semanticEquals(e)) => e match { case ne: NamedExpression => aggregateExpressions += ne @@ -1450,22 +1425,22 @@ class Analyzer( // Push the aggregate expressions into the aggregate (if any). if (aggregateExpressions.nonEmpty) { - Project(aggregate.output, + Project(agg.output, Filter(transformedAggregateFilter, - aggregate.copy(aggregateExpressions = originalAggExprs ++ aggregateExpressions))) + agg.copy(aggregateExpressions = originalAggExprs ++ aggregateExpressions))) } else { - filter + f } } else { - filter + f } } catch { // Attempting to resolve in the aggregate can result in ambiguity. When this happens, // just return the original plan. - case ae: AnalysisException => filter + case ae: AnalysisException => f } - case sort @ Sort(sortOrder, global, AnalysisBarrier(aggregate: Aggregate)) => + case Sort(sortOrder, global, AnalysisBarrier(aggregate: Aggregate)) => apply(Sort(sortOrder, global, aggregate)).mapChildren(AnalysisBarrier) case sort @ Sort(sortOrder, global, aggregate: Aggregate) if aggregate.resolved => diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index c11e37a516646..07ae3ae945848 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1562,7 +1562,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("multi-insert with lateral view") { - withTempView("t1") { + withTempView("source") { spark.range(10) .select(array($"id", $"id" + 1).as("arr"), $"id") .createOrReplaceTempView("source") From 28778174208664327b75915e83ae5e611360eef3 Mon Sep 17 00:00:00 2001 From: Zhenhua Wang Date: Thu, 28 Dec 2017 21:49:37 +0800 Subject: [PATCH 207/356] [SPARK-22917][SQL] Should not try to generate histogram for empty/null columns ## What changes were proposed in this pull request? For empty/null column, the result of `ApproximatePercentile` is null. 
Then in `ApproxCountDistinctForIntervals`, a `MatchError` (for `endpoints`) will be thrown if we try to generate histogram for that column. Besides, there is no need to generate histogram for such column. In this patch, we exclude such column when generating histogram. ## How was this patch tested? Enhanced test cases for empty/null columns. Author: Zhenhua Wang Closes #20102 from wzhfy/no_record_hgm_bug. --- .../command/AnalyzeColumnCommand.scala | 7 ++++++- .../spark/sql/StatisticsCollectionSuite.scala | 21 +++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala index e3bb4d357b395..1122522ccb4cb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala @@ -143,7 +143,12 @@ case class AnalyzeColumnCommand( val percentilesRow = new QueryExecution(sparkSession, Aggregate(Nil, namedExprs, relation)) .executedPlan.executeTake(1).head attrsToGenHistogram.zipWithIndex.foreach { case (attr, i) => - attributePercentiles += attr -> percentilesRow.getArray(i) + val percentiles = percentilesRow.getArray(i) + // When there is no non-null value, `percentiles` is null. In such case, there is no + // need to generate histogram. + if (percentiles != null) { + attributePercentiles += attr -> percentiles + } } } AttributeMap(attributePercentiles.toSeq) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala index fba5d2652d3f5..b11e798532056 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala @@ -85,13 +85,24 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared test("analyze empty table") { val table = "emptyTable" withTable(table) { - sql(s"CREATE TABLE $table (key STRING, value STRING) USING PARQUET") + val df = Seq.empty[Int].toDF("key") + df.write.format("json").saveAsTable(table) sql(s"ANALYZE TABLE $table COMPUTE STATISTICS noscan") val fetchedStats1 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = None) assert(fetchedStats1.get.sizeInBytes == 0) sql(s"ANALYZE TABLE $table COMPUTE STATISTICS") val fetchedStats2 = checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(0)) assert(fetchedStats2.get.sizeInBytes == 0) + + val expectedColStat = + "key" -> ColumnStat(0, None, None, 0, IntegerType.defaultSize, IntegerType.defaultSize) + + // There won't be histogram for empty column. + Seq("true", "false").foreach { histogramEnabled => + withSQLConf(SQLConf.HISTOGRAM_ENABLED.key -> histogramEnabled) { + checkColStats(df, mutable.LinkedHashMap(expectedColStat)) + } + } } } @@ -178,7 +189,13 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with Shared val expectedColStats = dataTypes.map { case (tpe, idx) => (s"col$idx", ColumnStat(0, None, None, 1, tpe.defaultSize.toLong, tpe.defaultSize.toLong)) } - checkColStats(df, mutable.LinkedHashMap(expectedColStats: _*)) + + // There won't be histograms for null columns. 
+ Seq("true", "false").foreach { histogramEnabled => + withSQLConf(SQLConf.HISTOGRAM_ENABLED.key -> histogramEnabled) { + checkColStats(df, mutable.LinkedHashMap(expectedColStats: _*)) + } + } } test("number format in statistics") { From 5536f3181c1e77c70f01d6417407d218ea48b961 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Thu, 28 Dec 2017 09:43:50 -0600 Subject: [PATCH 208/356] [MINOR][BUILD] Fix Java linter errors ## What changes were proposed in this pull request? This PR cleans up a few Java linter errors for Apache Spark 2.3 release. ## How was this patch tested? ```bash $ dev/lint-java Using `mvn` from path: /usr/local/bin/mvn Checkstyle checks passed. ``` We can see the result from [Travis CI](https://travis-ci.org/dongjoon-hyun/spark/builds/322470787), too. Author: Dongjoon Hyun Closes #20101 from dongjoon-hyun/fix-java-lint. --- .../org/apache/spark/memory/MemoryConsumer.java | 3 ++- .../streaming/kinesis/KinesisInitialPositions.java | 14 ++++++++------ .../parquet/VectorizedColumnReader.java | 3 +-- .../parquet/VectorizedParquetRecordReader.java | 4 ++-- .../sql/execution/vectorized/ColumnarRow.java | 3 ++- .../spark/sql/sources/v2/SessionConfigSupport.java | 3 --- .../v2/streaming/ContinuousReadSupport.java | 5 ++++- .../v2/streaming/ContinuousWriteSupport.java | 6 +++--- .../sql/sources/v2/streaming/reader/Offset.java | 3 ++- .../v2/streaming/reader/PartitionOffset.java | 1 - .../hive/service/cli/operation/SQLOperation.java | 1 - 11 files changed, 24 insertions(+), 22 deletions(-) diff --git a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java index a7bd4b3799a25..115e1fbb79a2e 100644 --- a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java +++ b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java @@ -154,6 +154,7 @@ private void throwOom(final MemoryBlock page, final long required) { taskMemoryManager.freePage(page, this); } taskMemoryManager.showMemoryUsage(); - throw new SparkOutOfMemoryError("Unable to acquire " + required + " bytes of memory, got " + got); + throw new SparkOutOfMemoryError("Unable to acquire " + required + " bytes of memory, got " + + got); } } diff --git a/external/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java b/external/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java index 206e1e4699030..b5f5ab0e90540 100644 --- a/external/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java +++ b/external/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java @@ -67,9 +67,10 @@ public Date getTimestamp() { /** - * Returns instance of [[KinesisInitialPosition]] based on the passed [[InitialPositionInStream]]. - * This method is used in KinesisUtils for translating the InitialPositionInStream - * to InitialPosition. This function would be removed when we deprecate the KinesisUtils. + * Returns instance of [[KinesisInitialPosition]] based on the passed + * [[InitialPositionInStream]]. This method is used in KinesisUtils for translating the + * InitialPositionInStream to InitialPosition. This function would be removed when we deprecate + * the KinesisUtils. * * @return [[InitialPosition]] */ @@ -83,9 +84,10 @@ public static KinesisInitialPosition fromKinesisInitialPosition( // InitialPositionInStream.AT_TIMESTAMP is not supported. // Use InitialPosition.atTimestamp(timestamp) instead. 
throw new UnsupportedOperationException( - "Only InitialPositionInStream.LATEST and InitialPositionInStream.TRIM_HORIZON " + - "supported in initialPositionInStream(). Please use the initialPosition() from " + - "builder API in KinesisInputDStream for using InitialPositionInStream.AT_TIMESTAMP"); + "Only InitialPositionInStream.LATEST and InitialPositionInStream." + + "TRIM_HORIZON supported in initialPositionInStream(). Please use " + + "the initialPosition() from builder API in KinesisInputDStream for " + + "using InitialPositionInStream.AT_TIMESTAMP"); } } } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 3ba180860c325..c120863152a96 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -31,7 +31,6 @@ import org.apache.parquet.schema.PrimitiveType; import org.apache.spark.sql.catalyst.util.DateTimeUtils; -import org.apache.spark.sql.execution.vectorized.ColumnVector; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.DecimalType; @@ -96,7 +95,7 @@ public class VectorizedColumnReader { private final OriginalType originalType; // The timezone conversion to apply to int96 timestamps. Null if no conversion. private final TimeZone convertTz; - private final static TimeZone UTC = DateTimeUtils.TimeZoneUTC(); + private static final TimeZone UTC = DateTimeUtils.TimeZoneUTC(); public VectorizedColumnReader( ColumnDescriptor descriptor, diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java index 14f2a58d638f0..6c157e85d411f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java @@ -79,8 +79,8 @@ public class VectorizedParquetRecordReader extends SpecificParquetRecordReaderBa private boolean[] missingColumns; /** - * The timezone that timestamp INT96 values should be converted to. Null if no conversion. Here to workaround - * incompatibilities between different engines when writing timestamp values. + * The timezone that timestamp INT96 values should be converted to. Null if no conversion. Here to + * workaround incompatibilities between different engines when writing timestamp values. */ private TimeZone convertTz = null; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java index 95c0d09873d67..8bb33ed5b78c0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java @@ -28,7 +28,8 @@ * to be reused, callers should copy the data out if it needs to be stored. */ public final class ColumnarRow extends InternalRow { - // The data for this row. E.g. the value of 3rd int field is `data.getChildColumn(3).getInt(rowId)`. + // The data for this row. + // E.g. 
the value of 3rd int field is `data.getChildColumn(3).getInt(rowId)`. private final ColumnVector data; private final int rowId; private final int numFields; diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java index 0b5b6ac675f2c..3cb020d2e0836 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/SessionConfigSupport.java @@ -19,9 +19,6 @@ import org.apache.spark.annotation.InterfaceStability; -import java.util.List; -import java.util.Map; - /** * A mix-in interface for {@link DataSourceV2}. Data sources can implement this interface to * propagate session configs with the specified key-prefix to all data source operations in this diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java index 8837bae6156b1..3136cee1f655f 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousReadSupport.java @@ -39,5 +39,8 @@ public interface ContinuousReadSupport extends DataSourceV2 { * @param options the options for the returned data source reader, which is an immutable * case-insensitive string-to-string map. */ - ContinuousReader createContinuousReader(Optional schema, String checkpointLocation, DataSourceV2Options options); + ContinuousReader createContinuousReader( + Optional schema, + String checkpointLocation, + DataSourceV2Options options); } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousWriteSupport.java index ec15e436672b4..dee493cadb71e 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousWriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/ContinuousWriteSupport.java @@ -39,9 +39,9 @@ public interface ContinuousWriteSupport extends BaseStreamingSink { * Creates an optional {@link ContinuousWriter} to save the data to this data source. Data * sources can return None if there is no writing needed to be done. * - * @param queryId A unique string for the writing query. It's possible that there are many writing - * queries running at the same time, and the returned {@link DataSourceV2Writer} - * can use this id to distinguish itself from others. + * @param queryId A unique string for the writing query. It's possible that there are many + * writing queries running at the same time, and the returned + * {@link DataSourceV2Writer} can use this id to distinguish itself from others. * @param schema the schema of the data to be written. * @param mode the output mode which determines what successive epoch output means to this * sink, please refer to {@link OutputMode} for more details. 
diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java index 517fdab16684c..60b87f2ac0756 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/Offset.java @@ -42,7 +42,8 @@ public abstract class Offset extends org.apache.spark.sql.execution.streaming.Of @Override public boolean equals(Object obj) { if (obj instanceof org.apache.spark.sql.execution.streaming.Offset) { - return this.json().equals(((org.apache.spark.sql.execution.streaming.Offset) obj).json()); + return this.json() + .equals(((org.apache.spark.sql.execution.streaming.Offset) obj).json()); } else { return false; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java index 729a6123034f0..eca0085c8a8ce 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/reader/PartitionOffset.java @@ -26,5 +26,4 @@ * These offsets must be serializable. */ public interface PartitionOffset extends Serializable { - } diff --git a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java index fd9108eb53ca9..70c27948de61b 100644 --- a/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java +++ b/sql/hive-thriftserver/src/main/java/org/apache/hive/service/cli/operation/SQLOperation.java @@ -42,7 +42,6 @@ import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.VariableSubstitution; import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse; -import org.apache.hadoop.hive.ql.session.OperationLog; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.SerDe; From 8f6d5734d504be89249fb6744ef4067be2e889fc Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Thu, 28 Dec 2017 11:44:06 -0600 Subject: [PATCH 209/356] [SPARK-22875][BUILD] Assembly build fails for a high user id ## What changes were proposed in this pull request? Add tarLongFileMode=posix configuration for the assembly plugin ## How was this patch tested? Reran build successfully ``` ./build/mvn package -Pbigtop-dist -DskipTests -rf :spark-assembly_2.11 [INFO] Spark Project Assembly ............................. SUCCESS [ 23.082 s] ``` Author: Gera Shegalov Closes #20055 from gerashegalov/gera/tarLongFileMode. --- pom.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pom.xml b/pom.xml index 92f897095f087..34793b0c41708 100644 --- a/pom.xml +++ b/pom.xml @@ -2295,6 +2295,9 @@ org.apache.maven.plugins maven-assembly-plugin 3.1.0 + + posix + org.apache.maven.plugins From 9c21ece35edc175edde949175a4ce701679824c8 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 28 Dec 2017 15:41:16 -0600 Subject: [PATCH 210/356] [SPARK-22836][UI] Show driver logs in UI when available. Port code from the old executors listener to the new one, so that the driver logs present in the application start event are kept. Author: Marcelo Vanzin Closes #20038 from vanzin/SPARK-22836. 
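The log links in question arrive in the `driverLogs` field of `SparkListenerApplicationStart`, and the listener change below copies them onto the driver's entry in the status store. A sketch of a listener reading the same field, for illustration only (class and method-body details are not part of this patch):

```scala
import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationStart}

// The cluster manager attaches driver log URLs to the application-start event; the new
// AppStatusListener copies them onto the driver's executor entry so the UI can render them.
class DriverLogPrinter extends SparkListener {
  override def onApplicationStart(event: SparkListenerApplicationStart): Unit = {
    event.driverLogs.getOrElse(Map.empty).foreach { case (name, url) =>
      println(s"driver log '$name' -> $url")
    }
  }
}
```

Such a listener could be registered with `sc.addSparkListener`, though the patch itself only changes `AppStatusListener`.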
--- .../spark/status/AppStatusListener.scala | 11 +++++++++++ .../spark/status/AppStatusListenerSuite.scala | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 5253297137323..487a782e865e8 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -119,6 +119,17 @@ private[spark] class AppStatusListener( kvstore.write(new ApplicationInfoWrapper(appInfo)) kvstore.write(appSummary) + + // Update the driver block manager with logs from this event. The SparkContext initialization + // code registers the driver before this event is sent. + event.driverLogs.foreach { logs => + val driver = liveExecutors.get(SparkContext.DRIVER_IDENTIFIER) + .orElse(liveExecutors.get(SparkContext.LEGACY_DRIVER_IDENTIFIER)) + driver.foreach { d => + d.executorLogs = logs.toMap + update(d, System.nanoTime()) + } + } } override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = { diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index c0b3a79fe981e..997c7de8dd02b 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -942,6 +942,24 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } + test("driver logs") { + val listener = new AppStatusListener(store, conf, true) + + val driver = BlockManagerId(SparkContext.DRIVER_IDENTIFIER, "localhost", 42) + listener.onBlockManagerAdded(SparkListenerBlockManagerAdded(time, driver, 42L)) + listener.onApplicationStart(SparkListenerApplicationStart( + "name", + Some("id"), + time, + "user", + Some("attempt"), + Some(Map("stdout" -> "file.txt")))) + + check[ExecutorSummaryWrapper](SparkContext.DRIVER_IDENTIFIER) { d => + assert(d.info.executorLogs("stdout") === "file.txt") + } + } + private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptId) private def check[T: ClassTag](key: Any)(fn: T => Unit): Unit = { From 613b71a1230723ed556239b8b387722043c67252 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Fri, 29 Dec 2017 06:58:38 +0800 Subject: [PATCH 211/356] [SPARK-22890][TEST] Basic tests for DateTimeOperations ## What changes were proposed in this pull request? Test Coverage for `DateTimeOperations`, this is a Sub-tasks for [SPARK-22722](https://issues.apache.org/jira/browse/SPARK-22722). ## How was this patch tested? N/A Author: Yuming Wang Closes #20061 from wangyum/SPARK-22890. 
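The coercion behavior these golden files pin down can also be exercised from the Scala API; a small sketch, illustrative only, with expected values taken from the recorded output below:

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("datetime-ops").getOrCreate()

// Date, timestamp, and string operands are coerced so that interval arithmetic works:
spark.sql("select cast('2017-12-11' as date) + interval 2 day").show()                  // 2017-12-13
spark.sql("select cast('2017-12-11 09:30:00.0' as timestamp) + interval 2 day").show()  // 2017-12-13 09:30:00

// Numeric, boolean, and binary operands are not, and fail analysis:
// spark.sql("select cast(1 as int) + interval 2 day")   // AnalysisException: cannot resolve ...
```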
--- .../native/dateTimeOperations.sql | 60 +++ .../native/dateTimeOperations.sql.out | 349 ++++++++++++++++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 13 +- 3 files changed, 410 insertions(+), 12 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/dateTimeOperations.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/dateTimeOperations.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/dateTimeOperations.sql new file mode 100644 index 0000000000000..1e98221867965 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/dateTimeOperations.sql @@ -0,0 +1,60 @@ +-- +-- Licensed to the Apache Software Foundation (ASF) under one or more +-- contributor license agreements. See the NOTICE file distributed with +-- this work for additional information regarding copyright ownership. +-- The ASF licenses this file to You under the Apache License, Version 2.0 +-- (the "License"); you may not use this file except in compliance with +-- the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. +-- + +CREATE TEMPORARY VIEW t AS SELECT 1; + +select cast(1 as tinyint) + interval 2 day; +select cast(1 as smallint) + interval 2 day; +select cast(1 as int) + interval 2 day; +select cast(1 as bigint) + interval 2 day; +select cast(1 as float) + interval 2 day; +select cast(1 as double) + interval 2 day; +select cast(1 as decimal(10, 0)) + interval 2 day; +select cast('2017-12-11' as string) + interval 2 day; +select cast('2017-12-11 09:30:00' as string) + interval 2 day; +select cast('1' as binary) + interval 2 day; +select cast(1 as boolean) + interval 2 day; +select cast('2017-12-11 09:30:00.0' as timestamp) + interval 2 day; +select cast('2017-12-11 09:30:00' as date) + interval 2 day; + +select interval 2 day + cast(1 as tinyint); +select interval 2 day + cast(1 as smallint); +select interval 2 day + cast(1 as int); +select interval 2 day + cast(1 as bigint); +select interval 2 day + cast(1 as float); +select interval 2 day + cast(1 as double); +select interval 2 day + cast(1 as decimal(10, 0)); +select interval 2 day + cast('2017-12-11' as string); +select interval 2 day + cast('2017-12-11 09:30:00' as string); +select interval 2 day + cast('1' as binary); +select interval 2 day + cast(1 as boolean); +select interval 2 day + cast('2017-12-11 09:30:00.0' as timestamp); +select interval 2 day + cast('2017-12-11 09:30:00' as date); + +select cast(1 as tinyint) - interval 2 day; +select cast(1 as smallint) - interval 2 day; +select cast(1 as int) - interval 2 day; +select cast(1 as bigint) - interval 2 day; +select cast(1 as float) - interval 2 day; +select cast(1 as double) - interval 2 day; +select cast(1 as decimal(10, 0)) - interval 2 day; +select cast('2017-12-11' as string) - interval 2 day; +select cast('2017-12-11 09:30:00' as string) - interval 2 day; +select cast('1' as binary) - interval 2 day; +select cast(1 as boolean) - interval 2 day; +select cast('2017-12-11 09:30:00.0' as timestamp) - 
interval 2 day; +select cast('2017-12-11 09:30:00' as date) - interval 2 day; diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out new file mode 100644 index 0000000000000..12c1d1617679f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/dateTimeOperations.sql.out @@ -0,0 +1,349 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 40 + + +-- !query 0 +CREATE TEMPORARY VIEW t AS SELECT 1 +-- !query 0 schema +struct<> +-- !query 0 output + + + +-- !query 1 +select cast(1 as tinyint) + interval 2 day +-- !query 1 schema +struct<> +-- !query 1 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) + interval 2 days)' (tinyint and calendarinterval).; line 1 pos 7 + + +-- !query 2 +select cast(1 as smallint) + interval 2 day +-- !query 2 schema +struct<> +-- !query 2 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) + interval 2 days)' (smallint and calendarinterval).; line 1 pos 7 + + +-- !query 3 +select cast(1 as int) + interval 2 day +-- !query 3 schema +struct<> +-- !query 3 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS INT) + interval 2 days)' (int and calendarinterval).; line 1 pos 7 + + +-- !query 4 +select cast(1 as bigint) + interval 2 day +-- !query 4 schema +struct<> +-- !query 4 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) + interval 2 days)' (bigint and calendarinterval).; line 1 pos 7 + + +-- !query 5 +select cast(1 as float) + interval 2 day +-- !query 5 schema +struct<> +-- !query 5 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) + interval 2 days)' (float and calendarinterval).; line 1 pos 7 + + +-- !query 6 +select cast(1 as double) + interval 2 day +-- !query 6 schema +struct<> +-- !query 6 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) + interval 2 days)' (double and calendarinterval).; line 1 pos 7 + + +-- !query 7 +select cast(1 as decimal(10, 0)) + interval 2 day +-- !query 7 schema +struct<> +-- !query 7 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) + interval 2 days)' (decimal(10,0) and calendarinterval).; line 1 pos 7 + + +-- !query 8 +select cast('2017-12-11' as string) + interval 2 day +-- !query 8 schema +struct +-- !query 8 output +2017-12-13 00:00:00 + + +-- !query 9 +select cast('2017-12-11 09:30:00' as string) + interval 2 day +-- !query 9 schema +struct +-- !query 9 output +2017-12-13 09:30:00 + + +-- !query 10 +select cast('1' as binary) + interval 2 day +-- !query 10 schema +struct<> +-- !query 10 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) + interval 2 days)' due to data type 
mismatch: differing types in '(CAST('1' AS BINARY) + interval 2 days)' (binary and calendarinterval).; line 1 pos 7 + + +-- !query 11 +select cast(1 as boolean) + interval 2 day +-- !query 11 schema +struct<> +-- !query 11 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) + interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) + interval 2 days)' (boolean and calendarinterval).; line 1 pos 7 + + +-- !query 12 +select cast('2017-12-11 09:30:00.0' as timestamp) + interval 2 day +-- !query 12 schema +struct +-- !query 12 output +2017-12-13 09:30:00 + + +-- !query 13 +select cast('2017-12-11 09:30:00' as date) + interval 2 day +-- !query 13 schema +struct +-- !query 13 output +2017-12-13 + + +-- !query 14 +select interval 2 day + cast(1 as tinyint) +-- !query 14 schema +struct<> +-- !query 14 output +org.apache.spark.sql.AnalysisException +cannot resolve '(interval 2 days + CAST(1 AS TINYINT))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS TINYINT))' (calendarinterval and tinyint).; line 1 pos 7 + + +-- !query 15 +select interval 2 day + cast(1 as smallint) +-- !query 15 schema +struct<> +-- !query 15 output +org.apache.spark.sql.AnalysisException +cannot resolve '(interval 2 days + CAST(1 AS SMALLINT))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS SMALLINT))' (calendarinterval and smallint).; line 1 pos 7 + + +-- !query 16 +select interval 2 day + cast(1 as int) +-- !query 16 schema +struct<> +-- !query 16 output +org.apache.spark.sql.AnalysisException +cannot resolve '(interval 2 days + CAST(1 AS INT))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS INT))' (calendarinterval and int).; line 1 pos 7 + + +-- !query 17 +select interval 2 day + cast(1 as bigint) +-- !query 17 schema +struct<> +-- !query 17 output +org.apache.spark.sql.AnalysisException +cannot resolve '(interval 2 days + CAST(1 AS BIGINT))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS BIGINT))' (calendarinterval and bigint).; line 1 pos 7 + + +-- !query 18 +select interval 2 day + cast(1 as float) +-- !query 18 schema +struct<> +-- !query 18 output +org.apache.spark.sql.AnalysisException +cannot resolve '(interval 2 days + CAST(1 AS FLOAT))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS FLOAT))' (calendarinterval and float).; line 1 pos 7 + + +-- !query 19 +select interval 2 day + cast(1 as double) +-- !query 19 schema +struct<> +-- !query 19 output +org.apache.spark.sql.AnalysisException +cannot resolve '(interval 2 days + CAST(1 AS DOUBLE))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS DOUBLE))' (calendarinterval and double).; line 1 pos 7 + + +-- !query 20 +select interval 2 day + cast(1 as decimal(10, 0)) +-- !query 20 schema +struct<> +-- !query 20 output +org.apache.spark.sql.AnalysisException +cannot resolve '(interval 2 days + CAST(1 AS DECIMAL(10,0)))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS DECIMAL(10,0)))' (calendarinterval and decimal(10,0)).; line 1 pos 7 + + +-- !query 21 +select interval 2 day + cast('2017-12-11' as string) +-- !query 21 schema +struct +-- !query 21 output +2017-12-13 00:00:00 + + +-- !query 22 +select interval 2 day + cast('2017-12-11 09:30:00' as string) +-- !query 22 schema +struct +-- !query 22 output +2017-12-13 09:30:00 + + +-- !query 23 +select interval 2 day + cast('1' as binary) +-- !query 23 schema +struct<> 
+-- !query 23 output +org.apache.spark.sql.AnalysisException +cannot resolve '(interval 2 days + CAST('1' AS BINARY))' due to data type mismatch: differing types in '(interval 2 days + CAST('1' AS BINARY))' (calendarinterval and binary).; line 1 pos 7 + + +-- !query 24 +select interval 2 day + cast(1 as boolean) +-- !query 24 schema +struct<> +-- !query 24 output +org.apache.spark.sql.AnalysisException +cannot resolve '(interval 2 days + CAST(1 AS BOOLEAN))' due to data type mismatch: differing types in '(interval 2 days + CAST(1 AS BOOLEAN))' (calendarinterval and boolean).; line 1 pos 7 + + +-- !query 25 +select interval 2 day + cast('2017-12-11 09:30:00.0' as timestamp) +-- !query 25 schema +struct +-- !query 25 output +2017-12-13 09:30:00 + + +-- !query 26 +select interval 2 day + cast('2017-12-11 09:30:00' as date) +-- !query 26 schema +struct +-- !query 26 output +2017-12-13 + + +-- !query 27 +select cast(1 as tinyint) - interval 2 day +-- !query 27 schema +struct<> +-- !query 27 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS TINYINT) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS TINYINT) - interval 2 days)' (tinyint and calendarinterval).; line 1 pos 7 + + +-- !query 28 +select cast(1 as smallint) - interval 2 day +-- !query 28 schema +struct<> +-- !query 28 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS SMALLINT) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS SMALLINT) - interval 2 days)' (smallint and calendarinterval).; line 1 pos 7 + + +-- !query 29 +select cast(1 as int) - interval 2 day +-- !query 29 schema +struct<> +-- !query 29 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS INT) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS INT) - interval 2 days)' (int and calendarinterval).; line 1 pos 7 + + +-- !query 30 +select cast(1 as bigint) - interval 2 day +-- !query 30 schema +struct<> +-- !query 30 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BIGINT) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS BIGINT) - interval 2 days)' (bigint and calendarinterval).; line 1 pos 7 + + +-- !query 31 +select cast(1 as float) - interval 2 day +-- !query 31 schema +struct<> +-- !query 31 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS FLOAT) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS FLOAT) - interval 2 days)' (float and calendarinterval).; line 1 pos 7 + + +-- !query 32 +select cast(1 as double) - interval 2 day +-- !query 32 schema +struct<> +-- !query 32 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DOUBLE) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS DOUBLE) - interval 2 days)' (double and calendarinterval).; line 1 pos 7 + + +-- !query 33 +select cast(1 as decimal(10, 0)) - interval 2 day +-- !query 33 schema +struct<> +-- !query 33 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS DECIMAL(10,0)) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS DECIMAL(10,0)) - interval 2 days)' (decimal(10,0) and calendarinterval).; line 1 pos 7 + + +-- !query 34 +select cast('2017-12-11' as string) - interval 2 day +-- !query 34 schema +struct +-- !query 34 output +2017-12-09 00:00:00 + + +-- !query 35 +select cast('2017-12-11 09:30:00' as string) - interval 2 day +-- !query 35 schema +struct 
+-- !query 35 output +2017-12-09 09:30:00 + + +-- !query 36 +select cast('1' as binary) - interval 2 day +-- !query 36 schema +struct<> +-- !query 36 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST('1' AS BINARY) - interval 2 days)' due to data type mismatch: differing types in '(CAST('1' AS BINARY) - interval 2 days)' (binary and calendarinterval).; line 1 pos 7 + + +-- !query 37 +select cast(1 as boolean) - interval 2 day +-- !query 37 schema +struct<> +-- !query 37 output +org.apache.spark.sql.AnalysisException +cannot resolve '(CAST(1 AS BOOLEAN) - interval 2 days)' due to data type mismatch: differing types in '(CAST(1 AS BOOLEAN) - interval 2 days)' (boolean and calendarinterval).; line 1 pos 7 + + +-- !query 38 +select cast('2017-12-11 09:30:00.0' as timestamp) - interval 2 day +-- !query 38 schema +struct +-- !query 38 output +2017-12-09 09:30:00 + + +-- !query 39 +select cast('2017-12-11 09:30:00' as date) - interval 2 day +-- !query 39 schema +struct +-- !query 39 output +2017-12-09 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 1972dec7b86ce..5e077285ade55 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql import java.io.File import java.math.MathContext import java.net.{MalformedURLException, URL} -import java.sql.{Date, Timestamp} +import java.sql.Timestamp import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark.{AccumulatorSuite, SparkException} @@ -2760,17 +2760,6 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { } } - test("SPARK-22894: DateTimeOperations should accept SQL like string type") { - val date = "2017-12-24" - val str = sql(s"SELECT CAST('$date' as STRING) + interval 2 months 2 seconds") - val dt = sql(s"SELECT CAST('$date' as DATE) + interval 2 months 2 seconds") - val ts = sql(s"SELECT CAST('$date' as TIMESTAMP) + interval 2 months 2 seconds") - - checkAnswer(str, Row("2018-02-24 00:00:02") :: Nil) - checkAnswer(dt, Row(Date.valueOf("2018-02-24")) :: Nil) - checkAnswer(ts, Row(Timestamp.valueOf("2018-02-24 00:00:02")) :: Nil) - } - // Only New OrcFileFormat supports this Seq(classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName, "parquet").foreach { format => From cfcd746689c2b84824745fa6d327ffb584c7a17d Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 28 Dec 2017 17:00:49 -0600 Subject: [PATCH 212/356] [SPARK-11035][CORE] Add in-process Spark app launcher. This change adds a new launcher that allows applications to be run in a separate thread in the same process as the calling code. To achieve that, some code from the child process implementation was moved to abstract classes that implement the common functionality, and the new launcher inherits from those. The new launcher was added as a new class, instead of implemented as a new option to the existing SparkLauncher, to avoid ambigous APIs. For example, SparkLauncher has ways to set the child app's environment, modify SPARK_HOME, or control the logging of the child process, none of which apply to in-process apps. The in-process launcher has limitations: it needs Spark in the context class loader of the calling thread, and it's bound by Spark's current limitation of a single client-mode application per JVM. 
It also relies on the recently added SparkApplication trait to make sure different apps don't mess up each other's configuration, so config isolation is currently limited to cluster mode. I also chose to keep the same socket-based communication for in-process apps, even though it might be possible to avoid it for in-process mode. That helps both implementations share more code. Tested with new and existing unit tests, and with a simple app that uses the launcher; also made sure the app ran fine with older launcher jar to check binary compatibility. Author: Marcelo Vanzin Closes #19591 from vanzin/SPARK-11035. --- core/pom.xml | 8 + .../spark/launcher/LauncherBackend.scala | 11 +- .../cluster/StandaloneSchedulerBackend.scala | 1 + .../local/LocalSchedulerBackend.scala | 1 + .../spark/launcher/SparkLauncherSuite.java | 70 +++- .../spark/launcher/AbstractAppHandle.java | 129 +++++++ .../spark/launcher/AbstractLauncher.java | 307 +++++++++++++++ .../spark/launcher/ChildProcAppHandle.java | 117 +----- .../spark/launcher/InProcessAppHandle.java | 83 +++++ .../spark/launcher/InProcessLauncher.java | 110 ++++++ .../spark/launcher/LauncherProtocol.java | 6 + .../apache/spark/launcher/LauncherServer.java | 113 +++--- .../apache/spark/launcher/SparkLauncher.java | 351 +++++------------- .../launcher/SparkSubmitCommandBuilder.java | 2 +- .../apache/spark/launcher/package-info.java | 28 +- .../org/apache/spark/launcher/BaseSuite.java | 35 +- .../launcher/ChildProcAppHandleSuite.java | 16 - .../launcher/InProcessLauncherSuite.java | 170 +++++++++ .../spark/launcher/LauncherServerSuite.java | 82 ++-- .../MesosCoarseGrainedSchedulerBackend.scala | 2 + .../org/apache/spark/deploy/yarn/Client.scala | 2 + 21 files changed, 1139 insertions(+), 505 deletions(-) create mode 100644 launcher/src/main/java/org/apache/spark/launcher/AbstractAppHandle.java create mode 100644 launcher/src/main/java/org/apache/spark/launcher/AbstractLauncher.java create mode 100644 launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java create mode 100644 launcher/src/main/java/org/apache/spark/launcher/InProcessLauncher.java create mode 100644 launcher/src/test/java/org/apache/spark/launcher/InProcessLauncherSuite.java diff --git a/core/pom.xml b/core/pom.xml index fa138d3e7a4e0..0a5bd958fc9c5 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -351,6 +351,14 @@ spark-tags_${scala.binary.version}
    + + org.apache.spark + spark-launcher_${scala.binary.version} + ${project.version} + tests + test + + 0.10.2 - 4.5.2 - 4.4.4 + 4.5.4 + 4.4.8 3.1 3.4.1 From ea0a5eef2238daa68a15b60a6f1a74c361216140 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 31 Dec 2017 02:50:00 +0900 Subject: [PATCH 235/356] [SPARK-22924][SPARKR] R API for sortWithinPartitions ## What changes were proposed in this pull request? Add to `arrange` the option to sort only within partition ## How was this patch tested? manual, unit tests Author: Felix Cheung Closes #20118 from felixcheung/rsortwithinpartition. --- R/pkg/R/DataFrame.R | 14 ++++++++++---- R/pkg/tests/fulltests/test_sparkSQL.R | 5 +++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ace49daf9cd83..fe238f6dd4eb0 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2297,6 +2297,7 @@ setClassUnion("characterOrColumn", c("character", "Column")) #' @param ... additional sorting fields #' @param decreasing a logical argument indicating sorting order for columns when #' a character vector is specified for col +#' @param withinPartitions a logical argument indicating whether to sort only within each partition #' @return A SparkDataFrame where all elements are sorted. #' @family SparkDataFrame functions #' @aliases arrange,SparkDataFrame,Column-method @@ -2312,16 +2313,21 @@ setClassUnion("characterOrColumn", c("character", "Column")) #' arrange(df, asc(df$col1), desc(abs(df$col2))) #' arrange(df, "col1", decreasing = TRUE) #' arrange(df, "col1", "col2", decreasing = c(TRUE, FALSE)) +#' arrange(df, "col1", "col2", withinPartitions = TRUE) #' } #' @note arrange(SparkDataFrame, Column) since 1.4.0 setMethod("arrange", signature(x = "SparkDataFrame", col = "Column"), - function(x, col, ...) { + function(x, col, ..., withinPartitions = FALSE) { jcols <- lapply(list(col, ...), function(c) { c@jc }) - sdf <- callJMethod(x@sdf, "sort", jcols) + if (withinPartitions) { + sdf <- callJMethod(x@sdf, "sortWithinPartitions", jcols) + } else { + sdf <- callJMethod(x@sdf, "sort", jcols) + } dataFrame(sdf) }) @@ -2332,7 +2338,7 @@ setMethod("arrange", #' @note arrange(SparkDataFrame, character) since 1.4.0 setMethod("arrange", signature(x = "SparkDataFrame", col = "character"), - function(x, col, ..., decreasing = FALSE) { + function(x, col, ..., decreasing = FALSE, withinPartitions = FALSE) { # all sorting columns by <- list(col, ...) @@ -2356,7 +2362,7 @@ setMethod("arrange", } }) - do.call("arrange", c(x, jcols)) + do.call("arrange", c(x, jcols, withinPartitions = withinPartitions)) }) #' @rdname arrange diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index 1b7d53fd73a08..5197838eaac66 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -2130,6 +2130,11 @@ test_that("arrange() and orderBy() on a DataFrame", { sorted7 <- arrange(df, "name", decreasing = FALSE) expect_equal(collect(sorted7)[2, "age"], 19) + + df <- createDataFrame(cars, numPartitions = 10) + expect_equal(getNumPartitions(df), 10) + sorted8 <- arrange(df, "dist", withinPartitions = TRUE) + expect_equal(collect(sorted8)[5:6, "dist"], c(22, 10)) }) test_that("filter() on a DataFrame", { From ee3af15fea18356a9223d61cfe6aaa98ab4dc733 Mon Sep 17 00:00:00 2001 From: Gabor Somogyi Date: Sun, 31 Dec 2017 14:47:23 +0800 Subject: [PATCH 236/356] [SPARK-22363][SQL][TEST] Add unit test for Window spilling ## What changes were proposed in this pull request? 
There is already test using window spilling, but the test coverage is not ideal. In this PR the already existing test was fixed and additional cases added. ## How was this patch tested? Automated: Pass the Jenkins. Author: Gabor Somogyi Closes #20022 from gaborgsomogyi/SPARK-22363. --- .../sql/DataFrameWindowFunctionsSuite.scala | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala index ea725af8d1ad8..01c988ecc3726 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameWindowFunctionsSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql import java.sql.{Date, Timestamp} +import org.apache.spark.TestUtils.{assertNotSpilled, assertSpilled} import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction, Window} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf @@ -518,9 +519,46 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { Seq(Row(3, "1", null, 3.0, 4.0, 3.0), Row(5, "1", false, 4.0, 5.0, 5.0))) } + test("Window spill with less than the inMemoryThreshold") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + withSQLConf(SQLConf.WINDOW_EXEC_BUFFER_IN_MEMORY_THRESHOLD.key -> "2", + SQLConf.WINDOW_EXEC_BUFFER_SPILL_THRESHOLD.key -> "2") { + assertNotSpilled(sparkContext, "select") { + df.select($"key", sum("value").over(window)).collect() + } + } + } + + test("Window spill with more than the inMemoryThreshold but less than the spillThreshold") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + withSQLConf(SQLConf.WINDOW_EXEC_BUFFER_IN_MEMORY_THRESHOLD.key -> "1", + SQLConf.WINDOW_EXEC_BUFFER_SPILL_THRESHOLD.key -> "2") { + assertNotSpilled(sparkContext, "select") { + df.select($"key", sum("value").over(window)).collect() + } + } + } + + test("Window spill with more than the inMemoryThreshold and spillThreshold") { + val df = Seq((1, "1"), (2, "2"), (1, "3"), (2, "4")).toDF("key", "value") + val window = Window.partitionBy($"key").orderBy($"value") + + withSQLConf(SQLConf.WINDOW_EXEC_BUFFER_IN_MEMORY_THRESHOLD.key -> "1", + SQLConf.WINDOW_EXEC_BUFFER_SPILL_THRESHOLD.key -> "1") { + assertSpilled(sparkContext, "select") { + df.select($"key", sum("value").over(window)).collect() + } + } + } + test("SPARK-21258: complex object in combination with spilling") { // Make sure we trigger the spilling path. - withSQLConf(SQLConf.WINDOW_EXEC_BUFFER_SPILL_THRESHOLD.key -> "17") { + withSQLConf(SQLConf.WINDOW_EXEC_BUFFER_IN_MEMORY_THRESHOLD.key -> "1", + SQLConf.WINDOW_EXEC_BUFFER_SPILL_THRESHOLD.key -> "17") { val sampleSchema = new StructType(). add("f0", StringType). add("f1", LongType). 
@@ -558,7 +596,9 @@ class DataFrameWindowFunctionsSuite extends QueryTest with SharedSQLContext { import testImplicits._ - spark.read.schema(sampleSchema).json(input.toDS()).select(c0, c1).foreach { _ => () } + assertSpilled(sparkContext, "select") { + spark.read.schema(sampleSchema).json(input.toDS()).select(c0, c1).foreach { _ => () } + } } } } From cfbe11e8164c04cd7d388e4faeded21a9331dac4 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sun, 31 Dec 2017 15:06:54 +0800 Subject: [PATCH 237/356] [SPARK-22895][SQL] Push down the deterministic predicates that are after the first non-deterministic ## What changes were proposed in this pull request? Currently, we do not guarantee an order evaluation of conjuncts in either Filter or Join operator. This is also true to the mainstream RDBMS vendors like DB2 and MS SQL Server. Thus, we should also push down the deterministic predicates that are after the first non-deterministic, if possible. ## How was this patch tested? Updated the existing test cases. Author: gatorsmile Closes #20069 from gatorsmile/morePushDown. --- docs/sql-programming-guide.md | 1 + .../sql/catalyst/optimizer/Optimizer.scala | 40 ++++++++----------- .../optimizer/FilterPushdownSuite.scala | 33 +++++++-------- .../v2/PushDownOperatorsToDataSource.scala | 10 ++--- .../execution/python/ExtractPythonUDFs.scala | 6 +-- .../StreamingSymmetricHashJoinHelper.scala | 5 +-- .../python/BatchEvalPythonExecSuite.scala | 10 +++-- ...treamingSymmetricHashJoinHelperSuite.scala | 14 +++---- 8 files changed, 54 insertions(+), 65 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 4b5f56c44444d..dc3e384008d27 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1636,6 +1636,7 @@ options. - Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the referenced columns only include the internal corrupt record column (named `_corrupt_record` by default). For example, `spark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count()` and `spark.read.schema(schema).json(file).select("_corrupt_record").show()`. Instead, you can cache or save the parsed results and then send the same query. For example, `val df = spark.read.schema(schema).json(file).cache()` and then `df.filter($"_corrupt_record".isNotNull).count()`. - The `percentile_approx` function previously accepted numeric type input and output double type results. Now it supports date type, timestamp type and numeric types as input types. The result type is also changed to be the same as the input type, which is more reasonable for percentiles. + - Since Spark 2.3, the Join/Filter's deterministic predicates that are after the first non-deterministic predicates are also pushed down/through the child operators, if possible. In prior Spark versions, these filters are not eligible for predicate pushdown. - Partition column inference previously found incorrect common type for different inferred types, for example, previously it ended up with double type as the common type for double type and date type. Now it finds the correct common type for such conflicts. 
The conflict resolution follows the table below: diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index eeb1b130c578b..0d4b02c6e7d8a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -805,15 +805,15 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { // For each filter, expand the alias and check if the filter can be evaluated using // attributes produced by the aggregate operator's child operator. - val (candidates, containingNonDeterministic) = - splitConjunctivePredicates(condition).span(_.deterministic) + val (candidates, nonDeterministic) = + splitConjunctivePredicates(condition).partition(_.deterministic) val (pushDown, rest) = candidates.partition { cond => val replaced = replaceAlias(cond, aliasMap) cond.references.nonEmpty && replaced.references.subsetOf(aggregate.child.outputSet) } - val stayUp = rest ++ containingNonDeterministic + val stayUp = rest ++ nonDeterministic if (pushDown.nonEmpty) { val pushDownPredicate = pushDown.reduce(And) @@ -835,14 +835,14 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { if w.partitionSpec.forall(_.isInstanceOf[AttributeReference]) => val partitionAttrs = AttributeSet(w.partitionSpec.flatMap(_.references)) - val (candidates, containingNonDeterministic) = - splitConjunctivePredicates(condition).span(_.deterministic) + val (candidates, nonDeterministic) = + splitConjunctivePredicates(condition).partition(_.deterministic) val (pushDown, rest) = candidates.partition { cond => cond.references.subsetOf(partitionAttrs) } - val stayUp = rest ++ containingNonDeterministic + val stayUp = rest ++ nonDeterministic if (pushDown.nonEmpty) { val pushDownPredicate = pushDown.reduce(And) @@ -854,7 +854,7 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { case filter @ Filter(condition, union: Union) => // Union could change the rows, so non-deterministic predicate can't be pushed down - val (pushDown, stayUp) = splitConjunctivePredicates(condition).span(_.deterministic) + val (pushDown, stayUp) = splitConjunctivePredicates(condition).partition(_.deterministic) if (pushDown.nonEmpty) { val pushDownCond = pushDown.reduceLeft(And) @@ -878,13 +878,9 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { } case filter @ Filter(condition, watermark: EventTimeWatermark) => - // We can only push deterministic predicates which don't reference the watermark attribute. - // We could in theory span() only on determinism and pull out deterministic predicates - // on the watermark separately. But it seems unnecessary and a bit confusing to not simply - // use the prefix as we do for nondeterminism in other cases. - - val (pushDown, stayUp) = splitConjunctivePredicates(condition).span( - p => p.deterministic && !p.references.contains(watermark.eventTime)) + val (pushDown, stayUp) = splitConjunctivePredicates(condition).partition { p => + p.deterministic && !p.references.contains(watermark.eventTime) + } if (pushDown.nonEmpty) { val pushDownPredicate = pushDown.reduceLeft(And) @@ -925,14 +921,14 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { // come from grandchild. // TODO: non-deterministic predicates could be pushed through some operators that do not change // the rows. 
- val (candidates, containingNonDeterministic) = - splitConjunctivePredicates(filter.condition).span(_.deterministic) + val (candidates, nonDeterministic) = + splitConjunctivePredicates(filter.condition).partition(_.deterministic) val (pushDown, rest) = candidates.partition { cond => cond.references.subsetOf(grandchild.outputSet) } - val stayUp = rest ++ containingNonDeterministic + val stayUp = rest ++ nonDeterministic if (pushDown.nonEmpty) { val newChild = insertFilter(pushDown.reduceLeft(And)) @@ -975,23 +971,19 @@ object PushPredicateThroughJoin extends Rule[LogicalPlan] with PredicateHelper { /** * Splits join condition expressions or filter predicates (on a given join's output) into three * categories based on the attributes required to evaluate them. Note that we explicitly exclude - * on-deterministic (i.e., stateful) condition expressions in canEvaluateInLeft or + * non-deterministic (i.e., stateful) condition expressions in canEvaluateInLeft or * canEvaluateInRight to prevent pushing these predicates on either side of the join. * * @return (canEvaluateInLeft, canEvaluateInRight, haveToEvaluateInBoth) */ private def split(condition: Seq[Expression], left: LogicalPlan, right: LogicalPlan) = { - // Note: In order to ensure correctness, it's important to not change the relative ordering of - // any deterministic expression that follows a non-deterministic expression. To achieve this, - // we only consider pushing down those expressions that precede the first non-deterministic - // expression in the condition. - val (pushDownCandidates, containingNonDeterministic) = condition.span(_.deterministic) + val (pushDownCandidates, nonDeterministic) = condition.partition(_.deterministic) val (leftEvaluateCondition, rest) = pushDownCandidates.partition(_.references.subsetOf(left.outputSet)) val (rightEvaluateCondition, commonCondition) = rest.partition(expr => expr.references.subsetOf(right.outputSet)) - (leftEvaluateCondition, rightEvaluateCondition, commonCondition ++ containingNonDeterministic) + (leftEvaluateCondition, rightEvaluateCondition, commonCondition ++ nonDeterministic) } def apply(plan: LogicalPlan): LogicalPlan = plan transform { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index a9c2306e37fd5..85a5e979f6021 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -831,9 +831,9 @@ class FilterPushdownSuite extends PlanTest { val optimized = Optimize.execute(originalQuery.analyze) val correctAnswer = Union(Seq( - testRelation.where('a === 2L), - testRelation2.where('d === 2L))) - .where('b + Rand(10).as("rnd") === 3 && 'c > 5L) + testRelation.where('a === 2L && 'c > 5L), + testRelation2.where('d === 2L && 'f > 5L))) + .where('b + Rand(10).as("rnd") === 3) .analyze comparePlans(optimized, correctAnswer) @@ -1134,12 +1134,13 @@ class FilterPushdownSuite extends PlanTest { val x = testRelation.subquery('x) val y = testRelation.subquery('y) - // Verify that all conditions preceding the first non-deterministic condition are pushed down + // Verify that all conditions except the watermark touching condition are pushed down // by the optimizer and others are not. 
val originalQuery = x.join(y, condition = Some("x.a".attr === 5 && "y.a".attr === 5 && "x.a".attr === Rand(10) && "y.b".attr === 5)) - val correctAnswer = x.where("x.a".attr === 5).join(y.where("y.a".attr === 5), - condition = Some("x.a".attr === Rand(10) && "y.b".attr === 5)) + val correctAnswer = + x.where("x.a".attr === 5).join(y.where("y.a".attr === 5 && "y.b".attr === 5), + condition = Some("x.a".attr === Rand(10))) // CheckAnalysis will ensure nondeterministic expressions not appear in join condition. // TODO support nondeterministic expressions in join condition. @@ -1147,16 +1148,16 @@ class FilterPushdownSuite extends PlanTest { checkAnalysis = false) } - test("watermark pushdown: no pushdown on watermark attribute") { + test("watermark pushdown: no pushdown on watermark attribute #1") { val interval = new CalendarInterval(2, 2000L) - // Verify that all conditions preceding the first watermark touching condition are pushed down + // Verify that all conditions except the watermark touching condition are pushed down // by the optimizer and others are not. val originalQuery = EventTimeWatermark('b, interval, testRelation) .where('a === 5 && 'b === 10 && 'c === 5) val correctAnswer = EventTimeWatermark( - 'b, interval, testRelation.where('a === 5)) - .where('b === 10 && 'c === 5) + 'b, interval, testRelation.where('a === 5 && 'c === 5)) + .where('b === 10) comparePlans(Optimize.execute(originalQuery.analyze), correctAnswer.analyze, checkAnalysis = false) @@ -1165,7 +1166,7 @@ class FilterPushdownSuite extends PlanTest { test("watermark pushdown: no pushdown for nondeterministic filter") { val interval = new CalendarInterval(2, 2000L) - // Verify that all conditions preceding the first watermark touching condition are pushed down + // Verify that all conditions except the watermark touching condition are pushed down // by the optimizer and others are not. val originalQuery = EventTimeWatermark('c, interval, testRelation) .where('a === 5 && 'b === Rand(10) && 'c === 5) @@ -1180,7 +1181,7 @@ class FilterPushdownSuite extends PlanTest { test("watermark pushdown: full pushdown") { val interval = new CalendarInterval(2, 2000L) - // Verify that all conditions preceding the first watermark touching condition are pushed down + // Verify that all conditions except the watermark touching condition are pushed down // by the optimizer and others are not. val originalQuery = EventTimeWatermark('c, interval, testRelation) .where('a === 5 && 'b === 10) @@ -1191,15 +1192,15 @@ class FilterPushdownSuite extends PlanTest { checkAnalysis = false) } - test("watermark pushdown: empty pushdown") { + test("watermark pushdown: no pushdown on watermark attribute #2") { val interval = new CalendarInterval(2, 2000L) - // Verify that all conditions preceding the first watermark touching condition are pushed down - // by the optimizer and others are not. 
val originalQuery = EventTimeWatermark('a, interval, testRelation) .where('a === 5 && 'b === 10) + val correctAnswer = EventTimeWatermark( + 'a, interval, testRelation.where('b === 10)).where('a === 5) - comparePlans(Optimize.execute(originalQuery.analyze), originalQuery.analyze, + comparePlans(Optimize.execute(originalQuery.analyze), correctAnswer.analyze, checkAnalysis = false) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala index 0c1708131ae46..df034adf1e7d6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownOperatorsToDataSource.scala @@ -40,12 +40,8 @@ object PushDownOperatorsToDataSource extends Rule[LogicalPlan] with PredicateHel // top-down, then we can simplify the logic here and only collect target operators. val filterPushed = plan transformUp { case FilterAndProject(fields, condition, r @ DataSourceV2Relation(_, reader)) => - // Non-deterministic expressions are stateful and we must keep the input sequence unchanged - // to avoid changing the result. This means, we can't evaluate the filter conditions that - // are after the first non-deterministic condition ahead. Here we only try to push down - // deterministic conditions that are before the first non-deterministic condition. - val (candidates, containingNonDeterministic) = - splitConjunctivePredicates(condition).span(_.deterministic) + val (candidates, nonDeterministic) = + splitConjunctivePredicates(condition).partition(_.deterministic) val stayUpFilters: Seq[Expression] = reader match { case r: SupportsPushDownCatalystFilters => @@ -74,7 +70,7 @@ object PushDownOperatorsToDataSource extends Rule[LogicalPlan] with PredicateHel case _ => candidates } - val filterCondition = (stayUpFilters ++ containingNonDeterministic).reduceLeftOption(And) + val filterCondition = (stayUpFilters ++ nonDeterministic).reduceLeftOption(And) val withFilter = filterCondition.map(Filter(_, r)).getOrElse(r) if (withFilter.output == fields) { withFilter diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala index f5a4cbc4793e3..2f53fe788c7d0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ExtractPythonUDFs.scala @@ -202,12 +202,12 @@ object ExtractPythonUDFs extends Rule[SparkPlan] with PredicateHelper { private def trySplitFilter(plan: SparkPlan): SparkPlan = { plan match { case filter: FilterExec => - val (candidates, containingNonDeterministic) = - splitConjunctivePredicates(filter.condition).span(_.deterministic) + val (candidates, nonDeterministic) = + splitConjunctivePredicates(filter.condition).partition(_.deterministic) val (pushDown, rest) = candidates.partition(!hasPythonUDF(_)) if (pushDown.nonEmpty) { val newChild = FilterExec(pushDown.reduceLeft(And), filter.child) - FilterExec((rest ++ containingNonDeterministic).reduceLeft(And), newChild) + FilterExec((rest ++ nonDeterministic).reduceLeft(And), newChild) } else { filter } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala index 167e991ca62f8..217e98a6419e9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala @@ -72,8 +72,7 @@ object StreamingSymmetricHashJoinHelper extends Logging { * left AND right AND joined is equivalent to full. * * Note that left and right do not necessarily contain *all* conjuncts which satisfy - * their condition. Any conjuncts after the first nondeterministic one are treated as - * nondeterministic for purposes of the split. + * their condition. * * @param leftSideOnly Deterministic conjuncts which reference only the left side of the join. * @param rightSideOnly Deterministic conjuncts which reference only the right side of the join. @@ -111,7 +110,7 @@ object StreamingSymmetricHashJoinHelper extends Logging { // Span rather than partition, because nondeterministic expressions don't commute // across AND. val (deterministicConjuncts, nonDeterministicConjuncts) = - splitConjunctivePredicates(condition.get).span(_.deterministic) + splitConjunctivePredicates(condition.get).partition(_.deterministic) val (leftConjuncts, nonLeftConjuncts) = deterministicConjuncts.partition { cond => cond.references.subsetOf(left.outputSet) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala index 9e4a2e8776956..d456c931f5275 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExecSuite.scala @@ -75,13 +75,17 @@ class BatchEvalPythonExecSuite extends SparkPlanTest with SharedSQLContext { assert(qualifiedPlanNodes.size == 2) } - test("Python UDF: no push down on predicates starting from the first non-deterministic") { + test("Python UDF: push down on deterministic predicates after the first non-deterministic") { val df = Seq(("Hello", 4)).toDF("a", "b") .where("dummyPythonUDF(a) and rand() > 0.3 and b > 4") + val qualifiedPlanNodes = df.queryExecution.executedPlan.collect { - case f @ FilterExec(And(_: And, _: GreaterThan), InputAdapter(_: BatchEvalPythonExec)) => f + case f @ FilterExec( + And(_: AttributeReference, _: GreaterThan), + InputAdapter(_: BatchEvalPythonExec)) => f + case b @ BatchEvalPythonExec(_, _, WholeStageCodegenExec(_: FilterExec)) => b } - assert(qualifiedPlanNodes.size == 1) + assert(qualifiedPlanNodes.size == 2) } test("Python UDF refers to the attributes from more than one child") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingSymmetricHashJoinHelperSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingSymmetricHashJoinHelperSuite.scala index 2a854e37bf0df..69b7154895341 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingSymmetricHashJoinHelperSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingSymmetricHashJoinHelperSuite.scala @@ -18,10 +18,8 @@ package org.apache.spark.sql.streaming import org.apache.spark.sql.Column -import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer import org.apache.spark.sql.catalyst.expressions.AttributeReference -import org.apache.spark.sql.execution.{LeafExecNode, 
LocalTableScanExec, SparkPlan} -import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec +import org.apache.spark.sql.execution.LocalTableScanExec import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper.JoinConditionSplitPredicates import org.apache.spark.sql.types._ @@ -95,19 +93,17 @@ class StreamingSymmetricHashJoinHelperSuite extends StreamTest { } test("conjuncts after nondeterministic") { - // All conjuncts after a nondeterministic conjunct shouldn't be split because they don't - // commute across it. val predicate = - (rand() > lit(0) + (rand(9) > lit(0) && leftColA > leftColB && rightColC > rightColD && leftColA === rightColC && lit(1) === lit(1)).expr val split = JoinConditionSplitPredicates(Some(predicate), left, right) - assert(split.leftSideOnly.isEmpty) - assert(split.rightSideOnly.isEmpty) - assert(split.bothSides.contains(predicate)) + assert(split.leftSideOnly.contains((leftColA > leftColB && lit(1) === lit(1)).expr)) + assert(split.rightSideOnly.contains((rightColC > rightColD && lit(1) === lit(1)).expr)) + assert(split.bothSides.contains((leftColA === rightColC && rand(9) > lit(0)).expr)) assert(split.full.contains(predicate)) } From 3d8837e59aadd726805371041567ceff375194c0 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Sun, 31 Dec 2017 14:39:24 +0200 Subject: [PATCH 238/356] [SPARK-22397][ML] add multiple columns support to QuantileDiscretizer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? Add multiple columns support to QuantileDiscretizer. When calculating the splits, we can either merge all the probabilities into one array and compute approxQuantiles on multiple columns at once, or compute approxQuantiles separately for each column. After doing the performance comparison, we found it is better to calculate approxQuantiles on multiple columns at once. Here is how we measured the performance: ``` var duration = 0.0 for (i <- 0 until 10) { val start = System.nanoTime() discretizer.fit(df) val end = System.nanoTime() duration += (end - start) / 1e9 } println(duration / 10) ``` Here is the performance test result: |numCols |NumRows | compute each approxQuantiles separately|compute multiple columns approxQuantiles at one time| |--------|----------|--------------------------------|-------------------------------------------| |10 |60 |0.3623195839 |0.1626658607 | |10 |6000 |0.7537239841 |0.3869370046 | |22 |6000 |1.6497598557 |0.4767903059 | |50 |6000 |3.2268305752 |0.7217818396 | ## How was this patch tested? Added unit tests in QuantileDiscretizerSuite to cover the multiple columns support. Author: Huaxin Gao Closes #19715 from huaxingao/spark_22397.
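For illustration only (not part of the patch): a minimal usage sketch of the multi-column API described above, assuming a running `SparkSession` named `spark`; the data, column names, and bucket counts are invented for the example.

```scala
import org.apache.spark.ml.feature.QuantileDiscretizer

// Hypothetical input: two numeric columns to bucketize independently.
val df = spark.createDataFrame(Seq(
  (1.0, 10.0), (2.0, 25.0), (3.0, 30.0), (4.0, 45.0), (5.0, 50.0), (6.0, 65.0)
)).toDF("input1", "input2")

// Per-column bucket counts via setNumBucketsArray; setNumBuckets would instead
// apply one bucket count across all input columns.
val discretizer = new QuantileDiscretizer()
  .setInputCols(Array("input1", "input2"))
  .setOutputCols(Array("result1", "result2"))
  .setNumBucketsArray(Array(2, 3))

// fit() computes the splits with a single multi-column approxQuantile pass and
// returns a Bucketizer that performs the actual column transformation.
discretizer.fit(df).transform(df).show()
```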
--- .../ml/feature/QuantileDiscretizer.scala | 120 ++++++-- .../ml/feature/QuantileDiscretizerSuite.scala | 265 ++++++++++++++++++ 2 files changed, 369 insertions(+), 16 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala index 95e8830283dee..1ec5f8cb6139b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/QuantileDiscretizer.scala @@ -22,7 +22,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.ml._ import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ -import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCol, HasInputCols, HasOutputCol, HasOutputCols} import org.apache.spark.ml.util._ import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.StructType @@ -50,10 +50,28 @@ private[feature] trait QuantileDiscretizerBase extends Params /** @group getParam */ def getNumBuckets: Int = getOrDefault(numBuckets) + /** + * Array of number of buckets (quantiles, or categories) into which data points are grouped. + * Each value must be greater than or equal to 2 + * + * See also [[handleInvalid]], which can optionally create an additional bucket for NaN values. + * + * @group param + */ + val numBucketsArray = new IntArrayParam(this, "numBucketsArray", "Array of number of buckets " + + "(quantiles, or categories) into which data points are grouped. This is for multiple " + + "columns input. If transforming multiple columns and numBucketsArray is not set, but " + + "numBuckets is set, then numBuckets will be applied across all columns.", + (arrayOfNumBuckets: Array[Int]) => arrayOfNumBuckets.forall(ParamValidators.gtEq(2))) + + /** @group getParam */ + def getNumBucketsArray: Array[Int] = $(numBucketsArray) + /** * Relative error (see documentation for * `org.apache.spark.sql.DataFrameStatFunctions.approxQuantile` for description) * Must be in the range [0, 1]. + * Note that in multiple columns case, relative error is applied to all columns. * default: 0.001 * @group param */ @@ -68,7 +86,9 @@ private[feature] trait QuantileDiscretizerBase extends Params /** * Param for how to handle invalid entries. Options are 'skip' (filter out rows with * invalid values), 'error' (throw an error), or 'keep' (keep invalid values in a special - * additional bucket). + * additional bucket). Note that in the multiple columns case, the invalid handling is applied + * to all columns. That said for 'error' it will throw an error if any invalids are found in + * any column, for 'skip' it will skip rows with any invalids in any columns, etc. * Default: "error" * @group param */ @@ -86,6 +106,11 @@ private[feature] trait QuantileDiscretizerBase extends Params * categorical features. The number of bins can be set using the `numBuckets` parameter. It is * possible that the number of buckets used will be smaller than this value, for example, if there * are too few distinct values of the input to create enough distinct quantiles. + * Since 2.3.0, `QuantileDiscretizer` can map multiple columns at once by setting the `inputCols` + * parameter. If both of the `inputCol` and `inputCols` parameters are set, an Exception will be + * thrown. 
To specify the number of buckets for each column, the `numBucketsArray` parameter can + * be set, or if the number of buckets should be the same across columns, `numBuckets` can be + * set as a convenience. * * NaN handling: * null and NaN values will be ignored from the column during `QuantileDiscretizer` fitting. This @@ -104,7 +129,8 @@ private[feature] trait QuantileDiscretizerBase extends Params */ @Since("1.6.0") final class QuantileDiscretizer @Since("1.6.0") (@Since("1.6.0") override val uid: String) - extends Estimator[Bucketizer] with QuantileDiscretizerBase with DefaultParamsWritable { + extends Estimator[Bucketizer] with QuantileDiscretizerBase with DefaultParamsWritable + with HasInputCols with HasOutputCols { @Since("1.6.0") def this() = this(Identifiable.randomUID("quantileDiscretizer")) @@ -129,34 +155,96 @@ final class QuantileDiscretizer @Since("1.6.0") (@Since("1.6.0") override val ui @Since("2.1.0") def setHandleInvalid(value: String): this.type = set(handleInvalid, value) + /** @group setParam */ + @Since("2.3.0") + def setNumBucketsArray(value: Array[Int]): this.type = set(numBucketsArray, value) + + /** @group setParam */ + @Since("2.3.0") + def setInputCols(value: Array[String]): this.type = set(inputCols, value) + + /** @group setParam */ + @Since("2.3.0") + def setOutputCols(value: Array[String]): this.type = set(outputCols, value) + + private[feature] def getInOutCols: (Array[String], Array[String]) = { + require((isSet(inputCol) && isSet(outputCol) && !isSet(inputCols) && !isSet(outputCols)) || + (!isSet(inputCol) && !isSet(outputCol) && isSet(inputCols) && isSet(outputCols)), + "QuantileDiscretizer only supports setting either inputCol/outputCol or" + + "inputCols/outputCols." + ) + + if (isSet(inputCol)) { + (Array($(inputCol)), Array($(outputCol))) + } else { + require($(inputCols).length == $(outputCols).length, + "inputCols number do not match outputCols") + ($(inputCols), $(outputCols)) + } + } + @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { - SchemaUtils.checkNumericType(schema, $(inputCol)) - val inputFields = schema.fields - require(inputFields.forall(_.name != $(outputCol)), - s"Output column ${$(outputCol)} already exists.") - val attr = NominalAttribute.defaultAttr.withName($(outputCol)) - val outputFields = inputFields :+ attr.toStructField() + val (inputColNames, outputColNames) = getInOutCols + val existingFields = schema.fields + var outputFields = existingFields + inputColNames.zip(outputColNames).foreach { case (inputColName, outputColName) => + SchemaUtils.checkNumericType(schema, inputColName) + require(existingFields.forall(_.name != outputColName), + s"Output column ${outputColName} already exists.") + val attr = NominalAttribute.defaultAttr.withName(outputColName) + outputFields :+= attr.toStructField() + } StructType(outputFields) } @Since("2.0.0") override def fit(dataset: Dataset[_]): Bucketizer = { transformSchema(dataset.schema, logging = true) - val splits = dataset.stat.approxQuantile($(inputCol), - (0.0 to 1.0 by 1.0/$(numBuckets)).toArray, $(relativeError)) + val bucketizer = new Bucketizer(uid).setHandleInvalid($(handleInvalid)) + if (isSet(inputCols)) { + val splitsArray = if (isSet(numBucketsArray)) { + val probArrayPerCol = $(numBucketsArray).map { numOfBuckets => + (0.0 to 1.0 by 1.0 / numOfBuckets).toArray + } + + val probabilityArray = probArrayPerCol.flatten.sorted.distinct + val splitsArrayRaw = dataset.stat.approxQuantile($(inputCols), + probabilityArray, $(relativeError)) + + 
splitsArrayRaw.zip(probArrayPerCol).map { case (splits, probs) => + val probSet = probs.toSet + val idxSet = probabilityArray.zipWithIndex.collect { + case (p, idx) if probSet(p) => + idx + }.toSet + splits.zipWithIndex.collect { + case (s, idx) if idxSet(idx) => + s + } + } + } else { + dataset.stat.approxQuantile($(inputCols), + (0.0 to 1.0 by 1.0 / $(numBuckets)).toArray, $(relativeError)) + } + bucketizer.setSplitsArray(splitsArray.map(getDistinctSplits)) + } else { + val splits = dataset.stat.approxQuantile($(inputCol), + (0.0 to 1.0 by 1.0 / $(numBuckets)).toArray, $(relativeError)) + bucketizer.setSplits(getDistinctSplits(splits)) + } + copyValues(bucketizer.setParent(this)) + } + + private def getDistinctSplits(splits: Array[Double]): Array[Double] = { splits(0) = Double.NegativeInfinity splits(splits.length - 1) = Double.PositiveInfinity - val distinctSplits = splits.distinct if (splits.length != distinctSplits.length) { log.warn(s"Some quantiles were identical. Bucketing to ${distinctSplits.length - 1}" + s" buckets as a result.") } - val bucketizer = new Bucketizer(uid) - .setSplits(distinctSplits.sorted) - .setHandleInvalid($(handleInvalid)) - copyValues(bucketizer.setParent(this)) + distinctSplits.sorted } @Since("1.6.0") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala index f219f775b2186..e9a75e931e6a8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.ml.feature import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.ml.Pipeline import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql._ @@ -146,4 +147,268 @@ class QuantileDiscretizerSuite val model = discretizer.fit(df) assert(model.hasParent) } + + test("Multiple Columns: Test observed number of buckets and their sizes match expected values") { + val spark = this.spark + import spark.implicits._ + + val datasetSize = 100000 + val numBuckets = 5 + val data1 = Array.range(1, 100001, 1).map(_.toDouble) + val data2 = Array.range(1, 200000, 2).map(_.toDouble) + val df = data1.zip(data2).toSeq.toDF("input1", "input2") + + val discretizer = new QuantileDiscretizer() + .setInputCols(Array("input1", "input2")) + .setOutputCols(Array("result1", "result2")) + .setNumBuckets(numBuckets) + val result = discretizer.fit(df).transform(df) + + val relativeError = discretizer.getRelativeError + val isGoodBucket = udf { + (size: Int) => math.abs( size - (datasetSize / numBuckets)) <= (relativeError * datasetSize) + } + + for (i <- 1 to 2) { + val observedNumBuckets = result.select("result" + i).distinct.count + assert(observedNumBuckets === numBuckets, + "Observed number of buckets does not equal expected number of buckets.") + + val numGoodBuckets = result.groupBy("result" + i).count.filter(isGoodBucket($"count")).count + assert(numGoodBuckets === numBuckets, + "Bucket sizes are not within expected relative error tolerance.") + } + } + + test("Multiple Columns: Test on data with high proportion of duplicated values") { + val spark = this.spark + import spark.implicits._ + + val numBuckets = 5 + val expectedNumBucket = 3 + val data1 = Array(1.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 1.0, 3.0) + val data2 = Array(1.0, 2.0, 3.0, 1.0, 1.0, 
1.0, 1.0, 3.0, 2.0, 3.0, 1.0, 2.0) + val df = data1.zip(data2).toSeq.toDF("input1", "input2") + val discretizer = new QuantileDiscretizer() + .setInputCols(Array("input1", "input2")) + .setOutputCols(Array("result1", "result2")) + .setNumBuckets(numBuckets) + val result = discretizer.fit(df).transform(df) + for (i <- 1 to 2) { + val observedNumBuckets = result.select("result" + i).distinct.count + assert(observedNumBuckets == expectedNumBucket, + s"Observed number of buckets are not correct." + + s" Expected $expectedNumBucket but found ($observedNumBuckets") + } + } + + test("Multiple Columns: Test transform on data with NaN value") { + val spark = this.spark + import spark.implicits._ + + val numBuckets = 3 + val validData1 = Array(-0.9, -0.5, -0.3, 0.0, 0.2, 0.5, 0.9, Double.NaN, Double.NaN, Double.NaN) + val expectedKeep1 = Array(0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0) + val expectedSkip1 = Array(0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0) + val validData2 = Array(0.2, -0.1, 0.3, 0.0, 0.1, 0.3, 0.5, Double.NaN, Double.NaN, Double.NaN) + val expectedKeep2 = Array(1.0, 0.0, 2.0, 0.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0) + val expectedSkip2 = Array(1.0, 0.0, 2.0, 0.0, 1.0, 2.0, 2.0) + + val discretizer = new QuantileDiscretizer() + .setInputCols(Array("input1", "input2")) + .setOutputCols(Array("result1", "result2")) + .setNumBuckets(numBuckets) + + withClue("QuantileDiscretizer with handleInvalid=error should throw exception for NaN values") { + val dataFrame: DataFrame = validData1.zip(validData2).toSeq.toDF("input1", "input2") + intercept[SparkException] { + discretizer.fit(dataFrame).transform(dataFrame).collect() + } + } + + List(("keep", expectedKeep1, expectedKeep2), ("skip", expectedSkip1, expectedSkip2)).foreach { + case (u, v, w) => + discretizer.setHandleInvalid(u) + val dataFrame: DataFrame = validData1.zip(validData2).zip(v).zip(w).map { + case (((a, b), c), d) => (a, b, c, d) + }.toSeq.toDF("input1", "input2", "expected1", "expected2") + val result = discretizer.fit(dataFrame).transform(dataFrame) + result.select("result1", "expected1", "result2", "expected2").collect().foreach { + case Row(x: Double, y: Double, z: Double, w: Double) => + assert(x === y && w === z) + } + } + } + + test("Multiple Columns: Test numBucketsArray") { + val spark = this.spark + import spark.implicits._ + + val numBucketsArray: Array[Int] = Array(2, 5, 10) + val data1 = Array.range(1, 21, 1).map(_.toDouble) + val expected1 = Array (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0) + val data2 = Array.range(1, 40, 2).map(_.toDouble) + val expected2 = Array (0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, + 2.0, 2.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0) + val data3 = Array.range(1, 60, 3).map(_.toDouble) + val expected3 = Array (0.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 4.0, 4.0, 5.0, + 5.0, 5.0, 6.0, 6.0, 7.0, 8.0, 8.0, 9.0, 9.0, 9.0) + val data = (0 until 20).map { idx => + (data1(idx), data2(idx), data3(idx), expected1(idx), expected2(idx), expected3(idx)) + } + val df = + data.toDF("input1", "input2", "input3", "expected1", "expected2", "expected3") + + val discretizer = new QuantileDiscretizer() + .setInputCols(Array("input1", "input2", "input3")) + .setOutputCols(Array("result1", "result2", "result3")) + .setNumBucketsArray(numBucketsArray) + + discretizer.fit(df).transform(df). 
+ select("result1", "expected1", "result2", "expected2", "result3", "expected3") + .collect().foreach { + case Row(r1: Double, e1: Double, r2: Double, e2: Double, r3: Double, e3: Double) => + assert(r1 === e1, + s"The result value is not correct after bucketing. Expected $e1 but found $r1") + assert(r2 === e2, + s"The result value is not correct after bucketing. Expected $e2 but found $r2") + assert(r3 === e3, + s"The result value is not correct after bucketing. Expected $e3 but found $r3") + } + } + + test("Multiple Columns: Compare single/multiple column(s) QuantileDiscretizer in pipeline") { + val spark = this.spark + import spark.implicits._ + + val numBucketsArray: Array[Int] = Array(2, 5, 10) + val data1 = Array.range(1, 21, 1).map(_.toDouble) + val data2 = Array.range(1, 40, 2).map(_.toDouble) + val data3 = Array.range(1, 60, 3).map(_.toDouble) + val data = (0 until 20).map { idx => + (data1(idx), data2(idx), data3(idx)) + } + val df = + data.toDF("input1", "input2", "input3") + + val multiColsDiscretizer = new QuantileDiscretizer() + .setInputCols(Array("input1", "input2", "input3")) + .setOutputCols(Array("result1", "result2", "result3")) + .setNumBucketsArray(numBucketsArray) + val plForMultiCols = new Pipeline() + .setStages(Array(multiColsDiscretizer)) + .fit(df) + + val discretizerForCol1 = new QuantileDiscretizer() + .setInputCol("input1") + .setOutputCol("result1") + .setNumBuckets(numBucketsArray(0)) + + val discretizerForCol2 = new QuantileDiscretizer() + .setInputCol("input2") + .setOutputCol("result2") + .setNumBuckets(numBucketsArray(1)) + + val discretizerForCol3 = new QuantileDiscretizer() + .setInputCol("input3") + .setOutputCol("result3") + .setNumBuckets(numBucketsArray(2)) + + val plForSingleCol = new Pipeline() + .setStages(Array(discretizerForCol1, discretizerForCol2, discretizerForCol3)) + .fit(df) + + val resultForMultiCols = plForMultiCols.transform(df) + .select("result1", "result2", "result3") + .collect() + + val resultForSingleCol = plForSingleCol.transform(df) + .select("result1", "result2", "result3") + .collect() + + resultForSingleCol.zip(resultForMultiCols).foreach { + case (rowForSingle, rowForMultiCols) => + assert(rowForSingle.getDouble(0) == rowForMultiCols.getDouble(0) && + rowForSingle.getDouble(1) == rowForMultiCols.getDouble(1) && + rowForSingle.getDouble(2) == rowForMultiCols.getDouble(2)) + } + } + + test("Multiple Columns: Comparing setting numBuckets with setting numBucketsArray " + + "explicitly with identical values") { + val spark = this.spark + import spark.implicits._ + + val data1 = Array.range(1, 21, 1).map(_.toDouble) + val data2 = Array.range(1, 40, 2).map(_.toDouble) + val data3 = Array.range(1, 60, 3).map(_.toDouble) + val data = (0 until 20).map { idx => + (data1(idx), data2(idx), data3(idx)) + } + val df = + data.toDF("input1", "input2", "input3") + + val discretizerSingleNumBuckets = new QuantileDiscretizer() + .setInputCols(Array("input1", "input2", "input3")) + .setOutputCols(Array("result1", "result2", "result3")) + .setNumBuckets(10) + + val discretizerNumBucketsArray = new QuantileDiscretizer() + .setInputCols(Array("input1", "input2", "input3")) + .setOutputCols(Array("result1", "result2", "result3")) + .setNumBucketsArray(Array(10, 10, 10)) + + val result1 = discretizerSingleNumBuckets.fit(df).transform(df) + .select("result1", "result2", "result3") + .collect() + val result2 = discretizerNumBucketsArray.fit(df).transform(df) + .select("result1", "result2", "result3") + .collect() + + result1.zip(result2).foreach { + 
case (row1, row2) => + assert(row1.getDouble(0) == row2.getDouble(0) && + row1.getDouble(1) == row2.getDouble(1) && + row1.getDouble(2) == row2.getDouble(2)) + } + } + + test("Multiple Columns: read/write") { + val discretizer = new QuantileDiscretizer() + .setInputCols(Array("input1", "input2")) + .setOutputCols(Array("result1", "result2")) + .setNumBucketsArray(Array(5, 10)) + testDefaultReadWrite(discretizer) + } + + test("Multiple Columns: Both inputCol and inputCols are set") { + val spark = this.spark + import spark.implicits._ + val discretizer = new QuantileDiscretizer() + .setInputCol("input") + .setOutputCol("result") + .setNumBuckets(3) + .setInputCols(Array("input1", "input2")) + val df = sc.parallelize(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)) + .map(Tuple1.apply).toDF("input") + // When both inputCol and inputCols are set, we throw Exception. + intercept[IllegalArgumentException] { + discretizer.fit(df) + } + } + + test("Multiple Columns: Mismatched sizes of inputCols / outputCols") { + val spark = this.spark + import spark.implicits._ + val discretizer = new QuantileDiscretizer() + .setInputCols(Array("input")) + .setOutputCols(Array("result1", "result2")) + .setNumBuckets(3) + val df = sc.parallelize(Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)) + .map(Tuple1.apply).toDF("input") + intercept[IllegalArgumentException] { + discretizer.fit(df) + } + } } From 028ee40165315337e2a349b19731764d64e4f51d Mon Sep 17 00:00:00 2001 From: Nick Pentreath Date: Sun, 31 Dec 2017 14:51:38 +0200 Subject: [PATCH 239/356] [SPARK-22801][ML][PYSPARK] Allow FeatureHasher to treat numeric columns as categorical Previously, `FeatureHasher` always treats numeric type columns as numbers and never as categorical features. It is quite common to have categorical features represented as numbers or codes in data sources. In order to hash these features as categorical, users must first explicitly convert them to strings which is cumbersome. Add a new param `categoricalCols` which specifies the numeric columns that should be treated as categorical features. ## How was this patch tested? New unit tests. Author: Nick Pentreath Closes #19991 from MLnick/hasher-num-cat. --- .../spark/ml/feature/FeatureHasher.scala | 38 +++++++++++++++---- .../spark/ml/feature/FeatureHasherSuite.scala | 25 ++++++++++++ python/pyspark/ml/feature.py | 34 +++++++++++++---- 3 files changed, 83 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala index 4615daed20fb1..a918dd4c075da 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.Vectors -import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} +import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, StringArrayParam} import org.apache.spark.ml.param.shared.{HasInputCols, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.feature.{HashingTF => OldHashingTF} @@ -40,9 +40,9 @@ import org.apache.spark.util.collection.OpenHashMap * The [[FeatureHasher]] transformer operates on multiple columns. 
Each column may contain either * numeric or categorical features. Behavior and handling of column data types is as follows: * -Numeric columns: For numeric features, the hash value of the column name is used to map the - * feature value to its index in the feature vector. Numeric features are never - * treated as categorical, even when they are integers. You must explicitly - * convert numeric columns containing categorical features to strings first. + * feature value to its index in the feature vector. By default, numeric features + * are not treated as categorical (even when they are integers). To treat them + * as categorical, specify the relevant columns in `categoricalCols`. * -String columns: For categorical features, the hash value of the string "column_name=value" * is used to map to the vector index, with an indicator value of `1.0`. * Thus, categorical features are "one-hot" encoded @@ -86,6 +86,17 @@ class FeatureHasher(@Since("2.3.0") override val uid: String) extends Transforme @Since("2.3.0") def this() = this(Identifiable.randomUID("featureHasher")) + /** + * Numeric columns to treat as categorical features. By default only string and boolean + * columns are treated as categorical, so this param can be used to explicitly specify the + * numerical columns to treat as categorical. Note, the relevant columns must also be set in + * `inputCols`. + * @group param + */ + @Since("2.3.0") + val categoricalCols = new StringArrayParam(this, "categoricalCols", + "numeric columns to treat as categorical") + /** * Number of features. Should be greater than 0. * (default = 2^18^) @@ -117,15 +128,28 @@ class FeatureHasher(@Since("2.3.0") override val uid: String) extends Transforme @Since("2.3.0") def setOutputCol(value: String): this.type = set(outputCol, value) + /** @group getParam */ + @Since("2.3.0") + def getCategoricalCols: Array[String] = $(categoricalCols) + + /** @group setParam */ + @Since("2.3.0") + def setCategoricalCols(value: Array[String]): this.type = set(categoricalCols, value) + @Since("2.3.0") override def transform(dataset: Dataset[_]): DataFrame = { val hashFunc: Any => Int = OldHashingTF.murmur3Hash val n = $(numFeatures) val localInputCols = $(inputCols) + val catCols = if (isSet(categoricalCols)) { + $(categoricalCols).toSet + } else { + Set[String]() + } val outputSchema = transformSchema(dataset.schema) val realFields = outputSchema.fields.filter { f => - f.dataType.isInstanceOf[NumericType] + f.dataType.isInstanceOf[NumericType] && !catCols.contains(f.name) }.map(_.name).toSet def getDouble(x: Any): Double = { @@ -149,8 +173,8 @@ class FeatureHasher(@Since("2.3.0") override val uid: String) extends Transforme val hash = hashFunc(colName) (hash, value) } else { - // string and boolean values are treated as categorical, with an indicator value of 1.0 - // and vector index based on hash of "column_name=value" + // string, boolean and numeric values that are in catCols are treated as categorical, + // with an indicator value of 1.0 and vector index based on hash of "column_name=value" val value = row.get(fieldIndex).toString val fieldName = s"$colName=$value" val hash = hashFunc(fieldName) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala index 407371a826666..3fc3cbb62d5b5 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/FeatureHasherSuite.scala @@ -78,6 +78,31 @@ 
class FeatureHasherSuite extends SparkFunSuite assert(features.zip(expected).forall { case (e, a) => e ~== a absTol 1e-14 }) } + test("setting explicit numerical columns to treat as categorical") { + val df = Seq( + (2.0, 1, "foo"), + (3.0, 2, "bar") + ).toDF("real", "int", "string") + + val n = 100 + val hasher = new FeatureHasher() + .setInputCols("real", "int", "string") + .setCategoricalCols(Array("real")) + .setOutputCol("features") + .setNumFeatures(n) + val output = hasher.transform(df) + + val features = output.select("features").as[Vector].collect() + // Assume perfect hash on field names + def idx: Any => Int = murmur3FeatureIdx(n) + // check expected indices + val expected = Seq( + Vectors.sparse(n, Seq((idx("real=2.0"), 1.0), (idx("int"), 1.0), (idx("string=foo"), 1.0))), + Vectors.sparse(n, Seq((idx("real=3.0"), 1.0), (idx("int"), 2.0), (idx("string=bar"), 1.0))) + ) + assert(features.zip(expected).forall { case (e, a) => e ~== a absTol 1e-14 }) + } + test("hashing works for all numeric types") { val df = Seq(5.0, 10.0, 15.0).toDF("real") diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 5094324e5c1fe..13bf95cce40be 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -714,9 +714,9 @@ class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, * Numeric columns: For numeric features, the hash value of the column name is used to map the - feature value to its index in the feature vector. Numeric features are never - treated as categorical, even when they are integers. You must explicitly - convert numeric columns containing categorical features to strings first. + feature value to its index in the feature vector. By default, numeric features + are not treated as categorical (even when they are integers). To treat them + as categorical, specify the relevant columns in `categoricalCols`. * String columns: For categorical features, the hash value of the string "column_name=value" @@ -741,6 +741,8 @@ class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, >>> hasher = FeatureHasher(inputCols=cols, outputCol="features") >>> hasher.transform(df).head().features SparseVector(262144, {51871: 1.0, 63643: 1.0, 174475: 2.0, 253195: 1.0}) + >>> hasher.setCategoricalCols(["real"]).transform(df).head().features + SparseVector(262144, {51871: 1.0, 63643: 1.0, 171257: 1.0, 253195: 1.0}) >>> hasherPath = temp_path + "/hasher" >>> hasher.save(hasherPath) >>> loadedHasher = FeatureHasher.load(hasherPath) @@ -752,10 +754,14 @@ class FeatureHasher(JavaTransformer, HasInputCols, HasOutputCol, HasNumFeatures, .. 
versionadded:: 2.3.0 """ + categoricalCols = Param(Params._dummy(), "categoricalCols", + "numeric columns to treat as categorical", + typeConverter=TypeConverters.toListString) + @keyword_only - def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None): + def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None): """ - __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None) + __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None) """ super(FeatureHasher, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.FeatureHasher", self.uid) @@ -765,14 +771,28 @@ def __init__(self, numFeatures=1 << 18, inputCols=None, outputCol=None): @keyword_only @since("2.3.0") - def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None): + def setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None): """ - setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None) + setParams(self, numFeatures=1 << 18, inputCols=None, outputCol=None, categoricalCols=None) Sets params for this FeatureHasher. """ kwargs = self._input_kwargs return self._set(**kwargs) + @since("2.3.0") + def setCategoricalCols(self, value): + """ + Sets the value of :py:attr:`categoricalCols`. + """ + return self._set(categoricalCols=value) + + @since("2.3.0") + def getCategoricalCols(self): + """ + Gets the value of binary or its default value. + """ + return self.getOrDefault(self.categoricalCols) + @inherit_doc class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, JavaMLReadable, From 5955a2d0fb8f84560925d72afe36055089f44a05 Mon Sep 17 00:00:00 2001 From: Jirka Kremser Date: Sun, 31 Dec 2017 15:38:10 -0600 Subject: [PATCH 240/356] [MINOR][DOCS] s/It take/It takes/g ## What changes were proposed in this pull request? Fixing three small typos in the docs, in particular: It take a `RDD` -> It takes an `RDD` (twice) It take an `JavaRDD` -> It takes a `JavaRDD` I didn't create any Jira issue for this minor thing, I hope it's ok. ## How was this patch tested? visually by clicking on 'preview' Author: Jirka Kremser Closes #20108 from Jiri-Kremser/docs-typo. --- docs/mllib-frequent-pattern-mining.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md index c9cd7cc85e754..0d3192c6b1d9c 100644 --- a/docs/mllib-frequent-pattern-mining.md +++ b/docs/mllib-frequent-pattern-mining.md @@ -41,7 +41,7 @@ We refer users to the papers for more details. [`FPGrowth`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowth) implements the FP-growth algorithm. -It take a `RDD` of transactions, where each transaction is an `Array` of items of a generic type. +It takes an `RDD` of transactions, where each transaction is an `Array` of items of a generic type. Calling `FPGrowth.run` with transactions returns an [`FPGrowthModel`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowthModel) that stores the frequent itemsets with their frequencies. The following @@ -60,7 +60,7 @@ Refer to the [`FPGrowth` Scala docs](api/scala/index.html#org.apache.spark.mllib [`FPGrowth`](api/java/org/apache/spark/mllib/fpm/FPGrowth.html) implements the FP-growth algorithm. -It take an `JavaRDD` of transactions, where each transaction is an `Iterable` of items of a generic type. +It takes a `JavaRDD` of transactions, where each transaction is an `Iterable` of items of a generic type. 
Calling `FPGrowth.run` with transactions returns an [`FPGrowthModel`](api/java/org/apache/spark/mllib/fpm/FPGrowthModel.html) that stores the frequent itemsets with their frequencies. The following @@ -79,7 +79,7 @@ Refer to the [`FPGrowth` Java docs](api/java/org/apache/spark/mllib/fpm/FPGrowth [`FPGrowth`](api/python/pyspark.mllib.html#pyspark.mllib.fpm.FPGrowth) implements the FP-growth algorithm. -It take an `RDD` of transactions, where each transaction is an `List` of items of a generic type. +It takes an `RDD` of transactions, where each transaction is an `List` of items of a generic type. Calling `FPGrowth.train` with transactions returns an [`FPGrowthModel`](api/python/pyspark.mllib.html#pyspark.mllib.fpm.FPGrowthModel) that stores the frequent itemsets with their frequencies. From 994065d891a23ed89a09b3f95bc3f1f986793e0d Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sun, 31 Dec 2017 15:28:59 -0800 Subject: [PATCH 241/356] [SPARK-13030][ML] Create OneHotEncoderEstimator for OneHotEncoder as Estimator ## What changes were proposed in this pull request? This patch adds a new class `OneHotEncoderEstimator` which extends `Estimator`. The `fit` method returns `OneHotEncoderModel`. Common methods between existing `OneHotEncoder` and new `OneHotEncoderEstimator`, such as transforming schema, are extracted and put into `OneHotEncoderCommon` to reduce code duplication. ### Multi-column support `OneHotEncoderEstimator` adds simpler multi-column support because it is new API and can be free from backward compatibility. ### handleInvalid Param support `OneHotEncoderEstimator` supports `handleInvalid` Param. It supports `error` and `keep`. ## How was this patch tested? Added new test suite `OneHotEncoderEstimatorSuite`. Author: Liang-Chi Hsieh Closes #19527 from viirya/SPARK-13030. --- .../spark/ml/feature/OneHotEncoder.scala | 83 +-- .../ml/feature/OneHotEncoderEstimator.scala | 522 ++++++++++++++++++ .../feature/OneHotEncoderEstimatorSuite.scala | 421 ++++++++++++++ 3 files changed, 960 insertions(+), 66 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala create mode 100644 mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderEstimatorSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index a669da183e2c8..5ab6c2dde667a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -41,8 +41,12 @@ import org.apache.spark.sql.types.{DoubleType, NumericType, StructType} * The output vectors are sparse. * * @see `StringIndexer` for converting categorical values into category indices + * @deprecated `OneHotEncoderEstimator` will be renamed `OneHotEncoder` and this `OneHotEncoder` + * will be removed in 3.0.0. 
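+ * Existing code should migrate to `OneHotEncoderEstimator`, which additionally supports
+ * encoding multiple columns at once and a `handleInvalid` Param (see the new class below).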
*/ @Since("1.4.0") +@deprecated("`OneHotEncoderEstimator` will be renamed `OneHotEncoder` and this `OneHotEncoder`" + + " will be removed in 3.0.0.", "2.3.0") class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { @@ -78,56 +82,16 @@ class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) e override def transformSchema(schema: StructType): StructType = { val inputColName = $(inputCol) val outputColName = $(outputCol) + val inputFields = schema.fields require(schema(inputColName).dataType.isInstanceOf[NumericType], s"Input column must be of type NumericType but got ${schema(inputColName).dataType}") - val inputFields = schema.fields require(!inputFields.exists(_.name == outputColName), s"Output column $outputColName already exists.") - val inputAttr = Attribute.fromStructField(schema(inputColName)) - val outputAttrNames: Option[Array[String]] = inputAttr match { - case nominal: NominalAttribute => - if (nominal.values.isDefined) { - nominal.values - } else if (nominal.numValues.isDefined) { - nominal.numValues.map(n => Array.tabulate(n)(_.toString)) - } else { - None - } - case binary: BinaryAttribute => - if (binary.values.isDefined) { - binary.values - } else { - Some(Array.tabulate(2)(_.toString)) - } - case _: NumericAttribute => - throw new RuntimeException( - s"The input column $inputColName cannot be numeric.") - case _ => - None // optimistic about unknown attributes - } - - val filteredOutputAttrNames = outputAttrNames.map { names => - if ($(dropLast)) { - require(names.length > 1, - s"The input column $inputColName should have at least two distinct values.") - names.dropRight(1) - } else { - names - } - } - - val outputAttrGroup = if (filteredOutputAttrNames.isDefined) { - val attrs: Array[Attribute] = filteredOutputAttrNames.get.map { name => - BinaryAttribute.defaultAttr.withName(name) - } - new AttributeGroup($(outputCol), attrs) - } else { - new AttributeGroup($(outputCol)) - } - - val outputFields = inputFields :+ outputAttrGroup.toStructField() + val outputField = OneHotEncoderCommon.transformOutputColumnSchema( + schema(inputColName), outputColName, $(dropLast)) + val outputFields = inputFields :+ outputField StructType(outputFields) } @@ -136,30 +100,17 @@ class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) e // schema transformation val inputColName = $(inputCol) val outputColName = $(outputCol) - val shouldDropLast = $(dropLast) - var outputAttrGroup = AttributeGroup.fromStructField( + + val outputAttrGroupFromSchema = AttributeGroup.fromStructField( transformSchema(dataset.schema)(outputColName)) - if (outputAttrGroup.size < 0) { - // If the number of attributes is unknown, we check the values from the input column. 
- val numAttrs = dataset.select(col(inputColName).cast(DoubleType)).rdd.map(_.getDouble(0)) - .treeAggregate(0.0)( - (m, x) => { - assert(x <= Int.MaxValue, - s"OneHotEncoder only supports up to ${Int.MaxValue} indices, but got $x") - assert(x >= 0.0 && x == x.toInt, - s"Values from column $inputColName must be indices, but got $x.") - math.max(m, x) - }, - (m0, m1) => { - math.max(m0, m1) - } - ).toInt + 1 - val outputAttrNames = Array.tabulate(numAttrs)(_.toString) - val filtered = if (shouldDropLast) outputAttrNames.dropRight(1) else outputAttrNames - val outputAttrs: Array[Attribute] = - filtered.map(name => BinaryAttribute.defaultAttr.withName(name)) - outputAttrGroup = new AttributeGroup(outputColName, outputAttrs) + + val outputAttrGroup = if (outputAttrGroupFromSchema.size < 0) { + OneHotEncoderCommon.getOutputAttrGroupFromData( + dataset, Seq(inputColName), Seq(outputColName), $(dropLast))(0) + } else { + outputAttrGroupFromSchema } + val metadata = outputAttrGroup.toMetadata() // data transformation diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala new file mode 100644 index 0000000000000..074622d41e28d --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala @@ -0,0 +1,522 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.hadoop.fs.Path + +import org.apache.spark.SparkException +import org.apache.spark.annotation.Since +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.attribute._ +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared.{HasHandleInvalid, HasInputCols, HasOutputCols} +import org.apache.spark.ml.util._ +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.expressions.UserDefinedFunction +import org.apache.spark.sql.functions.{col, lit, udf} +import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType} + +/** Private trait for params and common methods for OneHotEncoderEstimator and OneHotEncoderModel */ +private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid + with HasInputCols with HasOutputCols { + + /** + * Param for how to handle invalid data. + * Options are 'keep' (invalid data presented as an extra categorical feature) or + * 'error' (throw an error). 
+ * Default: "error" + * @group param + */ + @Since("2.3.0") + override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", + "How to handle invalid data " + + "Options are 'keep' (invalid data presented as an extra categorical feature) " + + "or error (throw an error).", + ParamValidators.inArray(OneHotEncoderEstimator.supportedHandleInvalids)) + + setDefault(handleInvalid, OneHotEncoderEstimator.ERROR_INVALID) + + /** + * Whether to drop the last category in the encoded vector (default: true) + * @group param + */ + @Since("2.3.0") + final val dropLast: BooleanParam = + new BooleanParam(this, "dropLast", "whether to drop the last category") + setDefault(dropLast -> true) + + /** @group getParam */ + @Since("2.3.0") + def getDropLast: Boolean = $(dropLast) + + protected def validateAndTransformSchema( + schema: StructType, dropLast: Boolean, keepInvalid: Boolean): StructType = { + val inputColNames = $(inputCols) + val outputColNames = $(outputCols) + val existingFields = schema.fields + + require(inputColNames.length == outputColNames.length, + s"The number of input columns ${inputColNames.length} must be the same as the number of " + + s"output columns ${outputColNames.length}.") + + // Input columns must be NumericType. + inputColNames.foreach(SchemaUtils.checkNumericType(schema, _)) + + // Prepares output columns with proper attributes by examining input columns. + val inputFields = $(inputCols).map(schema(_)) + + val outputFields = inputFields.zip(outputColNames).map { case (inputField, outputColName) => + OneHotEncoderCommon.transformOutputColumnSchema( + inputField, outputColName, dropLast, keepInvalid) + } + outputFields.foldLeft(schema) { case (newSchema, outputField) => + SchemaUtils.appendColumn(newSchema, outputField) + } + } +} + +/** + * A one-hot encoder that maps a column of category indices to a column of binary vectors, with + * at most a single one-value per row that indicates the input category index. + * For example with 5 categories, an input value of 2.0 would map to an output vector of + * `[0.0, 0.0, 1.0, 0.0]`. + * The last category is not included by default (configurable via `dropLast`), + * because it makes the vector entries sum up to one, and hence linearly dependent. + * So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`. + * + * @note This is different from scikit-learn's OneHotEncoder, which keeps all categories. + * The output vectors are sparse. + * + * When `handleInvalid` is configured to 'keep', an extra "category" indicating invalid values is + * added as last category. So when `dropLast` is true, invalid values are encoded as all-zeros + * vector. + * + * @note When encoding multi-column by using `inputCols` and `outputCols` params, input/output cols + * come in pairs, specified by the order in the arrays, and each pair is treated independently. 
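+ *
+ * A minimal usage sketch (the DataFrame `df` and the column names below are illustrative, not
+ * part of the API):
+ * {{{
+ *   val encoder = new OneHotEncoderEstimator()
+ *     .setInputCols(Array("categoryIndex1", "categoryIndex2"))
+ *     .setOutputCols(Array("categoryVec1", "categoryVec2"))
+ *   val model = encoder.fit(df)
+ *   val encoded = model.transform(df)
+ * }}}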
+ * + * @see `StringIndexer` for converting categorical values into category indices + */ +@Since("2.3.0") +class OneHotEncoderEstimator @Since("2.3.0") (@Since("2.3.0") override val uid: String) + extends Estimator[OneHotEncoderModel] with OneHotEncoderBase with DefaultParamsWritable { + + @Since("2.3.0") + def this() = this(Identifiable.randomUID("oneHotEncoder")) + + /** @group setParam */ + @Since("2.3.0") + def setInputCols(values: Array[String]): this.type = set(inputCols, values) + + /** @group setParam */ + @Since("2.3.0") + def setOutputCols(values: Array[String]): this.type = set(outputCols, values) + + /** @group setParam */ + @Since("2.3.0") + def setDropLast(value: Boolean): this.type = set(dropLast, value) + + /** @group setParam */ + @Since("2.3.0") + def setHandleInvalid(value: String): this.type = set(handleInvalid, value) + + @Since("2.3.0") + override def transformSchema(schema: StructType): StructType = { + val keepInvalid = $(handleInvalid) == OneHotEncoderEstimator.KEEP_INVALID + validateAndTransformSchema(schema, dropLast = $(dropLast), + keepInvalid = keepInvalid) + } + + @Since("2.3.0") + override def fit(dataset: Dataset[_]): OneHotEncoderModel = { + transformSchema(dataset.schema) + + // Compute the plain number of categories without `handleInvalid` and + // `dropLast` taken into account. + val transformedSchema = validateAndTransformSchema(dataset.schema, dropLast = false, + keepInvalid = false) + val categorySizes = new Array[Int]($(outputCols).length) + + val columnToScanIndices = $(outputCols).zipWithIndex.flatMap { case (outputColName, idx) => + val numOfAttrs = AttributeGroup.fromStructField( + transformedSchema(outputColName)).size + if (numOfAttrs < 0) { + Some(idx) + } else { + categorySizes(idx) = numOfAttrs + None + } + } + + // Some input columns don't have attributes or their attributes don't have necessary info. + // We need to scan the data to get the number of values for each column. + if (columnToScanIndices.length > 0) { + val inputColNames = columnToScanIndices.map($(inputCols)(_)) + val outputColNames = columnToScanIndices.map($(outputCols)(_)) + + // When fitting data, we want the plain number of categories without `handleInvalid` and + // `dropLast` taken into account. + val attrGroups = OneHotEncoderCommon.getOutputAttrGroupFromData( + dataset, inputColNames, outputColNames, dropLast = false) + attrGroups.zip(columnToScanIndices).foreach { case (attrGroup, idx) => + categorySizes(idx) = attrGroup.size + } + } + + val model = new OneHotEncoderModel(uid, categorySizes).setParent(this) + copyValues(model) + } + + @Since("2.3.0") + override def copy(extra: ParamMap): OneHotEncoderEstimator = defaultCopy(extra) +} + +@Since("2.3.0") +object OneHotEncoderEstimator extends DefaultParamsReadable[OneHotEncoderEstimator] { + + private[feature] val KEEP_INVALID: String = "keep" + private[feature] val ERROR_INVALID: String = "error" + private[feature] val supportedHandleInvalids: Array[String] = Array(KEEP_INVALID, ERROR_INVALID) + + @Since("2.3.0") + override def load(path: String): OneHotEncoderEstimator = super.load(path) +} + +@Since("2.3.0") +class OneHotEncoderModel private[ml] ( + @Since("2.3.0") override val uid: String, + @Since("2.3.0") val categorySizes: Array[Int]) + extends Model[OneHotEncoderModel] with OneHotEncoderBase with MLWritable { + + import OneHotEncoderModel._ + + // Returns the category size for a given index with `dropLast` and `handleInvalid` + // taken into account. 
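+  // For example, with 3 plain categories: dropLast=false with handleInvalid="keep" gives 4,
+  // dropLast=true with handleInvalid="error" gives 2, and the remaining two combinations give 3.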
+ private def configedCategorySize(orgCategorySize: Int, idx: Int): Int = { + val dropLast = getDropLast + val keepInvalid = getHandleInvalid == OneHotEncoderEstimator.KEEP_INVALID + + if (!dropLast && keepInvalid) { + // When `handleInvalid` is "keep", an extra category is added as last category + // for invalid data. + orgCategorySize + 1 + } else if (dropLast && !keepInvalid) { + // When `dropLast` is true, the last category is removed. + orgCategorySize - 1 + } else { + // When `dropLast` is true and `handleInvalid` is "keep", the extra category for invalid + // data is removed. Thus, it is the same as the plain number of categories. + orgCategorySize + } + } + + private def encoder: UserDefinedFunction = { + val oneValue = Array(1.0) + val emptyValues = Array.empty[Double] + val emptyIndices = Array.empty[Int] + val dropLast = getDropLast + val handleInvalid = getHandleInvalid + val keepInvalid = handleInvalid == OneHotEncoderEstimator.KEEP_INVALID + + // The udf performed on input data. The first parameter is the input value. The second + // parameter is the index of input. + udf { (label: Double, idx: Int) => + val plainNumCategories = categorySizes(idx) + val size = configedCategorySize(plainNumCategories, idx) + + if (label < 0) { + throw new SparkException(s"Negative value: $label. Input can't be negative.") + } else if (label == size && dropLast && !keepInvalid) { + // When `dropLast` is true and `handleInvalid` is not "keep", + // the last category is removed. + Vectors.sparse(size, emptyIndices, emptyValues) + } else if (label >= plainNumCategories && keepInvalid) { + // When `handleInvalid` is "keep", encodes invalid data to last category (and removed + // if `dropLast` is true) + if (dropLast) { + Vectors.sparse(size, emptyIndices, emptyValues) + } else { + Vectors.sparse(size, Array(size - 1), oneValue) + } + } else if (label < plainNumCategories) { + Vectors.sparse(size, Array(label.toInt), oneValue) + } else { + assert(handleInvalid == OneHotEncoderEstimator.ERROR_INVALID) + throw new SparkException(s"Unseen value: $label. To handle unseen values, " + + s"set Param handleInvalid to ${OneHotEncoderEstimator.KEEP_INVALID}.") + } + } + } + + /** @group setParam */ + @Since("2.3.0") + def setInputCols(values: Array[String]): this.type = set(inputCols, values) + + /** @group setParam */ + @Since("2.3.0") + def setOutputCols(values: Array[String]): this.type = set(outputCols, values) + + /** @group setParam */ + @Since("2.3.0") + def setDropLast(value: Boolean): this.type = set(dropLast, value) + + /** @group setParam */ + @Since("2.3.0") + def setHandleInvalid(value: String): this.type = set(handleInvalid, value) + + @Since("2.3.0") + override def transformSchema(schema: StructType): StructType = { + val inputColNames = $(inputCols) + val outputColNames = $(outputCols) + + require(inputColNames.length == categorySizes.length, + s"The number of input columns ${inputColNames.length} must be the same as the number of " + + s"features ${categorySizes.length} during fitting.") + + val keepInvalid = $(handleInvalid) == OneHotEncoderEstimator.KEEP_INVALID + val transformedSchema = validateAndTransformSchema(schema, dropLast = $(dropLast), + keepInvalid = keepInvalid) + verifyNumOfValues(transformedSchema) + } + + /** + * If the metadata of input columns also specifies the number of categories, we need to + * compare with expected category number with `handleInvalid` and `dropLast` taken into + * account. Mismatched numbers will cause exception. 
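+   * For example, a model fitted on three categories with the default `dropLast` and
+   * `handleInvalid` accepts only an output attribute group of size two here.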
+ */ + private def verifyNumOfValues(schema: StructType): StructType = { + $(outputCols).zipWithIndex.foreach { case (outputColName, idx) => + val inputColName = $(inputCols)(idx) + val attrGroup = AttributeGroup.fromStructField(schema(outputColName)) + + // If the input metadata specifies number of category for output column, + // comparing with expected category number with `handleInvalid` and + // `dropLast` taken into account. + if (attrGroup.attributes.nonEmpty) { + val numCategories = configedCategorySize(categorySizes(idx), idx) + require(attrGroup.size == numCategories, "OneHotEncoderModel expected " + + s"$numCategories categorical values for input column ${inputColName}, " + + s"but the input column had metadata specifying ${attrGroup.size} values.") + } + } + schema + } + + @Since("2.3.0") + override def transform(dataset: Dataset[_]): DataFrame = { + val transformedSchema = transformSchema(dataset.schema, logging = true) + val keepInvalid = $(handleInvalid) == OneHotEncoderEstimator.KEEP_INVALID + + val encodedColumns = (0 until $(inputCols).length).map { idx => + val inputColName = $(inputCols)(idx) + val outputColName = $(outputCols)(idx) + + val outputAttrGroupFromSchema = + AttributeGroup.fromStructField(transformedSchema(outputColName)) + + val metadata = if (outputAttrGroupFromSchema.size < 0) { + OneHotEncoderCommon.createAttrGroupForAttrNames(outputColName, + categorySizes(idx), $(dropLast), keepInvalid).toMetadata() + } else { + outputAttrGroupFromSchema.toMetadata() + } + + encoder(col(inputColName).cast(DoubleType), lit(idx)) + .as(outputColName, metadata) + } + dataset.withColumns($(outputCols), encodedColumns) + } + + @Since("2.3.0") + override def copy(extra: ParamMap): OneHotEncoderModel = { + val copied = new OneHotEncoderModel(uid, categorySizes) + copyValues(copied, extra).setParent(parent) + } + + @Since("2.3.0") + override def write: MLWriter = new OneHotEncoderModelWriter(this) +} + +@Since("2.3.0") +object OneHotEncoderModel extends MLReadable[OneHotEncoderModel] { + + private[OneHotEncoderModel] + class OneHotEncoderModelWriter(instance: OneHotEncoderModel) extends MLWriter { + + private case class Data(categorySizes: Array[Int]) + + override protected def saveImpl(path: String): Unit = { + DefaultParamsWriter.saveMetadata(instance, path, sc) + val data = Data(instance.categorySizes) + val dataPath = new Path(path, "data").toString + sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) + } + } + + private class OneHotEncoderModelReader extends MLReader[OneHotEncoderModel] { + + private val className = classOf[OneHotEncoderModel].getName + + override def load(path: String): OneHotEncoderModel = { + val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + val dataPath = new Path(path, "data").toString + val data = sparkSession.read.parquet(dataPath) + .select("categorySizes") + .head() + val categorySizes = data.getAs[Seq[Int]](0).toArray + val model = new OneHotEncoderModel(metadata.uid, categorySizes) + DefaultParamsReader.getAndSetParams(model, metadata) + model + } + } + + @Since("2.3.0") + override def read: MLReader[OneHotEncoderModel] = new OneHotEncoderModelReader + + @Since("2.3.0") + override def load(path: String): OneHotEncoderModel = super.load(path) +} + +/** + * Provides some helper methods used by both `OneHotEncoder` and `OneHotEncoderEstimator`. 
+ */ +private[feature] object OneHotEncoderCommon { + + private def genOutputAttrNames(inputCol: StructField): Option[Array[String]] = { + val inputAttr = Attribute.fromStructField(inputCol) + inputAttr match { + case nominal: NominalAttribute => + if (nominal.values.isDefined) { + nominal.values + } else if (nominal.numValues.isDefined) { + nominal.numValues.map(n => Array.tabulate(n)(_.toString)) + } else { + None + } + case binary: BinaryAttribute => + if (binary.values.isDefined) { + binary.values + } else { + Some(Array.tabulate(2)(_.toString)) + } + case _: NumericAttribute => + throw new RuntimeException( + s"The input column ${inputCol.name} cannot be continuous-value.") + case _ => + None // optimistic about unknown attributes + } + } + + /** Creates an `AttributeGroup` filled by the `BinaryAttribute` named as required. */ + private def genOutputAttrGroup( + outputAttrNames: Option[Array[String]], + outputColName: String): AttributeGroup = { + outputAttrNames.map { attrNames => + val attrs: Array[Attribute] = attrNames.map { name => + BinaryAttribute.defaultAttr.withName(name) + } + new AttributeGroup(outputColName, attrs) + }.getOrElse{ + new AttributeGroup(outputColName) + } + } + + /** + * Prepares the `StructField` with proper metadata for `OneHotEncoder`'s output column. + */ + def transformOutputColumnSchema( + inputCol: StructField, + outputColName: String, + dropLast: Boolean, + keepInvalid: Boolean = false): StructField = { + val outputAttrNames = genOutputAttrNames(inputCol) + val filteredOutputAttrNames = outputAttrNames.map { names => + if (dropLast && !keepInvalid) { + require(names.length > 1, + s"The input column ${inputCol.name} should have at least two distinct values.") + names.dropRight(1) + } else if (!dropLast && keepInvalid) { + names ++ Seq("invalidValues") + } else { + names + } + } + + genOutputAttrGroup(filteredOutputAttrNames, outputColName).toStructField() + } + + /** + * This method is called when we want to generate `AttributeGroup` from actual data for + * one-hot encoder. + */ + def getOutputAttrGroupFromData( + dataset: Dataset[_], + inputColNames: Seq[String], + outputColNames: Seq[String], + dropLast: Boolean): Seq[AttributeGroup] = { + // The RDD approach has advantage of early-stop if any values are invalid. It seems that + // DataFrame ops don't have equivalent functions. + val columns = inputColNames.map { inputColName => + col(inputColName).cast(DoubleType) + } + val numOfColumns = columns.length + + val numAttrsArray = dataset.select(columns: _*).rdd.map { row => + (0 until numOfColumns).map(idx => row.getDouble(idx)).toArray + }.treeAggregate(new Array[Double](numOfColumns))( + (maxValues, curValues) => { + (0 until numOfColumns).foreach { idx => + val x = curValues(idx) + assert(x <= Int.MaxValue, + s"OneHotEncoder only supports up to ${Int.MaxValue} indices, but got $x.") + assert(x >= 0.0 && x == x.toInt, + s"Values from column ${inputColNames(idx)} must be indices, but got $x.") + maxValues(idx) = math.max(maxValues(idx), x) + } + maxValues + }, + (m0, m1) => { + (0 until numOfColumns).foreach { idx => + m0(idx) = math.max(m0(idx), m1(idx)) + } + m0 + } + ).map(_.toInt + 1) + + outputColNames.zip(numAttrsArray).map { case (outputColName, numAttrs) => + createAttrGroupForAttrNames(outputColName, numAttrs, dropLast, keepInvalid = false) + } + } + + /** Creates an `AttributeGroup` with the required number of `BinaryAttribute`. 
*/ + def createAttrGroupForAttrNames( + outputColName: String, + numAttrs: Int, + dropLast: Boolean, + keepInvalid: Boolean): AttributeGroup = { + val outputAttrNames = Array.tabulate(numAttrs)(_.toString) + val filtered = if (dropLast && !keepInvalid) { + outputAttrNames.dropRight(1) + } else if (!dropLast && keepInvalid) { + outputAttrNames ++ Seq("invalidValues") + } else { + outputAttrNames + } + genOutputAttrGroup(Some(filtered), outputColName) + } +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderEstimatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderEstimatorSuite.scala new file mode 100644 index 0000000000000..1d3f845586426 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderEstimatorSuite.scala @@ -0,0 +1,421 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.feature + +import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.ml.attribute.{AttributeGroup, BinaryAttribute, NominalAttribute} +import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} +import org.apache.spark.ml.param.ParamsSuite +import org.apache.spark.ml.util.DefaultReadWriteTest +import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.types._ + +class OneHotEncoderEstimatorSuite + extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { + + import testImplicits._ + + test("params") { + ParamsSuite.checkParams(new OneHotEncoderEstimator) + } + + test("OneHotEncoderEstimator dropLast = false") { + val data = Seq( + Row(0.0, Vectors.sparse(3, Seq((0, 1.0)))), + Row(1.0, Vectors.sparse(3, Seq((1, 1.0)))), + Row(2.0, Vectors.sparse(3, Seq((2, 1.0)))), + Row(0.0, Vectors.sparse(3, Seq((0, 1.0)))), + Row(0.0, Vectors.sparse(3, Seq((0, 1.0)))), + Row(2.0, Vectors.sparse(3, Seq((2, 1.0))))) + + val schema = StructType(Array( + StructField("input", DoubleType), + StructField("expected", new VectorUDT))) + + val df = spark.createDataFrame(sc.parallelize(data), schema) + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("input")) + .setOutputCols(Array("output")) + assert(encoder.getDropLast === true) + encoder.setDropLast(false) + assert(encoder.getDropLast === false) + + val model = encoder.fit(df) + val encoded = model.transform(df) + encoded.select("output", "expected").rdd.map { r => + (r.getAs[Vector](0), r.getAs[Vector](1)) + }.collect().foreach { case (vec1, vec2) => + assert(vec1 === vec2) + } + } + + test("OneHotEncoderEstimator dropLast = true") { + val data = Seq( + Row(0.0, Vectors.sparse(2, Seq((0, 1.0)))), + Row(1.0, Vectors.sparse(2, Seq((1, 1.0)))), + Row(2.0, 
Vectors.sparse(2, Seq())), + Row(0.0, Vectors.sparse(2, Seq((0, 1.0)))), + Row(0.0, Vectors.sparse(2, Seq((0, 1.0)))), + Row(2.0, Vectors.sparse(2, Seq()))) + + val schema = StructType(Array( + StructField("input", DoubleType), + StructField("expected", new VectorUDT))) + + val df = spark.createDataFrame(sc.parallelize(data), schema) + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("input")) + .setOutputCols(Array("output")) + + val model = encoder.fit(df) + val encoded = model.transform(df) + encoded.select("output", "expected").rdd.map { r => + (r.getAs[Vector](0), r.getAs[Vector](1)) + }.collect().foreach { case (vec1, vec2) => + assert(vec1 === vec2) + } + } + + test("input column with ML attribute") { + val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") + val df = Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply).toDF("size") + .select(col("size").as("size", attr.toMetadata())) + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("size")) + .setOutputCols(Array("encoded")) + val model = encoder.fit(df) + val output = model.transform(df) + val group = AttributeGroup.fromStructField(output.schema("encoded")) + assert(group.size === 2) + assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("small").withIndex(0)) + assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("medium").withIndex(1)) + } + + test("input column without ML attribute") { + val df = Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply).toDF("index") + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("index")) + .setOutputCols(Array("encoded")) + val model = encoder.fit(df) + val output = model.transform(df) + val group = AttributeGroup.fromStructField(output.schema("encoded")) + assert(group.size === 2) + assert(group.getAttr(0) === BinaryAttribute.defaultAttr.withName("0").withIndex(0)) + assert(group.getAttr(1) === BinaryAttribute.defaultAttr.withName("1").withIndex(1)) + } + + test("read/write") { + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("index")) + .setOutputCols(Array("encoded")) + testDefaultReadWrite(encoder) + } + + test("OneHotEncoderModel read/write") { + val instance = new OneHotEncoderModel("myOneHotEncoderModel", Array(1, 2, 3)) + val newInstance = testDefaultReadWrite(instance) + assert(newInstance.categorySizes === instance.categorySizes) + } + + test("OneHotEncoderEstimator with varying types") { + val data = Seq( + Row(0.0, Vectors.sparse(3, Seq((0, 1.0)))), + Row(1.0, Vectors.sparse(3, Seq((1, 1.0)))), + Row(2.0, Vectors.sparse(3, Seq((2, 1.0)))), + Row(0.0, Vectors.sparse(3, Seq((0, 1.0)))), + Row(0.0, Vectors.sparse(3, Seq((0, 1.0)))), + Row(2.0, Vectors.sparse(3, Seq((2, 1.0))))) + + val schema = StructType(Array( + StructField("input", DoubleType), + StructField("expected", new VectorUDT))) + + val df = spark.createDataFrame(sc.parallelize(data), schema) + + val dfWithTypes = df + .withColumn("shortInput", df("input").cast(ShortType)) + .withColumn("longInput", df("input").cast(LongType)) + .withColumn("intInput", df("input").cast(IntegerType)) + .withColumn("floatInput", df("input").cast(FloatType)) + .withColumn("decimalInput", df("input").cast(DecimalType(10, 0))) + + val cols = Array("input", "shortInput", "longInput", "intInput", + "floatInput", "decimalInput") + for (col <- cols) { + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array(col)) + .setOutputCols(Array("output")) + .setDropLast(false) + + val model = encoder.fit(dfWithTypes) + val encoded = 
model.transform(dfWithTypes) + + encoded.select("output", "expected").rdd.map { r => + (r.getAs[Vector](0), r.getAs[Vector](1)) + }.collect().foreach { case (vec1, vec2) => + assert(vec1 === vec2) + } + } + } + + test("OneHotEncoderEstimator: encoding multiple columns and dropLast = false") { + val data = Seq( + Row(0.0, Vectors.sparse(3, Seq((0, 1.0))), 2.0, Vectors.sparse(4, Seq((2, 1.0)))), + Row(1.0, Vectors.sparse(3, Seq((1, 1.0))), 3.0, Vectors.sparse(4, Seq((3, 1.0)))), + Row(2.0, Vectors.sparse(3, Seq((2, 1.0))), 0.0, Vectors.sparse(4, Seq((0, 1.0)))), + Row(0.0, Vectors.sparse(3, Seq((0, 1.0))), 1.0, Vectors.sparse(4, Seq((1, 1.0)))), + Row(0.0, Vectors.sparse(3, Seq((0, 1.0))), 0.0, Vectors.sparse(4, Seq((0, 1.0)))), + Row(2.0, Vectors.sparse(3, Seq((2, 1.0))), 2.0, Vectors.sparse(4, Seq((2, 1.0))))) + + val schema = StructType(Array( + StructField("input1", DoubleType), + StructField("expected1", new VectorUDT), + StructField("input2", DoubleType), + StructField("expected2", new VectorUDT))) + + val df = spark.createDataFrame(sc.parallelize(data), schema) + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("input1", "input2")) + .setOutputCols(Array("output1", "output2")) + assert(encoder.getDropLast === true) + encoder.setDropLast(false) + assert(encoder.getDropLast === false) + + val model = encoder.fit(df) + val encoded = model.transform(df) + encoded.select("output1", "expected1", "output2", "expected2").rdd.map { r => + (r.getAs[Vector](0), r.getAs[Vector](1), r.getAs[Vector](2), r.getAs[Vector](3)) + }.collect().foreach { case (vec1, vec2, vec3, vec4) => + assert(vec1 === vec2) + assert(vec3 === vec4) + } + } + + test("OneHotEncoderEstimator: encoding multiple columns and dropLast = true") { + val data = Seq( + Row(0.0, Vectors.sparse(2, Seq((0, 1.0))), 2.0, Vectors.sparse(3, Seq((2, 1.0)))), + Row(1.0, Vectors.sparse(2, Seq((1, 1.0))), 3.0, Vectors.sparse(3, Seq())), + Row(2.0, Vectors.sparse(2, Seq()), 0.0, Vectors.sparse(3, Seq((0, 1.0)))), + Row(0.0, Vectors.sparse(2, Seq((0, 1.0))), 1.0, Vectors.sparse(3, Seq((1, 1.0)))), + Row(0.0, Vectors.sparse(2, Seq((0, 1.0))), 0.0, Vectors.sparse(3, Seq((0, 1.0)))), + Row(2.0, Vectors.sparse(2, Seq()), 2.0, Vectors.sparse(3, Seq((2, 1.0))))) + + val schema = StructType(Array( + StructField("input1", DoubleType), + StructField("expected1", new VectorUDT), + StructField("input2", DoubleType), + StructField("expected2", new VectorUDT))) + + val df = spark.createDataFrame(sc.parallelize(data), schema) + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("input1", "input2")) + .setOutputCols(Array("output1", "output2")) + + val model = encoder.fit(df) + val encoded = model.transform(df) + encoded.select("output1", "expected1", "output2", "expected2").rdd.map { r => + (r.getAs[Vector](0), r.getAs[Vector](1), r.getAs[Vector](2), r.getAs[Vector](3)) + }.collect().foreach { case (vec1, vec2, vec3, vec4) => + assert(vec1 === vec2) + assert(vec3 === vec4) + } + } + + test("Throw error on invalid values") { + val trainingData = Seq((0, 0), (1, 1), (2, 2)) + val trainingDF = trainingData.toDF("id", "a") + val testData = Seq((0, 0), (1, 2), (1, 3)) + val testDF = testData.toDF("id", "a") + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("a")) + .setOutputCols(Array("encoded")) + + val model = encoder.fit(trainingDF) + val err = intercept[SparkException] { + model.transform(testDF).show + } + err.getMessage.contains("Unseen value: 3.0. 
To handle unseen values") + } + + test("Can't transform on negative input") { + val trainingDF = Seq((0, 0), (1, 1), (2, 2)).toDF("a", "b") + val testDF = Seq((0, 0), (-1, 2), (1, 3)).toDF("a", "b") + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("a")) + .setOutputCols(Array("encoded")) + + val model = encoder.fit(trainingDF) + val err = intercept[SparkException] { + model.transform(testDF).collect() + } + err.getMessage.contains("Negative value: -1.0. Input can't be negative") + } + + test("Keep on invalid values: dropLast = false") { + val trainingDF = Seq(Tuple1(0), Tuple1(1), Tuple1(2)).toDF("input") + + val testData = Seq( + Row(0.0, Vectors.sparse(4, Seq((0, 1.0)))), + Row(1.0, Vectors.sparse(4, Seq((1, 1.0)))), + Row(3.0, Vectors.sparse(4, Seq((3, 1.0))))) + + val schema = StructType(Array( + StructField("input", DoubleType), + StructField("expected", new VectorUDT))) + + val testDF = spark.createDataFrame(sc.parallelize(testData), schema) + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("input")) + .setOutputCols(Array("output")) + .setHandleInvalid("keep") + .setDropLast(false) + + val model = encoder.fit(trainingDF) + val encoded = model.transform(testDF) + encoded.select("output", "expected").rdd.map { r => + (r.getAs[Vector](0), r.getAs[Vector](1)) + }.collect().foreach { case (vec1, vec2) => + assert(vec1 === vec2) + } + } + + test("Keep on invalid values: dropLast = true") { + val trainingDF = Seq(Tuple1(0), Tuple1(1), Tuple1(2)).toDF("input") + + val testData = Seq( + Row(0.0, Vectors.sparse(3, Seq((0, 1.0)))), + Row(1.0, Vectors.sparse(3, Seq((1, 1.0)))), + Row(3.0, Vectors.sparse(3, Seq()))) + + val schema = StructType(Array( + StructField("input", DoubleType), + StructField("expected", new VectorUDT))) + + val testDF = spark.createDataFrame(sc.parallelize(testData), schema) + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("input")) + .setOutputCols(Array("output")) + .setHandleInvalid("keep") + .setDropLast(true) + + val model = encoder.fit(trainingDF) + val encoded = model.transform(testDF) + encoded.select("output", "expected").rdd.map { r => + (r.getAs[Vector](0), r.getAs[Vector](1)) + }.collect().foreach { case (vec1, vec2) => + assert(vec1 === vec2) + } + } + + test("OneHotEncoderModel changes dropLast") { + val data = Seq( + Row(0.0, Vectors.sparse(3, Seq((0, 1.0))), Vectors.sparse(2, Seq((0, 1.0)))), + Row(1.0, Vectors.sparse(3, Seq((1, 1.0))), Vectors.sparse(2, Seq((1, 1.0)))), + Row(2.0, Vectors.sparse(3, Seq((2, 1.0))), Vectors.sparse(2, Seq())), + Row(0.0, Vectors.sparse(3, Seq((0, 1.0))), Vectors.sparse(2, Seq((0, 1.0)))), + Row(0.0, Vectors.sparse(3, Seq((0, 1.0))), Vectors.sparse(2, Seq((0, 1.0)))), + Row(2.0, Vectors.sparse(3, Seq((2, 1.0))), Vectors.sparse(2, Seq()))) + + val schema = StructType(Array( + StructField("input", DoubleType), + StructField("expected1", new VectorUDT), + StructField("expected2", new VectorUDT))) + + val df = spark.createDataFrame(sc.parallelize(data), schema) + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("input")) + .setOutputCols(Array("output")) + + val model = encoder.fit(df) + + model.setDropLast(false) + val encoded1 = model.transform(df) + encoded1.select("output", "expected1").rdd.map { r => + (r.getAs[Vector](0), r.getAs[Vector](1)) + }.collect().foreach { case (vec1, vec2) => + assert(vec1 === vec2) + } + + model.setDropLast(true) + val encoded2 = model.transform(df) + encoded2.select("output", "expected2").rdd.map { r => + 
(r.getAs[Vector](0), r.getAs[Vector](1)) + }.collect().foreach { case (vec1, vec2) => + assert(vec1 === vec2) + } + } + + test("OneHotEncoderModel changes handleInvalid") { + val trainingDF = Seq(Tuple1(0), Tuple1(1), Tuple1(2)).toDF("input") + + val testData = Seq( + Row(0.0, Vectors.sparse(4, Seq((0, 1.0)))), + Row(1.0, Vectors.sparse(4, Seq((1, 1.0)))), + Row(3.0, Vectors.sparse(4, Seq((3, 1.0))))) + + val schema = StructType(Array( + StructField("input", DoubleType), + StructField("expected", new VectorUDT))) + + val testDF = spark.createDataFrame(sc.parallelize(testData), schema) + + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("input")) + .setOutputCols(Array("output")) + + val model = encoder.fit(trainingDF) + model.setHandleInvalid("error") + + val err = intercept[SparkException] { + model.transform(testDF).collect() + } + err.getMessage.contains("Unseen value: 3.0. To handle unseen values") + + model.setHandleInvalid("keep") + model.transform(testDF).collect() + } + + test("Transforming on mismatched attributes") { + val attr = NominalAttribute.defaultAttr.withValues("small", "medium", "large") + val df = Seq(0.0, 1.0, 2.0, 1.0).map(Tuple1.apply).toDF("size") + .select(col("size").as("size", attr.toMetadata())) + val encoder = new OneHotEncoderEstimator() + .setInputCols(Array("size")) + .setOutputCols(Array("encoded")) + val model = encoder.fit(df) + + val testAttr = NominalAttribute.defaultAttr.withValues("tiny", "small", "medium", "large") + val testDF = Seq(0.0, 1.0, 2.0, 3.0).map(Tuple1.apply).toDF("size") + .select(col("size").as("size", testAttr.toMetadata())) + val err = intercept[Exception] { + model.transform(testDF).collect() + } + err.getMessage.contains("OneHotEncoderModel expected 2 categorical values") + } +} From f5b7714e0eed4530519df97eb4faca8b05dde161 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Mon, 1 Jan 2018 08:47:12 -0600 Subject: [PATCH 242/356] [BUILD] Close stale PRs Closes #18916 Closes #19520 Closes #19613 Closes #19739 Closes #19936 Closes #19919 Closes #19933 Closes #19917 Closes #20027 Closes #19035 Closes #20044 Closes #20104 Author: Sean Owen Closes #20130 from srowen/StalePRs. From 7a702d8d5ed830de5d2237f136b08bd18deae037 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Tue, 2 Jan 2018 07:00:31 +0900 Subject: [PATCH 243/356] [SPARK-21616][SPARKR][DOCS] update R migration guide and vignettes ## What changes were proposed in this pull request? update R migration guide and vignettes ## How was this patch tested? manually Author: Felix Cheung Closes #20106 from felixcheung/rreleasenote23. --- R/pkg/tests/fulltests/test_Windows.R | 1 + R/pkg/vignettes/sparkr-vignettes.Rmd | 3 +-- docs/sparkr.md | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/pkg/tests/fulltests/test_Windows.R b/R/pkg/tests/fulltests/test_Windows.R index b2ec6c67311db..209827d9fdc2f 100644 --- a/R/pkg/tests/fulltests/test_Windows.R +++ b/R/pkg/tests/fulltests/test_Windows.R @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # + context("Windows-specific tests") test_that("sparkJars tag in SparkContext", { diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 8c4ea2f2db188..2e662424b25f2 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -391,8 +391,7 @@ We convert `mpg` to `kmpg` (kilometers per gallon). 
`carsSubDF` is a `SparkDataF ```{r} carsSubDF <- select(carsDF, "model", "mpg") -schema <- structType(structField("model", "string"), structField("mpg", "double"), - structField("kmpg", "double")) +schema <- "model STRING, mpg DOUBLE, kmpg DOUBLE" out <- dapply(carsSubDF, function(x) { x <- cbind(x, x$mpg * 1.61) }, schema) head(collect(out)) ``` diff --git a/docs/sparkr.md b/docs/sparkr.md index a3254e7654134..997ea60fb6cf0 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -657,3 +657,9 @@ You can inspect the search path in R with [`search()`](https://stat.ethz.ch/R-ma - By default, derby.log is now saved to `tempdir()`. This will be created when instantiating the SparkSession with `enableHiveSupport` set to `TRUE`. - `spark.lda` was not setting the optimizer correctly. It has been corrected. - Several model summary outputs are updated to have `coefficients` as `matrix`. This includes `spark.logit`, `spark.kmeans`, `spark.glm`. Model summary outputs for `spark.gaussianMixture` have added log-likelihood as `loglik`. + +## Upgrading to SparkR 2.3.0 + + - The `stringsAsFactors` parameter was previously ignored with `collect`, for example, in `collect(createDataFrame(iris), stringsAsFactors = TRUE))`. It has been corrected. + - For `summary`, option for statistics to compute has been added. Its output is changed from that from `describe`. + - A warning can be raised if versions of SparkR package and the Spark JVM do not match. From c284c4e1f6f684ca8db1cc446fdcc43b46e3413c Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Sun, 31 Dec 2017 17:00:41 -0600 Subject: [PATCH 244/356] [MINOR] Fix a bunch of typos --- bin/find-spark-home | 2 +- .../java/org/apache/spark/util/kvstore/LevelDBIterator.java | 2 +- .../org/apache/spark/network/protocol/MessageWithHeader.java | 4 ++-- .../java/org/apache/spark/network/sasl/SaslEncryption.java | 4 ++-- .../org/apache/spark/network/util/TransportFrameDecoder.java | 2 +- .../network/shuffle/ExternalShuffleBlockResolverSuite.java | 2 +- .../main/java/org/apache/spark/util/sketch/BloomFilter.java | 2 +- .../java/org/apache/spark/unsafe/array/ByteArrayMethods.java | 2 +- core/src/main/scala/org/apache/spark/SparkContext.scala | 2 +- core/src/main/scala/org/apache/spark/status/storeTypes.scala | 2 +- .../test/scala/org/apache/spark/util/FileAppenderSuite.scala | 2 +- dev/github_jira_sync.py | 2 +- dev/lint-python | 2 +- examples/src/main/python/ml/linearsvc.py | 2 +- .../scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala | 2 +- .../scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala | 2 +- .../apache/spark/streaming/kafka010/JavaKafkaRDDSuite.java | 2 +- .../org/apache/spark/streaming/kinesis/KinesisUtils.scala | 4 ++-- .../java/org/apache/spark/launcher/ChildProcAppHandle.java | 2 +- .../scala/org/apache/spark/ml/tuning/CrossValidator.scala | 2 +- python/pyspark/ml/image.py | 2 +- .../org/apache/spark/deploy/yarn/YarnClusterSuite.scala | 2 +- .../spark/sql/catalyst/expressions/UnsafeArrayData.java | 2 +- .../scala/org/apache/spark/sql/catalyst/analysis/view.scala | 2 +- .../spark/sql/catalyst/expressions/objects/objects.scala | 5 +++-- .../expressions/aggregate/CountMinSketchAggSuite.scala | 2 +- .../sql/sources/v2/streaming/MicroBatchWriteSupport.java | 2 +- .../apache/spark/sql/execution/ui/static/spark-sql-viz.css | 2 +- .../apache/spark/sql/execution/datasources/FileFormat.scala | 2 +- .../spark/sql/execution/datasources/csv/CSVInferSchema.scala | 2 +- .../apache/spark/sql/execution/joins/HashedRelation.scala | 2 +- 
.../streaming/StreamingSymmetricHashJoinHelper.scala | 2 +- .../apache/spark/sql/execution/ui/SQLAppStatusListener.scala | 2 +- .../scala/org/apache/spark/sql/expressions/Aggregator.scala | 2 +- .../main/scala/org/apache/spark/sql/streaming/progress.scala | 2 +- .../src/test/java/test/org/apache/spark/sql/MyDoubleAvg.java | 2 +- .../inputs/typeCoercion/native/implicitTypeCasts.sql | 2 +- .../execution/streaming/CompactibleFileStreamLogSuite.scala | 2 +- .../org/apache/spark/sql/sources/fakeExternalSources.scala | 2 +- .../org/apache/spark/sql/streaming/FileStreamSinkSuite.scala | 2 +- .../test/scala/org/apache/spark/sql/test/SQLTestUtils.scala | 2 +- .../src/main/scala/org/apache/spark/sql/hive/HiveShim.scala | 2 +- sql/hive/src/test/resources/data/conf/hive-log4j.properties | 2 +- .../org/apache/spark/streaming/rdd/MapWithStateRDD.scala | 2 +- .../scala/org/apache/spark/streaming/util/StateMap.scala | 2 +- 45 files changed, 50 insertions(+), 49 deletions(-) diff --git a/bin/find-spark-home b/bin/find-spark-home index fa78407d4175a..617dbaa4fff86 100755 --- a/bin/find-spark-home +++ b/bin/find-spark-home @@ -21,7 +21,7 @@ FIND_SPARK_HOME_PYTHON_SCRIPT="$(cd "$(dirname "$0")"; pwd)/find_spark_home.py" -# Short cirtuit if the user already has this set. +# Short circuit if the user already has this set. if [ ! -z "${SPARK_HOME}" ]; then exit 0 elif [ ! -f "$FIND_SPARK_HOME_PYTHON_SCRIPT" ]; then diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java index b3ba76ba58052..f62e85d435318 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java @@ -86,7 +86,7 @@ class LevelDBIterator implements KVStoreIterator { end = index.start(parent, params.last); } if (it.hasNext()) { - // When descending, the caller may have set up the start of iteration at a non-existant + // When descending, the caller may have set up the start of iteration at a non-existent // entry that is guaranteed to be after the desired entry. For example, if you have a // compound key (a, b) where b is a, integer, you may seek to the end of the elements that // have the same "a" value by specifying Integer.MAX_VALUE for "b", and that value may not diff --git a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java index 897d0f9e4fb89..a5337656cbd84 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java +++ b/common/network-common/src/main/java/org/apache/spark/network/protocol/MessageWithHeader.java @@ -47,7 +47,7 @@ class MessageWithHeader extends AbstractFileRegion { /** * When the write buffer size is larger than this limit, I/O will be done in chunks of this size. * The size should not be too large as it will waste underlying memory copy. e.g. If network - * avaliable buffer is smaller than this limit, the data cannot be sent within one single write + * available buffer is smaller than this limit, the data cannot be sent within one single write * operation while it still will make memory copy with this size. */ private static final int NIO_BUFFER_LIMIT = 256 * 1024; @@ -100,7 +100,7 @@ public long transferred() { * transferTo invocations in order to transfer a single MessageWithHeader to avoid busy waiting. 
* * The contract is that the caller will ensure position is properly set to the total number - * of bytes transferred so far (i.e. value returned by transfered()). + * of bytes transferred so far (i.e. value returned by transferred()). */ @Override public long transferTo(final WritableByteChannel target, final long position) throws IOException { diff --git a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslEncryption.java b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslEncryption.java index 16ab4efcd4f5f..3ac9081d78a75 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslEncryption.java +++ b/common/network-common/src/main/java/org/apache/spark/network/sasl/SaslEncryption.java @@ -38,7 +38,7 @@ import org.apache.spark.network.util.NettyUtils; /** - * Provides SASL-based encription for transport channels. The single method exposed by this + * Provides SASL-based encryption for transport channels. The single method exposed by this * class installs the needed channel handlers on a connected channel. */ class SaslEncryption { @@ -166,7 +166,7 @@ static class EncryptedMessage extends AbstractFileRegion { * This makes assumptions about how netty treats FileRegion instances, because there's no way * to know beforehand what will be the size of the encrypted message. Namely, it assumes * that netty will try to transfer data from this message while - * transfered() < count(). So these two methods return, technically, wrong data, + * transferred() < count(). So these two methods return, technically, wrong data, * but netty doesn't know better. */ @Override diff --git a/common/network-common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java b/common/network-common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java index 50d9651ccbbb2..8e73ab077a5c1 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java +++ b/common/network-common/src/main/java/org/apache/spark/network/util/TransportFrameDecoder.java @@ -29,7 +29,7 @@ /** * A customized frame decoder that allows intercepting raw data. *

    - * This behaves like Netty's frame decoder (with harcoded parameters that match this library's + * This behaves like Netty's frame decoder (with hard coded parameters that match this library's * needs), except it allows an interceptor to be installed to read data directly before it's * framed. *

    diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java index 23438a08fa094..6d201b8fe8d7d 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/ExternalShuffleBlockResolverSuite.java @@ -127,7 +127,7 @@ public void jsonSerializationOfExecutorRegistration() throws IOException { mapper.readValue(shuffleJson, ExecutorShuffleInfo.class); assertEquals(parsedShuffleInfo, shuffleInfo); - // Intentionally keep these hard-coded strings in here, to check backwards-compatability. + // Intentionally keep these hard-coded strings in here, to check backwards-compatibility. // its not legacy yet, but keeping this here in case anybody changes it String legacyAppIdJson = "{\"appId\":\"foo\", \"execId\":\"bar\"}"; assertEquals(appId, mapper.readValue(legacyAppIdJson, AppExecId.class)); diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java index c0b425e729595..37803c7a3b104 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/BloomFilter.java @@ -34,7 +34,7 @@ *

*   <li>{@link String}</li>
  • * * The false positive probability ({@code FPP}) of a Bloom filter is defined as the probability that - * {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that hasu + * {@linkplain #mightContain(Object)} will erroneously return {@code true} for an object that has * not actually been put in the {@code BloomFilter}. * * The implementation is largely based on the {@code BloomFilter} class from Guava. diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java index f121b1cd745b8..a6b1f7a16d605 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java @@ -66,7 +66,7 @@ public static boolean arrayEquals( i += 1; } } - // for architectures that suport unaligned accesses, chew it up 8 bytes at a time + // for architectures that support unaligned accesses, chew it up 8 bytes at a time if (unaligned || (((leftOffset + i) % 8 == 0) && ((rightOffset + i) % 8 == 0))) { while (i <= length - 8) { if (Platform.getLong(leftBase, leftOffset + i) != diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 1dbb3789b0c9d..31f3cb9dfa0ae 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -2276,7 +2276,7 @@ class SparkContext(config: SparkConf) extends Logging { } /** - * Clean a closure to make it ready to serialized and send to tasks + * Clean a closure to make it ready to be serialized and send to tasks * (removes unreferenced variables in $outer's, updates REPL variables) * If checkSerializable is set, clean will also proactively * check to see if f is serializable and throw a SparkException diff --git a/core/src/main/scala/org/apache/spark/status/storeTypes.scala b/core/src/main/scala/org/apache/spark/status/storeTypes.scala index d9ead0071d3bf..1cfd30df49091 100644 --- a/core/src/main/scala/org/apache/spark/status/storeTypes.scala +++ b/core/src/main/scala/org/apache/spark/status/storeTypes.scala @@ -61,7 +61,7 @@ private[spark] class ExecutorSummaryWrapper(val info: ExecutorSummary) { /** * Keep track of the existing stages when the job was submitted, and those that were - * completed during the job's execution. This allows a more accurate acounting of how + * completed during the job's execution. This allows a more accurate accounting of how * many tasks were skipped for the job. 
*/ private[spark] class JobDataWrapper( diff --git a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala index cd0ed5b036bf9..52cd5378bc715 100644 --- a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala @@ -356,7 +356,7 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter with Logging { generatedFiles } - /** Delete all the generated rolledover files */ + /** Delete all the generated rolled over files */ def cleanup() { testFile.getParentFile.listFiles.filter { file => file.getName.startsWith(testFile.getName) diff --git a/dev/github_jira_sync.py b/dev/github_jira_sync.py index acc9aeabbb9fb..7df7b325cabe0 100755 --- a/dev/github_jira_sync.py +++ b/dev/github_jira_sync.py @@ -43,7 +43,7 @@ # "notification overload" when running for the first time. MIN_COMMENT_PR = int(os.environ.get("MIN_COMMENT_PR", "1496")) -# File used as an opitimization to store maximum previously seen PR +# File used as an optimization to store maximum previously seen PR # Used mostly because accessing ASF JIRA is slow, so we want to avoid checking # the state of JIRA's that are tied to PR's we've already looked at. MAX_FILE = ".github-jira-max" diff --git a/dev/lint-python b/dev/lint-python index 07e2606d45143..df8df037a5f69 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -19,7 +19,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" -# Exclude auto-geneated configuration file. +# Exclude auto-generated configuration file. PATHS_TO_CHECK="$( cd "$SPARK_ROOT_DIR" && find . -name "*.py" )" PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt" diff --git a/examples/src/main/python/ml/linearsvc.py b/examples/src/main/python/ml/linearsvc.py index 18cbf87a10695..9b79abbf96f88 100644 --- a/examples/src/main/python/ml/linearsvc.py +++ b/examples/src/main/python/ml/linearsvc.py @@ -37,7 +37,7 @@ # Fit the model lsvcModel = lsvc.fit(training) - # Print the coefficients and intercept for linearsSVC + # Print the coefficients and intercept for linear SVC print("Coefficients: " + str(lsvcModel.coefficients)) print("Intercept: " + str(lsvcModel.intercept)) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala index 9d9e2aaba8079..66b3409c0cd04 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala @@ -52,7 +52,7 @@ private[kafka010] case class KafkaSourceRDDPartition( * An RDD that reads data from Kafka based on offset ranges across multiple partitions. * Additionally, it allows preferred locations to be set for each topic + partition, so that * the [[KafkaSource]] can ensure the same executor always reads the same topic + partition - * and cached KafkaConsuemrs (see [[CachedKafkaConsumer]] can be used read data efficiently. + * and cached KafkaConsumers (see [[CachedKafkaConsumer]] can be used read data efficiently. 
* * @param sc the [[SparkContext]] * @param executorKafkaParams Kafka configuration for creating KafkaConsumer on the executors diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala index 08db4d827e400..75245943c4936 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala @@ -201,7 +201,7 @@ class KafkaTestUtils(withBrokerProps: Map[String, Object] = Map.empty) extends L verifyTopicDeletionWithRetries(zkUtils, topic, partitions, List(this.server)) } - /** Add new paritions to a Kafka topic */ + /** Add new partitions to a Kafka topic */ def addPartitions(topic: String, partitions: Int): Unit = { AdminUtils.addPartitions(zkUtils, topic, partitions) // wait until metadata is propagated diff --git a/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaKafkaRDDSuite.java b/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaKafkaRDDSuite.java index 87bfe1514e338..b20fad2291262 100644 --- a/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaKafkaRDDSuite.java +++ b/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaKafkaRDDSuite.java @@ -111,7 +111,7 @@ public String call(ConsumerRecord r) { LocationStrategies.PreferConsistent() ).map(handler); - // just making sure the java user apis work; the scala tests handle logic corner cases + // just making sure the java user APIs work; the scala tests handle logic corner cases long count1 = rdd1.count(); long count2 = rdd2.count(); Assert.assertTrue(count1 > 0); diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala index 2500460bd330b..c60b9896a3473 100644 --- a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala +++ b/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtils.scala @@ -164,7 +164,7 @@ object KinesisUtils { * @param awsSecretKey AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain) * @param stsAssumeRoleArn ARN of IAM role to assume when using STS sessions to read from * Kinesis stream. - * @param stsSessionName Name to uniquely identify STS sessions if multiple princples assume + * @param stsSessionName Name to uniquely identify STS sessions if multiple principals assume * the same role. * @param stsExternalId External ID that can be used to validate against the assumed IAM role's * trust policy. @@ -434,7 +434,7 @@ object KinesisUtils { * @param awsSecretKey AWS SecretKey (if null, will use DefaultAWSCredentialsProviderChain) * @param stsAssumeRoleArn ARN of IAM role to assume when using STS sessions to read from * Kinesis stream. - * @param stsSessionName Name to uniquely identify STS sessions if multiple princples assume + * @param stsSessionName Name to uniquely identify STS sessions if multiple princpals assume * the same role. * @param stsExternalId External ID that can be used to validate against the assumed IAM role's * trust policy. 
diff --git a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java index 3bb7e12385fd8..8b3f427b7750e 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java @@ -71,7 +71,7 @@ void setChildProc(Process childProc, String loggerName, InputStream logStream) { } /** - * Wait for the child process to exit and update the handle's state if necessary, accoding to + * Wait for the child process to exit and update the handle's state if necessary, according to * the exit code. */ void monitorChild() { diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 0130b3e255f0d..095b54c0fe83f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -94,7 +94,7 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) def setSeed(value: Long): this.type = set(seed, value) /** - * Set the mamixum level of parallelism to evaluate models in parallel. + * Set the maximum level of parallelism to evaluate models in parallel. * Default is 1 for serial evaluation * * @group expertSetParam diff --git a/python/pyspark/ml/image.py b/python/pyspark/ml/image.py index 384599dc0c532..c9b840276f675 100644 --- a/python/pyspark/ml/image.py +++ b/python/pyspark/ml/image.py @@ -212,7 +212,7 @@ def readImages(self, path, recursive=False, numPartitions=-1, ImageSchema = _ImageSchema() -# Monkey patch to disallow instantization of this class. +# Monkey patch to disallow instantiation of this class. 
def _disallow_instance(_): raise RuntimeError("Creating instance of _ImageSchema class is disallowed.") _ImageSchema.__init__ = _disallow_instance diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index ab0005d7b53a8..061f653b97b7a 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -95,7 +95,7 @@ class YarnClusterSuite extends BaseYarnClusterSuite { "spark.executor.cores" -> "1", "spark.executor.memory" -> "512m", "spark.executor.instances" -> "2", - // Sending some senstive information, which we'll make sure gets redacted + // Sending some sensitive information, which we'll make sure gets redacted "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD" -> YarnClusterDriver.SECRET_PASSWORD, "spark.yarn.appMasterEnv.HADOOP_CREDSTORE_PASSWORD" -> YarnClusterDriver.SECRET_PASSWORD )) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java index 64ab01ca57403..d18542b188f71 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java @@ -294,7 +294,7 @@ public void setNullAt(int ordinal) { assertIndexIsValid(ordinal); BitSetMethods.set(baseObject, baseOffset + 8, ordinal); - /* we assume the corrresponding column was already 0 or + /* we assume the corresponding column was already 0 or will be set to 0 later by the caller side */ } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala index 3bbe41cf8f15e..20216087b0158 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/view.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.internal.SQLConf * view resolution, in this way, we are able to get the correct view column ordering and * omit the extra columns that we don't require); * 1.2. Else set the child output attributes to `queryOutput`. - * 2. Map the `queryQutput` to view output by index, if the corresponding attributes don't match, + * 2. Map the `queryOutput` to view output by index, if the corresponding attributes don't match, * try to up cast and alias the attribute in `queryOutput` to the attribute in the view output. * 3. Add a Project over the child, with the new output generated by the previous steps. * If the view output doesn't have the same number of columns neither with the child output, nor diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 4af813456b790..64da9bb9cdec1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -51,7 +51,7 @@ trait InvokeLike extends Expression with NonSQLExpression { * * - generate codes for argument. 
* - use ctx.splitExpressions() to not exceed 64kb JVM limit while preparing arguments. - * - avoid some of nullabilty checking which are not needed because the expression is not + * - avoid some of nullability checking which are not needed because the expression is not * nullable. * - when needNullCheck == true, short circuit if we found one of arguments is null because * preparing rest of arguments can be skipped in the case. @@ -193,7 +193,8 @@ case class StaticInvoke( * @param targetObject An expression that will return the object to call the method on. * @param functionName The name of the method to call. * @param dataType The expected return type of the function. - * @param arguments An optional list of expressions, whos evaluation will be passed to the function. + * @param arguments An optional list of expressions, whose evaluation will be passed to the + * function. * @param propagateNull When true, and any of the arguments is null, null will be returned instead * of calling the function. * @param returnNullable When false, indicating the invoked method will always return diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountMinSketchAggSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountMinSketchAggSuite.scala index 10479630f3f99..30e3bc9fb5779 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountMinSketchAggSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CountMinSketchAggSuite.scala @@ -30,7 +30,7 @@ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.sketch.CountMinSketch /** - * Unit test suite for the count-min sketch SQL aggregate funciton [[CountMinSketchAgg]]. + * Unit test suite for the count-min sketch SQL aggregate function [[CountMinSketchAgg]]. */ class CountMinSketchAggSuite extends SparkFunSuite { private val childExpression = BoundReference(0, IntegerType, nullable = true) diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java index b5e3e44cd6332..53ffa95ae0f4c 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java +++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/streaming/MicroBatchWriteSupport.java @@ -41,7 +41,7 @@ public interface MicroBatchWriteSupport extends BaseStreamingSink { * @param queryId A unique string for the writing query. It's possible that there are many writing * queries running at the same time, and the returned {@link DataSourceV2Writer} * can use this id to distinguish itself from others. - * @param epochId The uniquenumeric ID of the batch within this writing query. This is an + * @param epochId The unique numeric ID of the batch within this writing query. This is an * incrementing counter representing a consistent set of data; the same batch may * be started multiple times in failure recovery scenarios, but it will always * contain the same records. 
diff --git a/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.css b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.css index 594e747a8d3a5..b13850c301490 100644 --- a/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.css +++ b/sql/core/src/main/resources/org/apache/spark/sql/execution/ui/static/spark-sql-viz.css @@ -32,7 +32,7 @@ stroke-width: 1px; } -/* Hightlight the SparkPlan node name */ +/* Highlight the SparkPlan node name */ #plan-viz-graph svg text :first-child { font-weight: bold; } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala index d3874b58bc807..023e127888290 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileFormat.scala @@ -77,7 +77,7 @@ trait FileFormat { } /** - * Returns whether a file with `path` could be splitted or not. + * Returns whether a file with `path` could be split or not. */ def isSplitable( sparkSession: SparkSession, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala index b64d71bb4eef2..a585cbed2551b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVInferSchema.scala @@ -150,7 +150,7 @@ private[csv] object CSVInferSchema { if ((allCatch opt options.timestampFormat.parse(field)).isDefined) { TimestampType } else if ((allCatch opt DateTimeUtils.stringToTime(field)).isDefined) { - // We keep this for backwords competibility. + // We keep this for backwards compatibility. TimestampType } else { tryParseBoolean(field, options) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index d98cf852a1b48..1465346eb802d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -368,7 +368,7 @@ private[execution] final class LongToUnsafeRowMap(val mm: TaskMemoryManager, cap // The minimum key private var minKey = Long.MaxValue - // The maxinum key + // The maximum key private var maxKey = Long.MinValue // The array to store the key and offset of UnsafeRow in the page. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala index 217e98a6419e9..4aba76cad367e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinHelper.scala @@ -203,7 +203,7 @@ object StreamingSymmetricHashJoinHelper extends Logging { /** * A custom RDD that allows partitions to be "zipped" together, while ensuring the tasks' * preferred location is based on which executors have the required join state stores already - * loaded. This is class is a modified verion of [[ZippedPartitionsRDD2]]. + * loaded. 
This is class is a modified version of [[ZippedPartitionsRDD2]]. */ class StateStoreAwareZipPartitionsRDD[A: ClassTag, B: ClassTag, V: ClassTag]( sc: SparkContext, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index 2295b8dd5fe36..d8adbe7bee13e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -175,7 +175,7 @@ class SQLAppStatusListener( // Check the execution again for whether the aggregated metrics data has been calculated. // This can happen if the UI is requesting this data, and the onExecutionEnd handler is - // running at the same time. The metrics calculcated for the UI can be innacurate in that + // running at the same time. The metrics calculated for the UI can be innacurate in that // case, since the onExecutionEnd handler will clean up tracked stage metrics. if (exec.metricsValues != null) { exec.metricsValues diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala index 058c38c8cb8f4..1e076207bc607 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/Aggregator.scala @@ -86,7 +86,7 @@ abstract class Aggregator[-IN, BUF, OUT] extends Serializable { def bufferEncoder: Encoder[BUF] /** - * Specifies the `Encoder` for the final ouput value type. + * Specifies the `Encoder` for the final output value type. * @since 2.0.0 */ def outputEncoder: Encoder[OUT] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala index cedc1dce4a703..0dcb666e2c3e4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/progress.scala @@ -152,7 +152,7 @@ class StreamingQueryProgress private[sql]( * @param endOffset The ending offset for data being read. * @param numInputRows The number of records read from this source. * @param inputRowsPerSecond The rate at which data is arriving from this source. - * @param processedRowsPerSecond The rate at which data from this source is being procressed by + * @param processedRowsPerSecond The rate at which data from this source is being processed by * Spark. * @since 2.1.0 */ diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/MyDoubleAvg.java b/sql/core/src/test/java/test/org/apache/spark/sql/MyDoubleAvg.java index 447a71d284fbb..288f5e7426c05 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/MyDoubleAvg.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/MyDoubleAvg.java @@ -47,7 +47,7 @@ public MyDoubleAvg() { _inputDataType = DataTypes.createStructType(inputFields); // The buffer has two values, bufferSum for storing the current sum and - // bufferCount for storing the number of non-null input values that have been contribuetd + // bufferCount for storing the number of non-null input values that have been contributed // to the current sum. 
List bufferFields = new ArrayList<>(); bufferFields.add(DataTypes.createStructField("bufferSum", DataTypes.DoubleType, true)); diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/implicitTypeCasts.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/implicitTypeCasts.sql index 58866f4b18112..6de22b8b7c3de 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/implicitTypeCasts.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/implicitTypeCasts.sql @@ -32,7 +32,7 @@ SELECT 1.1 - '2.2' FROM t; SELECT 1.1 * '2.2' FROM t; SELECT 4.4 / '2.2' FROM t; --- concatentation +-- concatenation SELECT '$' || cast(1 as smallint) || '$' FROM t; SELECT '$' || 1 || '$' FROM t; SELECT '$' || cast(1 as bigint) || '$' FROM t; diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala index 83018f95aa55d..12eaf63415081 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/CompactibleFileStreamLogSuite.scala @@ -92,7 +92,7 @@ class CompactibleFileStreamLogSuite extends SparkFunSuite with SharedSQLContext test("deriveCompactInterval") { // latestCompactBatchId(4) + 1 <= default(5) - // then use latestestCompactBatchId + 1 === 5 + // then use latestCompactBatchId + 1 === 5 assert(5 === deriveCompactInterval(5, 4)) // First divisor of 10 greater than 4 === 5 assert(5 === deriveCompactInterval(4, 9)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/fakeExternalSources.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/fakeExternalSources.scala index bf43de597a7a0..2cb48281b30af 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/fakeExternalSources.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/fakeExternalSources.scala @@ -22,7 +22,7 @@ import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationP import org.apache.spark.sql.types._ -// Note that the package name is intendedly mismatched in order to resemble external data sources +// Note that the package name is intentionally mismatched in order to resemble external data sources // and test the detection for them. class FakeExternalSourceOne extends RelationProvider with DataSourceRegister { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala index 2a2552211857a..8c4e1fd00b0a2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala @@ -81,7 +81,7 @@ class FileStreamSinkSuite extends StreamTest { .start(outputDir) try { - // The output is partitoned by "value", so the value will appear in the file path. + // The output is partitioned by "value", so the value will appear in the file path. // This is to test if we handle spaces in the path correctly. 
inputData.addData("hello world") failAfter(streamingTimeout) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index b4248b74f50ab..904f9f2ad0b22 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -113,7 +113,7 @@ private[sql] trait SQLTestUtils extends SparkFunSuite with SQLTestUtilsBase with if (thread.isAlive) { thread.interrupt() // If this interrupt does not work, then this thread is most likely running something that - // is not interruptible. There is not much point to wait for the thread to termniate, and + // is not interruptible. There is not much point to wait for the thread to terminate, and // we rather let the JVM terminate the thread on exit. fail( s"Test '$name' running on o.a.s.util.UninterruptibleThread timed out after" + diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala index 9e9894803ce25..11afe1af32809 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveShim.scala @@ -50,7 +50,7 @@ private[hive] object HiveShim { val HIVE_GENERIC_UDF_MACRO_CLS = "org.apache.hadoop.hive.ql.udf.generic.GenericUDFMacro" /* - * This function in hive-0.13 become private, but we have to do this to walkaround hive bug + * This function in hive-0.13 become private, but we have to do this to work around hive bug */ private def appendReadColumnNames(conf: Configuration, cols: Seq[String]) { val old: String = conf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, "") diff --git a/sql/hive/src/test/resources/data/conf/hive-log4j.properties b/sql/hive/src/test/resources/data/conf/hive-log4j.properties index 6a042472adb90..83fd03a99bc37 100644 --- a/sql/hive/src/test/resources/data/conf/hive-log4j.properties +++ b/sql/hive/src/test/resources/data/conf/hive-log4j.properties @@ -32,7 +32,7 @@ log4j.threshhold=WARN log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender log4j.appender.DRFA.File=${hive.log.dir}/${hive.log.file} -# Rollver at midnight +# Roll over at midnight log4j.appender.DRFA.DatePattern=.yyyy-MM-dd # 30-day backup diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/MapWithStateRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/MapWithStateRDD.scala index 15d3c7e54b8dd..8da5a5f8193cf 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/MapWithStateRDD.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/MapWithStateRDD.scala @@ -162,7 +162,7 @@ private[streaming] class MapWithStateRDD[K: ClassTag, V: ClassTag, S: ClassTag, mappingFunction, batchTime, timeoutThresholdTime, - removeTimedoutData = doFullScan // remove timedout data only when full scan is enabled + removeTimedoutData = doFullScan // remove timed-out data only when full scan is enabled ) Iterator(newRecord) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/StateMap.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/StateMap.scala index 3a21cfae5ac2f..89524cd84ff32 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/StateMap.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/StateMap.scala @@ -364,7 +364,7 @@ private[streaming] object OpenHashMapBasedStateMap { } /** - * Internal class to 
represent a marker the demarkate the end of all state data in the + * Internal class to represent a marker that demarcates the end of all state data in the * serialized bytes. */ class LimitMarker(val num: Int) extends Serializable From 1c9f95cb771ac78775a77edd1abfeb2d8ae2a124 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 2 Jan 2018 07:13:27 +0900 Subject: [PATCH 245/356] [SPARK-22530][PYTHON][SQL] Adding Arrow support for ArrayType ## What changes were proposed in this pull request? This change adds `ArrayType` support for working with Arrow in pyspark when creating a DataFrame, calling `toPandas()`, and using vectorized `pandas_udf`. ## How was this patch tested? Added new Python unit tests using Array data. Author: Bryan Cutler Closes #20114 from BryanCutler/arrow-ArrayType-support-SPARK-22530. --- python/pyspark/sql/tests.py | 47 ++++++++++++++++++- python/pyspark/sql/types.py | 4 ++ .../vectorized/ArrowColumnVector.java | 13 ++++- 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 1c34c897eecb5..67bdb3d72d93b 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3372,6 +3372,31 @@ def test_schema_conversion_roundtrip(self): schema_rt = from_arrow_schema(arrow_schema) self.assertEquals(self.schema, schema_rt) + def test_createDataFrame_with_array_type(self): + import pandas as pd + pdf = pd.DataFrame({"a": [[1, 2], [3, 4]], "b": [[u"x", u"y"], [u"y", u"z"]]}) + df, df_arrow = self._createDataFrame_toggle(pdf) + result = df.collect() + result_arrow = df_arrow.collect() + expected = [tuple(list(e) for e in rec) for rec in pdf.to_records(index=False)] + for r in range(len(expected)): + for e in range(len(expected[r])): + self.assertTrue(expected[r][e] == result_arrow[r][e] and + result[r][e] == result_arrow[r][e]) + + def test_toPandas_with_array_type(self): + expected = [([1, 2], [u"x", u"y"]), ([3, 4], [u"y", u"z"])] + array_schema = StructType([StructField("a", ArrayType(IntegerType())), + StructField("b", ArrayType(StringType()))]) + df = self.spark.createDataFrame(expected, schema=array_schema) + pdf, pdf_arrow = self._toPandas_arrow_toggle(df) + result = [tuple(list(e) for e in rec) for rec in pdf.to_records(index=False)] + result_arrow = [tuple(list(e) for e in rec) for rec in pdf_arrow.to_records(index=False)] + for r in range(len(expected)): + for e in range(len(expected[r])): + self.assertTrue(expected[r][e] == result_arrow[r][e] and + result[r][e] == result_arrow[r][e]) + @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class PandasUDFTests(ReusedSQLTestCase): @@ -3651,6 +3676,24 @@ def test_vectorized_udf_datatype_string(self): bool_f(col('bool'))) self.assertEquals(df.collect(), res.collect()) + def test_vectorized_udf_array_type(self): + from pyspark.sql.functions import pandas_udf, col + data = [([1, 2],), ([3, 4],)] + array_schema = StructType([StructField("array", ArrayType(IntegerType()))]) + df = self.spark.createDataFrame(data, schema=array_schema) + array_f = pandas_udf(lambda x: x, ArrayType(IntegerType())) + result = df.select(array_f(col('array'))) + self.assertEquals(df.collect(), result.collect()) + + def test_vectorized_udf_null_array(self): + from pyspark.sql.functions import pandas_udf, col + data = [([1, 2],), (None,), (None,), ([3, 4],), (None,)] + array_schema = StructType([StructField("array", ArrayType(IntegerType()))]) + df = self.spark.createDataFrame(data, schema=array_schema) + array_f = pandas_udf(lambda 
x: x, ArrayType(IntegerType())) + result = df.select(array_f(col('array'))) + self.assertEquals(df.collect(), result.collect()) + def test_vectorized_udf_complex(self): from pyspark.sql.functions import pandas_udf, col, expr df = self.spark.range(10).select( @@ -3705,7 +3748,7 @@ def test_vectorized_udf_chained(self): def test_vectorized_udf_wrong_return_type(self): from pyspark.sql.functions import pandas_udf, col df = self.spark.range(10) - f = pandas_udf(lambda x: x * 1.0, ArrayType(LongType())) + f = pandas_udf(lambda x: x * 1.0, MapType(LongType(), LongType())) with QuietTest(self.sc): with self.assertRaisesRegexp(Exception, 'Unsupported.*type.*conversion'): df.select(f(col('id'))).collect() @@ -4009,7 +4052,7 @@ def test_wrong_return_type(self): foo = pandas_udf( lambda pdf: pdf, - 'id long, v array', + 'id long, v map', PandasUDFType.GROUP_MAP ) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 02b245713b6ab..146e673ae9756 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1625,6 +1625,8 @@ def to_arrow_type(dt): elif type(dt) == TimestampType: # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read arrow_type = pa.timestamp('us', tz='UTC') + elif type(dt) == ArrayType: + arrow_type = pa.list_(to_arrow_type(dt.elementType)) else: raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) return arrow_type @@ -1665,6 +1667,8 @@ def from_arrow_type(at): spark_type = DateType() elif types.is_timestamp(at): spark_type = TimestampType() + elif types.is_list(at): + spark_type = ArrayType(from_arrow_type(at.value_type)) else: raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) return spark_type diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java index 528f66f342dc9..af5673e26a501 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java @@ -326,7 +326,8 @@ private abstract static class ArrowVectorAccessor { this.vector = vector; } - final boolean isNullAt(int rowId) { + // TODO: should be final after removing ArrayAccessor workaround + boolean isNullAt(int rowId) { return vector.isNull(rowId); } @@ -589,6 +590,16 @@ private static class ArrayAccessor extends ArrowVectorAccessor { this.accessor = vector; } + @Override + final boolean isNullAt(int rowId) { + // TODO: Workaround if vector has all non-null values, see ARROW-1948 + if (accessor.getValueCount() > 0 && accessor.getValidityBuffer().capacity() == 0) { + return false; + } else { + return super.isNullAt(rowId); + } + } + @Override final int getArrayLength(int rowId) { return accessor.getInnerValueCountAt(rowId); From e734a4b9c23463a7fea61011027a822bc9e11c98 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Tue, 2 Jan 2018 07:20:05 +0900 Subject: [PATCH 246/356] [SPARK-21893][SPARK-22142][TESTS][FOLLOWUP] Enables PySpark tests for Flume and Kafka in Jenkins ## What changes were proposed in this pull request? This PR proposes to enable PySpark tests for Flume and Kafka in Jenkins by explicitly setting the environment variables in `modules.py`. 
Seems we are not taking the dependencies into account when calculating environment variables: https://github.com/apache/spark/blob/3a07eff5af601511e97a05e6fea0e3d48f74c4f0/dev/run-tests.py#L554-L561 ## How was this patch tested? Manual tests with Jenkins in https://github.com/apache/spark/pull/20126. **Before** - https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/85559/consoleFull ``` [info] Setup the following environment variables for tests: ... ``` **After** - https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder/85560/consoleFull ``` [info] Setup the following environment variables for tests: ENABLE_KAFKA_0_8_TESTS=1 ENABLE_FLUME_TESTS=1 ... ``` Author: hyukjinkwon Closes #20128 from HyukjinKwon/SPARK-21893. --- dev/sparktestsupport/modules.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 44f990ec8a5ac..f834563da9dda 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -418,6 +418,10 @@ def __hash__(self): source_file_regexes=[ "python/pyspark/streaming" ], + environ={ + "ENABLE_FLUME_TESTS": "1", + "ENABLE_KAFKA_0_8_TESTS": "1" + }, python_test_goals=[ "pyspark.streaming.util", "pyspark.streaming.tests", From e0c090f227e9b64e595b47d4d1f96f8a2fff5bf7 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 2 Jan 2018 09:19:18 +0800 Subject: [PATCH 247/356] [SPARK-22932][SQL] Refactor AnalysisContext ## What changes were proposed in this pull request? Add a `reset` function to ensure the state in `AnalysisContext ` is per-query. ## How was this patch tested? The existing test cases Author: gatorsmile Closes #20127 from gatorsmile/refactorAnalysisContext. --- .../sql/catalyst/analysis/Analyzer.scala | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 6d294d48c0ee7..35b35110e491f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -52,6 +52,7 @@ object SimpleAnalyzer extends Analyzer( /** * Provides a way to keep state during the analysis, this enables us to decouple the concerns * of analysis environment from the catalog. + * The state that is kept here is per-query. * * Note this is thread local. 
* @@ -70,6 +71,8 @@ object AnalysisContext { } def get: AnalysisContext = value.get() + def reset(): Unit = value.remove() + private def set(context: AnalysisContext): Unit = value.set(context) def withAnalysisContext[A](database: Option[String])(f: => A): A = { @@ -95,6 +98,17 @@ class Analyzer( this(catalog, conf, conf.optimizerMaxIterations) } + override def execute(plan: LogicalPlan): LogicalPlan = { + AnalysisContext.reset() + try { + executeSameContext(plan) + } finally { + AnalysisContext.reset() + } + } + + private def executeSameContext(plan: LogicalPlan): LogicalPlan = super.execute(plan) + def resolver: Resolver = conf.resolver protected val fixedPoint = FixedPoint(maxIterations) @@ -176,7 +190,7 @@ class Analyzer( case With(child, relations) => substituteCTE(child, relations.foldLeft(Seq.empty[(String, LogicalPlan)]) { case (resolved, (name, relation)) => - resolved :+ name -> execute(substituteCTE(relation, resolved)) + resolved :+ name -> executeSameContext(substituteCTE(relation, resolved)) }) case other => other } @@ -600,7 +614,7 @@ class Analyzer( "avoid errors. Increase the value of spark.sql.view.maxNestedViewDepth to work " + "aroud this.") } - execute(child) + executeSameContext(child) } view.copy(child = newChild) case p @ SubqueryAlias(_, view: View) => @@ -1269,7 +1283,7 @@ class Analyzer( do { // Try to resolve the subquery plan using the regular analyzer. previous = current - current = execute(current) + current = executeSameContext(current) // Use the outer references to resolve the subquery plan if it isn't resolved yet. val i = plans.iterator @@ -1392,7 +1406,7 @@ class Analyzer( grouping, Alias(cond, "havingCondition")() :: Nil, child) - val resolvedOperator = execute(aggregatedCondition) + val resolvedOperator = executeSameContext(aggregatedCondition) def resolvedAggregateFilter = resolvedOperator .asInstanceOf[Aggregate] @@ -1450,7 +1464,8 @@ class Analyzer( val aliasedOrdering = unresolvedSortOrders.map(o => Alias(o.child, "aggOrder")()) val aggregatedOrdering = aggregate.copy(aggregateExpressions = aliasedOrdering) - val resolvedAggregate: Aggregate = execute(aggregatedOrdering).asInstanceOf[Aggregate] + val resolvedAggregate: Aggregate = + executeSameContext(aggregatedOrdering).asInstanceOf[Aggregate] val resolvedAliasedOrdering: Seq[Alias] = resolvedAggregate.aggregateExpressions.asInstanceOf[Seq[Alias]] From a6fc300e91273230e7134ac6db95ccb4436c6f8f Mon Sep 17 00:00:00 2001 From: Xianjin YE Date: Tue, 2 Jan 2018 23:30:38 +0800 Subject: [PATCH 248/356] [SPARK-22897][CORE] Expose stageAttemptId in TaskContext ## What changes were proposed in this pull request? stageAttemptId added in TaskContext and corresponding construction modification ## How was this patch tested? Added a new test in TaskContextSuite, two cases are tested: 1. Normal case without failure 2. Exception case with resubmitted stages Link to [SPARK-22897](https://issues.apache.org/jira/browse/SPARK-22897) Author: Xianjin YE Closes #20082 from advancedxy/SPARK-22897. 
--- .../scala/org/apache/spark/TaskContext.scala | 9 +++++- .../org/apache/spark/TaskContextImpl.scala | 5 ++-- .../org/apache/spark/scheduler/Task.scala | 1 + .../spark/JavaTaskContextCompileCheck.java | 2 ++ .../scala/org/apache/spark/ShuffleSuite.scala | 6 ++-- .../spark/memory/MemoryTestingUtils.scala | 1 + .../spark/scheduler/TaskContextSuite.scala | 29 +++++++++++++++++-- .../spark/storage/BlockInfoManagerSuite.scala | 2 +- project/MimaExcludes.scala | 3 ++ .../UnsafeFixedWidthAggregationMapSuite.scala | 1 + .../UnsafeKVExternalSorterSuite.scala | 1 + .../execution/UnsafeRowSerializerSuite.scala | 2 +- .../SortBasedAggregationStoreSuite.scala | 3 +- 13 files changed, 54 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/TaskContext.scala b/core/src/main/scala/org/apache/spark/TaskContext.scala index 0b87cd503d4fa..69739745aa6cf 100644 --- a/core/src/main/scala/org/apache/spark/TaskContext.scala +++ b/core/src/main/scala/org/apache/spark/TaskContext.scala @@ -66,7 +66,7 @@ object TaskContext { * An empty task context that does not represent an actual task. This is only used in tests. */ private[spark] def empty(): TaskContextImpl = { - new TaskContextImpl(0, 0, 0, 0, null, new Properties, null) + new TaskContextImpl(0, 0, 0, 0, 0, null, new Properties, null) } } @@ -150,6 +150,13 @@ abstract class TaskContext extends Serializable { */ def stageId(): Int + /** + * How many times the stage that this task belongs to has been attempted. The first stage attempt + * will be assigned stageAttemptNumber = 0, and subsequent attempts will have increasing attempt + * numbers. + */ + def stageAttemptNumber(): Int + /** * The ID of the RDD partition that is computed by this task. */ diff --git a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala index 01d8973e1bb06..cccd3ea457ba4 100644 --- a/core/src/main/scala/org/apache/spark/TaskContextImpl.scala +++ b/core/src/main/scala/org/apache/spark/TaskContextImpl.scala @@ -41,8 +41,9 @@ import org.apache.spark.util._ * `TaskMetrics` & `MetricsSystem` objects are not thread safe. 
*/ private[spark] class TaskContextImpl( - val stageId: Int, - val partitionId: Int, + override val stageId: Int, + override val stageAttemptNumber: Int, + override val partitionId: Int, override val taskAttemptId: Long, override val attemptNumber: Int, override val taskMemoryManager: TaskMemoryManager, diff --git a/core/src/main/scala/org/apache/spark/scheduler/Task.scala b/core/src/main/scala/org/apache/spark/scheduler/Task.scala index 7767ef1803a06..f536fc2a5f0a1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/Task.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/Task.scala @@ -79,6 +79,7 @@ private[spark] abstract class Task[T]( SparkEnv.get.blockManager.registerTask(taskAttemptId) context = new TaskContextImpl( stageId, + stageAttemptId, // stageAttemptId and stageAttemptNumber are semantically equal partitionId, taskAttemptId, attemptNumber, diff --git a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java index 94f5805853e1e..f8e233a05a447 100644 --- a/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java +++ b/core/src/test/java/test/org/apache/spark/JavaTaskContextCompileCheck.java @@ -38,6 +38,7 @@ public static void test() { tc.attemptNumber(); tc.partitionId(); tc.stageId(); + tc.stageAttemptNumber(); tc.taskAttemptId(); } @@ -51,6 +52,7 @@ public void onTaskCompletion(TaskContext context) { context.isCompleted(); context.isInterrupted(); context.stageId(); + context.stageAttemptNumber(); context.partitionId(); context.addTaskCompletionListener(this); } diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index 3931d53b4ae0a..ced5a06516f75 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -363,14 +363,14 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC // first attempt -- its successful val writer1 = manager.getWriter[Int, Int](shuffleHandle, 0, - new TaskContextImpl(0, 0, 0L, 0, taskMemoryManager, new Properties, metricsSystem)) + new TaskContextImpl(0, 0, 0, 0L, 0, taskMemoryManager, new Properties, metricsSystem)) val data1 = (1 to 10).map { x => x -> x} // second attempt -- also successful. We'll write out different data, // just to simulate the fact that the records may get written differently // depending on what gets spilled, what gets combined, etc. 
val writer2 = manager.getWriter[Int, Int](shuffleHandle, 0, - new TaskContextImpl(0, 0, 1L, 0, taskMemoryManager, new Properties, metricsSystem)) + new TaskContextImpl(0, 0, 0, 1L, 0, taskMemoryManager, new Properties, metricsSystem)) val data2 = (11 to 20).map { x => x -> x} // interleave writes of both attempts -- we want to test that both attempts can occur @@ -398,7 +398,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC } val reader = manager.getReader[Int, Int](shuffleHandle, 0, 1, - new TaskContextImpl(1, 0, 2L, 0, taskMemoryManager, new Properties, metricsSystem)) + new TaskContextImpl(1, 0, 0, 2L, 0, taskMemoryManager, new Properties, metricsSystem)) val readData = reader.read().toIndexedSeq assert(readData === data1.toIndexedSeq || readData === data2.toIndexedSeq) diff --git a/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala b/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala index 362cd861cc248..dcf89e4f75acf 100644 --- a/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala +++ b/core/src/test/scala/org/apache/spark/memory/MemoryTestingUtils.scala @@ -29,6 +29,7 @@ object MemoryTestingUtils { val taskMemoryManager = new TaskMemoryManager(env.memoryManager, 0) new TaskContextImpl( stageId = 0, + stageAttemptNumber = 0, partitionId = 0, taskAttemptId = 0, attemptNumber = 0, diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala index a1d9085fa085d..aa9c36c0aaacb 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskContextSuite.scala @@ -29,6 +29,7 @@ import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.metrics.source.JvmSource import org.apache.spark.network.util.JavaUtils import org.apache.spark.rdd.RDD +import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.util._ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSparkContext { @@ -158,6 +159,30 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark assert(attemptIdsWithFailedTask.toSet === Set(0, 1)) } + test("TaskContext.stageAttemptNumber getter") { + sc = new SparkContext("local[1,2]", "test") + + // Check stageAttemptNumbers are 0 for initial stage + val stageAttemptNumbers = sc.parallelize(Seq(1, 2), 2).mapPartitions { _ => + Seq(TaskContext.get().stageAttemptNumber()).iterator + }.collect() + assert(stageAttemptNumbers.toSet === Set(0)) + + // Check stageAttemptNumbers that are resubmitted when tasks have FetchFailedException + val stageAttemptNumbersWithFailedStage = + sc.parallelize(Seq(1, 2, 3, 4), 4).repartition(1).mapPartitions { _ => + val stageAttemptNumber = TaskContext.get().stageAttemptNumber() + if (stageAttemptNumber < 2) { + // Throw FetchFailedException to explicitly trigger stage resubmission. A normal exception + // will only trigger task resubmission in the same stage. + throw new FetchFailedException(null, 0, 0, 0, "Fake") + } + Seq(stageAttemptNumber).iterator + }.collect() + + assert(stageAttemptNumbersWithFailedStage.toSet === Set(2)) + } + test("accumulators are updated on exception failures") { // This means use 1 core and 4 max task failures sc = new SparkContext("local[1,4]", "test") @@ -190,7 +215,7 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark // accumulator updates from it. 
val taskMetrics = TaskMetrics.empty val task = new Task[Int](0, 0, 0) { - context = new TaskContextImpl(0, 0, 0L, 0, + context = new TaskContextImpl(0, 0, 0, 0L, 0, new TaskMemoryManager(SparkEnv.get.memoryManager, 0L), new Properties, SparkEnv.get.metricsSystem, @@ -213,7 +238,7 @@ class TaskContextSuite extends SparkFunSuite with BeforeAndAfter with LocalSpark // accumulator updates from it. val taskMetrics = TaskMetrics.registered val task = new Task[Int](0, 0, 0) { - context = new TaskContextImpl(0, 0, 0L, 0, + context = new TaskContextImpl(0, 0, 0, 0L, 0, new TaskMemoryManager(SparkEnv.get.memoryManager, 0L), new Properties, SparkEnv.get.metricsSystem, diff --git a/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala index 917db766f7f11..9c0699bc981f8 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockInfoManagerSuite.scala @@ -62,7 +62,7 @@ class BlockInfoManagerSuite extends SparkFunSuite with BeforeAndAfterEach { private def withTaskId[T](taskAttemptId: Long)(block: => T): T = { try { TaskContext.setTaskContext( - new TaskContextImpl(0, 0, taskAttemptId, 0, null, new Properties, null)) + new TaskContextImpl(0, 0, 0, taskAttemptId, 0, null, new Properties, null)) block } finally { TaskContext.unset() diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 81584af6813ea..3b452f35c5ec1 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -36,6 +36,9 @@ object MimaExcludes { // Exclude rules for 2.3.x lazy val v23excludes = v22excludes ++ Seq( + // [SPARK-22897] Expose stageAttemptId in TaskContext + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.TaskContext.stageAttemptNumber"), + // SPARK-22789: Map-only continuous processing execution ProblemFilters.exclude[IncompatibleResultTypeProblem]("org.apache.spark.sql.streaming.StreamingQueryManager.startQuery$default$8"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StreamingQueryManager.startQuery$default$6"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala index 232c1beae7998..3e31d22e15c0e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeFixedWidthAggregationMapSuite.scala @@ -70,6 +70,7 @@ class UnsafeFixedWidthAggregationMapSuite TaskContext.setTaskContext(new TaskContextImpl( stageId = 0, + stageAttemptNumber = 0, partitionId = 0, taskAttemptId = Random.nextInt(10000), attemptNumber = 0, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala index 604502f2a57d0..6af9f8b77f8d3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeKVExternalSorterSuite.scala @@ -116,6 +116,7 @@ class UnsafeKVExternalSorterSuite extends SparkFunSuite with SharedSQLContext { val taskMemMgr = new TaskMemoryManager(memoryManager, 0) TaskContext.setTaskContext(new TaskContextImpl( stageId = 0, + stageAttemptNumber = 0, partitionId = 0, 
taskAttemptId = 98456, attemptNumber = 0, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index dff88ce7f1b9a..a3ae93810aa3c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -114,7 +114,7 @@ class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkContext { (i, converter(Row(i))) } val taskMemoryManager = new TaskMemoryManager(sc.env.memoryManager, 0) - val taskContext = new TaskContextImpl(0, 0, 0, 0, taskMemoryManager, new Properties, null) + val taskContext = new TaskContextImpl(0, 0, 0, 0, 0, taskMemoryManager, new Properties, null) val sorter = new ExternalSorter[Int, UnsafeRow, UnsafeRow]( taskContext, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala index 10f1ee279bedf..3fad7dfddadcc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationStoreSuite.scala @@ -35,7 +35,8 @@ class SortBasedAggregationStoreSuite extends SparkFunSuite with LocalSparkConte val conf = new SparkConf() sc = new SparkContext("local[2, 4]", "test", conf) val taskManager = new TaskMemoryManager(new TestMemoryManager(conf), 0) - TaskContext.setTaskContext(new TaskContextImpl(0, 0, 0, 0, taskManager, new Properties, null)) + TaskContext.setTaskContext( + new TaskContextImpl(0, 0, 0, 0, 0, taskManager, new Properties, null)) } override def afterAll(): Unit = TaskContext.unset() From 247a08939d58405aef39b2a4e7773aa45474ad12 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Wed, 3 Jan 2018 21:40:51 +0800 Subject: [PATCH 249/356] [SPARK-22938] Assert that SQLConf.get is accessed only on the driver. ## What changes were proposed in this pull request? Assert if code tries to access SQLConf.get on executor. This can lead to hard to detect bugs, where the executor will read fallbackConf, falling back to default config values, ignoring potentially changed non-default configs. If a config is to be passed to executor code, it needs to be read on the driver, and passed explicitly. ## How was this patch tested? Check in existing tests. Author: Juliusz Sompolski Closes #20136 from juliuszsompolski/SPARK-22938. 
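For illustration, a minimal sketch (not part of this patch) of the pattern the assertion enforces: read the config once on the driver and capture the plain value in the closure, rather than calling `SQLConf.get` from code that runs on executors. The app name and the `caseSensitiveAnalysis` accessor are chosen only for this sketch.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

object SQLConfOnDriverSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("sqlconf-on-driver-sketch").getOrCreate()

    // Anti-pattern: SQLConf.get inside the closure executes on executors, where it
    // would read fallbackConf and silently fall back to default config values.
    // spark.range(100).rdd.map(_ => SQLConf.get.caseSensitiveAnalysis).collect()

    // Recommended: read the config on the driver and pass the value explicitly;
    // the closure then captures a plain Boolean, not the conf object.
    val caseSensitive: Boolean = SQLConf.get.caseSensitiveAnalysis
    val seenByTasks = spark.range(100).rdd.map(_ => caseSensitive).distinct().collect()
    println(s"Executors saw caseSensitiveAnalysis = ${seenByTasks.mkString(", ")}")

    spark.stop()
  }
}
```

With this change, the commented-out variant would trip the new driver-only assertion when `Utils.isTesting` is set, instead of silently reading defaults.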
--- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 4f77c54a7af57..80cdc61484c0f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -27,11 +27,13 @@ import scala.util.matching.Regex import org.apache.hadoop.fs.Path +import org.apache.spark.{SparkContext, SparkEnv} import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.network.util.ByteUnit import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator +import org.apache.spark.util.Utils //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines the configuration options for Spark SQL. @@ -70,7 +72,7 @@ object SQLConf { * Default config. Only used when there is no active SparkSession for the thread. * See [[get]] for more information. */ - private val fallbackConf = new ThreadLocal[SQLConf] { + private lazy val fallbackConf = new ThreadLocal[SQLConf] { override def initialValue: SQLConf = new SQLConf } @@ -1087,6 +1089,12 @@ object SQLConf { class SQLConf extends Serializable with Logging { import SQLConf._ + if (Utils.isTesting && SparkEnv.get != null) { + // assert that we're only accessing it on the driver. + assert(SparkEnv.get.executorId == SparkContext.DRIVER_IDENTIFIER, + "SQLConf should only be created and accessed on the driver.") + } + /** Only low degree of contention is expected for conf, thus NOT using ConcurrentHashMap. */ @transient protected[spark] val settings = java.util.Collections.synchronizedMap( new java.util.HashMap[String, String]()) From 1a87a1609c4d2c9027a2cf669ea3337b89f61fb6 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Wed, 3 Jan 2018 22:09:30 +0800 Subject: [PATCH 250/356] [SPARK-22934][SQL] Make optional clauses order insensitive for CREATE TABLE SQL statement ## What changes were proposed in this pull request? Currently, our CREATE TABLE syntax require the EXACT order of clauses. It is pretty hard to remember the exact order. Thus, this PR is to make optional clauses order insensitive for `CREATE TABLE` SQL statement. ``` CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db_name.]table_name [(col_name1 col_type1 [COMMENT col_comment1], ...)] USING datasource [OPTIONS (key1=val1, key2=val2, ...)] [PARTITIONED BY (col_name1, col_name2, ...)] [CLUSTERED BY (col_name3, col_name4, ...) INTO num_buckets BUCKETS] [LOCATION path] [COMMENT table_comment] [TBLPROPERTIES (key1=val1, key2=val2, ...)] [AS select_statement] ``` The proposal is to make the following clauses order insensitive. ``` [OPTIONS (key1=val1, key2=val2, ...)] [PARTITIONED BY (col_name1, col_name2, ...)] [CLUSTERED BY (col_name3, col_name4, ...) INTO num_buckets BUCKETS] [LOCATION path] [COMMENT table_comment] [TBLPROPERTIES (key1=val1, key2=val2, ...)] ``` The same idea is also applicable to Create Hive Table. 
``` CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name [(col_name1[:] col_type1 [COMMENT col_comment1], ...)] [COMMENT table_comment] [PARTITIONED BY (col_name2[:] col_type2 [COMMENT col_comment2], ...)] [ROW FORMAT row_format] [STORED AS file_format] [LOCATION path] [TBLPROPERTIES (key1=val1, key2=val2, ...)] [AS select_statement] ``` The proposal is to make the following clauses order insensitive. ``` [COMMENT table_comment] [PARTITIONED BY (col_name2[:] col_type2 [COMMENT col_comment2], ...)] [ROW FORMAT row_format] [STORED AS file_format] [LOCATION path] [TBLPROPERTIES (key1=val1, key2=val2, ...)] ``` ## How was this patch tested? Added test cases Author: gatorsmile Closes #20133 from gatorsmile/createDataSourceTableDDL. --- .../spark/sql/catalyst/parser/SqlBase.g4 | 24 +- .../sql/catalyst/parser/ParserUtils.scala | 9 + .../spark/sql/execution/SparkSqlParser.scala | 81 +++++-- .../execution/command/DDLParserSuite.scala | 220 ++++++++++++++---- .../sql/execution/command/DDLSuite.scala | 2 +- .../sql/hive/execution/HiveDDLSuite.scala | 13 +- .../sql/hive/execution/SQLQuerySuite.scala | 124 +++++----- 7 files changed, 335 insertions(+), 138 deletions(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 6fe995f650d55..6daf01d98426c 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -73,18 +73,22 @@ statement | ALTER DATABASE identifier SET DBPROPERTIES tablePropertyList #setDatabaseProperties | DROP DATABASE (IF EXISTS)? identifier (RESTRICT | CASCADE)? #dropDatabase | createTableHeader ('(' colTypeList ')')? tableProvider - (OPTIONS options=tablePropertyList)? - (PARTITIONED BY partitionColumnNames=identifierList)? - bucketSpec? locationSpec? - (COMMENT comment=STRING)? - (TBLPROPERTIES tableProps=tablePropertyList)? + ((OPTIONS options=tablePropertyList) | + (PARTITIONED BY partitionColumnNames=identifierList) | + bucketSpec | + locationSpec | + (COMMENT comment=STRING) | + (TBLPROPERTIES tableProps=tablePropertyList))* (AS? query)? #createTable | createTableHeader ('(' columns=colTypeList ')')? - (COMMENT comment=STRING)? - (PARTITIONED BY '(' partitionColumns=colTypeList ')')? - bucketSpec? skewSpec? - rowFormat? createFileFormat? locationSpec? - (TBLPROPERTIES tablePropertyList)? + ((COMMENT comment=STRING) | + (PARTITIONED BY '(' partitionColumns=colTypeList ')') | + bucketSpec | + skewSpec | + rowFormat | + createFileFormat | + locationSpec | + (TBLPROPERTIES tableProps=tablePropertyList))* (AS? query)? #createHiveTable | CREATE TABLE (IF NOT EXISTS)? target=tableIdentifier LIKE source=tableIdentifier locationSpec? 
#createTableLike diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala index 9b127f91648e6..89347f4b1f7bf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/ParserUtils.scala @@ -16,6 +16,8 @@ */ package org.apache.spark.sql.catalyst.parser +import java.util + import scala.collection.mutable.StringBuilder import org.antlr.v4.runtime.{ParserRuleContext, Token} @@ -39,6 +41,13 @@ object ParserUtils { throw new ParseException(s"Operation not allowed: $message", ctx) } + def checkDuplicateClauses[T]( + nodes: util.List[T], clauseName: String, ctx: ParserRuleContext): Unit = { + if (nodes.size() > 1) { + throw new ParseException(s"Found duplicate clauses: $clauseName", ctx) + } + } + /** Check if duplicate keys exist in a set of key-value pairs. */ def checkDuplicateKeys[T](keyPairs: Seq[(String, T)], ctx: ParserRuleContext): Unit = { keyPairs.groupBy(_._1).filter(_._2.size > 1).foreach { case (key, _) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 29b584b55972c..d3cfd2a1ffbf2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -383,16 +383,19 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * {{{ * CREATE [TEMPORARY] TABLE [IF NOT EXISTS] [db_name.]table_name * USING table_provider - * [OPTIONS table_property_list] - * [PARTITIONED BY (col_name, col_name, ...)] - * [CLUSTERED BY (col_name, col_name, ...) - * [SORTED BY (col_name [ASC|DESC], ...)] - * INTO num_buckets BUCKETS - * ] - * [LOCATION path] - * [COMMENT table_comment] - * [TBLPROPERTIES (property_name=property_value, ...)] + * create_table_clauses * [[AS] select_statement]; + * + * create_table_clauses (order insensitive): + * [OPTIONS table_property_list] + * [PARTITIONED BY (col_name, col_name, ...)] + * [CLUSTERED BY (col_name, col_name, ...) + * [SORTED BY (col_name [ASC|DESC], ...)] + * INTO num_buckets BUCKETS + * ] + * [LOCATION path] + * [COMMENT table_comment] + * [TBLPROPERTIES (property_name=property_value, ...)] * }}} */ override def visitCreateTable(ctx: CreateTableContext): LogicalPlan = withOrigin(ctx) { @@ -400,6 +403,14 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { if (external) { operationNotAllowed("CREATE EXTERNAL TABLE ... 
USING", ctx) } + + checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) + checkDuplicateClauses(ctx.OPTIONS, "OPTIONS", ctx) + checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx) + checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx) + checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) + checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) + val options = Option(ctx.options).map(visitPropertyKeyValues).getOrElse(Map.empty) val provider = ctx.tableProvider.qualifiedName.getText val schema = Option(ctx.colTypeList()).map(createSchema) @@ -408,9 +419,9 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { .map(visitIdentifierList(_).toArray) .getOrElse(Array.empty[String]) val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) - val bucketSpec = Option(ctx.bucketSpec()).map(visitBucketSpec) + val bucketSpec = ctx.bucketSpec().asScala.headOption.map(visitBucketSpec) - val location = Option(ctx.locationSpec).map(visitLocationSpec) + val location = ctx.locationSpec.asScala.headOption.map(visitLocationSpec) val storage = DataSource.buildStorageFormatFromOptions(options) if (location.isDefined && storage.locationUri.isDefined) { @@ -1087,13 +1098,16 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { * {{{ * CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name * [(col1[:] data_type [COMMENT col_comment], ...)] - * [COMMENT table_comment] - * [PARTITIONED BY (col2[:] data_type [COMMENT col_comment], ...)] - * [ROW FORMAT row_format] - * [STORED AS file_format] - * [LOCATION path] - * [TBLPROPERTIES (property_name=property_value, ...)] + * create_table_clauses * [AS select_statement]; + * + * create_table_clauses (order insensitive): + * [COMMENT table_comment] + * [PARTITIONED BY (col2[:] data_type [COMMENT col_comment], ...)] + * [ROW FORMAT row_format] + * [STORED AS file_format] + * [LOCATION path] + * [TBLPROPERTIES (property_name=property_value, ...)] * }}} */ override def visitCreateHiveTable(ctx: CreateHiveTableContext): LogicalPlan = withOrigin(ctx) { @@ -1104,15 +1118,23 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { "CREATE TEMPORARY TABLE is not supported yet. " + "Please use CREATE TEMPORARY VIEW as an alternative.", ctx) } - if (ctx.skewSpec != null) { + if (ctx.skewSpec.size > 0) { operationNotAllowed("CREATE TABLE ... 
SKEWED BY", ctx) } + checkDuplicateClauses(ctx.TBLPROPERTIES, "TBLPROPERTIES", ctx) + checkDuplicateClauses(ctx.PARTITIONED, "PARTITIONED BY", ctx) + checkDuplicateClauses(ctx.COMMENT, "COMMENT", ctx) + checkDuplicateClauses(ctx.bucketSpec(), "CLUSTERED BY", ctx) + checkDuplicateClauses(ctx.createFileFormat, "STORED AS/BY", ctx) + checkDuplicateClauses(ctx.rowFormat, "ROW FORMAT", ctx) + checkDuplicateClauses(ctx.locationSpec, "LOCATION", ctx) + val dataCols = Option(ctx.columns).map(visitColTypeList).getOrElse(Nil) val partitionCols = Option(ctx.partitionColumns).map(visitColTypeList).getOrElse(Nil) - val properties = Option(ctx.tablePropertyList).map(visitPropertyKeyValues).getOrElse(Map.empty) + val properties = Option(ctx.tableProps).map(visitPropertyKeyValues).getOrElse(Map.empty) val selectQuery = Option(ctx.query).map(plan) - val bucketSpec = Option(ctx.bucketSpec()).map(visitBucketSpec) + val bucketSpec = ctx.bucketSpec().asScala.headOption.map(visitBucketSpec) // Note: Hive requires partition columns to be distinct from the schema, so we need // to include the partition columns here explicitly @@ -1120,12 +1142,12 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { // Storage format val defaultStorage = HiveSerDe.getDefaultStorage(conf) - validateRowFormatFileFormat(ctx.rowFormat, ctx.createFileFormat, ctx) - val fileStorage = Option(ctx.createFileFormat).map(visitCreateFileFormat) + validateRowFormatFileFormat(ctx.rowFormat.asScala, ctx.createFileFormat.asScala, ctx) + val fileStorage = ctx.createFileFormat.asScala.headOption.map(visitCreateFileFormat) .getOrElse(CatalogStorageFormat.empty) - val rowStorage = Option(ctx.rowFormat).map(visitRowFormat) + val rowStorage = ctx.rowFormat.asScala.headOption.map(visitRowFormat) .getOrElse(CatalogStorageFormat.empty) - val location = Option(ctx.locationSpec).map(visitLocationSpec) + val location = ctx.locationSpec.asScala.headOption.map(visitLocationSpec) // If we are creating an EXTERNAL table, then the LOCATION field is required if (external && location.isEmpty) { operationNotAllowed("CREATE EXTERNAL TABLE must be accompanied by LOCATION", ctx) @@ -1180,7 +1202,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { ctx) } - val hasStorageProperties = (ctx.createFileFormat != null) || (ctx.rowFormat != null) + val hasStorageProperties = (ctx.createFileFormat.size != 0) || (ctx.rowFormat.size != 0) if (conf.convertCTAS && !hasStorageProperties) { // At here, both rowStorage.serdeProperties and fileStorage.serdeProperties // are empty Maps. @@ -1366,6 +1388,15 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder(conf) { } } + private def validateRowFormatFileFormat( + rowFormatCtx: Seq[RowFormatContext], + createFileFormatCtx: Seq[CreateFileFormatContext], + parentCtx: ParserRuleContext): Unit = { + if (rowFormatCtx.size == 1 && createFileFormatCtx.size == 1) { + validateRowFormatFileFormat(rowFormatCtx.head, createFileFormatCtx.head, parentCtx) + } + } + /** * Create or replace a view. This creates a [[CreateViewCommand]] command. 
* diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala index eb7c33590b602..2b1aea08b1223 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLParserSuite.scala @@ -54,6 +54,13 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { } } + private def intercept(sqlCommand: String, messages: String*): Unit = { + val e = intercept[ParseException](parser.parsePlan(sqlCommand)).getMessage + messages.foreach { message => + assert(e.contains(message)) + } + } + private def parseAs[T: ClassTag](query: String): T = { parser.parsePlan(query) match { case t: T => t @@ -494,6 +501,37 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { } } + test("Duplicate clauses - create table") { + def createTableHeader(duplicateClause: String, isNative: Boolean): String = { + val fileFormat = if (isNative) "USING parquet" else "STORED AS parquet" + s"CREATE TABLE my_tab(a INT, b STRING) $fileFormat $duplicateClause $duplicateClause" + } + + Seq(true, false).foreach { isNative => + intercept(createTableHeader("TBLPROPERTIES('test' = 'test2')", isNative), + "Found duplicate clauses: TBLPROPERTIES") + intercept(createTableHeader("LOCATION '/tmp/file'", isNative), + "Found duplicate clauses: LOCATION") + intercept(createTableHeader("COMMENT 'a table'", isNative), + "Found duplicate clauses: COMMENT") + intercept(createTableHeader("CLUSTERED BY(b) INTO 256 BUCKETS", isNative), + "Found duplicate clauses: CLUSTERED BY") + } + + // Only for native data source tables + intercept(createTableHeader("PARTITIONED BY (b)", isNative = true), + "Found duplicate clauses: PARTITIONED BY") + + // Only for Hive serde tables + intercept(createTableHeader("PARTITIONED BY (k int)", isNative = false), + "Found duplicate clauses: PARTITIONED BY") + intercept(createTableHeader("STORED AS parquet", isNative = false), + "Found duplicate clauses: STORED AS/BY") + intercept( + createTableHeader("ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe'", isNative = false), + "Found duplicate clauses: ROW FORMAT") + } + test("create table - with location") { val v1 = "CREATE TABLE my_tab(a INT, b STRING) USING parquet LOCATION '/tmp/file'" @@ -1153,38 +1191,119 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { } } + test("Test CTAS against data source tables") { + val s1 = + """ + |CREATE TABLE IF NOT EXISTS mydb.page_view + |USING parquet + |COMMENT 'This is the staging page view table' + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + val s2 = + """ + |CREATE TABLE IF NOT EXISTS mydb.page_view + |USING parquet + |LOCATION '/user/external/page_view' + |COMMENT 'This is the staging page view table' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + val s3 = + """ + |CREATE TABLE IF NOT EXISTS mydb.page_view + |USING parquet + |COMMENT 'This is the staging page view table' + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + checkParsing(s3) + + def checkParsing(sql: String): Unit = { + val (desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + 
assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + assert(desc.comment == Some("This is the staging page view table")) + assert(desc.viewText.isEmpty) + assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.provider == Some("parquet")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } + } + test("Test CTAS #1") { val s1 = - """CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view |COMMENT 'This is the staging page view table' |STORED AS RCFILE |LOCATION '/user/external/page_view' |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |AS SELECT * FROM src""".stripMargin + |AS SELECT * FROM src + """.stripMargin - val (desc, exists) = extractTableDesc(s1) - assert(exists) - assert(desc.identifier.database == Some("mydb")) - assert(desc.identifier.table == "page_view") - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) - assert(desc.schema.isEmpty) // will be populated later when the table is actually created - assert(desc.comment == Some("This is the staging page view table")) - // TODO will be SQLText - assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.partitionColumnNames.isEmpty) - assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) - assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) - assert(desc.storage.serde == - Some("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) - assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + val s2 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |STORED AS RCFILE + |COMMENT 'This is the staging page view table' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |LOCATION '/user/external/page_view' + |AS SELECT * FROM src + """.stripMargin + + val s3 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |LOCATION '/user/external/page_view' + |STORED AS RCFILE + |COMMENT 'This is the staging page view table' + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + checkParsing(s3) + + def checkParsing(sql: String): Unit = { + val (desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + assert(desc.comment == Some("This is the staging page view table")) + // TODO will be SQLText + assert(desc.viewText.isEmpty) + assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.storage.inputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileInputFormat")) + assert(desc.storage.outputFormat == Some("org.apache.hadoop.hive.ql.io.RCFileOutputFormat")) + assert(desc.storage.serde == + Some("org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } } test("Test CTAS #2") { - val s2 = - 
"""CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + val s1 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view |COMMENT 'This is the staging page view table' |ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' | STORED AS @@ -1192,26 +1311,45 @@ class DDLParserSuite extends PlanTest with SharedSQLContext { | OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' |LOCATION '/user/external/page_view' |TBLPROPERTIES ('p1'='v1', 'p2'='v2') - |AS SELECT * FROM src""".stripMargin + |AS SELECT * FROM src + """.stripMargin - val (desc, exists) = extractTableDesc(s2) - assert(exists) - assert(desc.identifier.database == Some("mydb")) - assert(desc.identifier.table == "page_view") - assert(desc.tableType == CatalogTableType.EXTERNAL) - assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) - assert(desc.schema.isEmpty) // will be populated later when the table is actually created - // TODO will be SQLText - assert(desc.comment == Some("This is the staging page view table")) - assert(desc.viewText.isEmpty) - assert(desc.viewDefaultDatabase.isEmpty) - assert(desc.viewQueryColumnNames.isEmpty) - assert(desc.partitionColumnNames.isEmpty) - assert(desc.storage.properties == Map()) - assert(desc.storage.inputFormat == Some("parquet.hive.DeprecatedParquetInputFormat")) - assert(desc.storage.outputFormat == Some("parquet.hive.DeprecatedParquetOutputFormat")) - assert(desc.storage.serde == Some("parquet.hive.serde.ParquetHiveSerDe")) - assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + val s2 = + """ + |CREATE EXTERNAL TABLE IF NOT EXISTS mydb.page_view + |LOCATION '/user/external/page_view' + |TBLPROPERTIES ('p1'='v1', 'p2'='v2') + |ROW FORMAT SERDE 'parquet.hive.serde.ParquetHiveSerDe' + | STORED AS + | INPUTFORMAT 'parquet.hive.DeprecatedParquetInputFormat' + | OUTPUTFORMAT 'parquet.hive.DeprecatedParquetOutputFormat' + |COMMENT 'This is the staging page view table' + |AS SELECT * FROM src + """.stripMargin + + checkParsing(s1) + checkParsing(s2) + + def checkParsing(sql: String): Unit = { + val (desc, exists) = extractTableDesc(sql) + assert(exists) + assert(desc.identifier.database == Some("mydb")) + assert(desc.identifier.table == "page_view") + assert(desc.tableType == CatalogTableType.EXTERNAL) + assert(desc.storage.locationUri == Some(new URI("/user/external/page_view"))) + assert(desc.schema.isEmpty) // will be populated later when the table is actually created + // TODO will be SQLText + assert(desc.comment == Some("This is the staging page view table")) + assert(desc.viewText.isEmpty) + assert(desc.viewDefaultDatabase.isEmpty) + assert(desc.viewQueryColumnNames.isEmpty) + assert(desc.partitionColumnNames.isEmpty) + assert(desc.storage.properties == Map()) + assert(desc.storage.inputFormat == Some("parquet.hive.DeprecatedParquetInputFormat")) + assert(desc.storage.outputFormat == Some("parquet.hive.DeprecatedParquetOutputFormat")) + assert(desc.storage.serde == Some("parquet.hive.serde.ParquetHiveSerDe")) + assert(desc.properties == Map("p1" -> "v1", "p2" -> "v2")) + } } test("Test CTAS #3") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index fdb9b2f51f9cb..591510c1d8283 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -1971,8 +1971,8 @@ abstract class DDLSuite extends QueryTest with 
SQLTestUtils { s""" |CREATE TABLE t(a int, b int, c int, d int) |USING parquet - |PARTITIONED BY(a, b) |LOCATION "${dir.toURI}" + |PARTITIONED BY(a, b) """.stripMargin) spark.sql("INSERT INTO TABLE t PARTITION(a=1, b=2) SELECT 3, 4") checkAnswer(spark.table("t"), Row(3, 4, 1, 2) :: Nil) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index f2e0c695ca38b..65be244418670 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -875,12 +875,13 @@ class HiveDDLSuite test("desc table for Hive table - bucketed + sorted table") { withTable("tbl") { - sql(s""" - CREATE TABLE tbl (id int, name string) - PARTITIONED BY (ds string) - CLUSTERED BY(id) - SORTED BY(id, name) INTO 1024 BUCKETS - """) + sql( + s""" + |CREATE TABLE tbl (id int, name string) + |CLUSTERED BY(id) + |SORTED BY(id, name) INTO 1024 BUCKETS + |PARTITIONED BY (ds string) + """.stripMargin) val x = sql("DESC FORMATTED tbl").collect() assert(x.containsSlice( diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 07ae3ae945848..47adc77a52d51 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -461,51 +461,55 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("CTAS without serde without location") { - val originalConf = sessionState.conf.convertCTAS - - setConf(SQLConf.CONVERT_CTAS, true) - - val defaultDataSource = sessionState.conf.defaultDataSourceName - try { - sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - val message = intercept[AnalysisException] { + withSQLConf(SQLConf.CONVERT_CTAS.key -> "true") { + val defaultDataSource = sessionState.conf.defaultDataSourceName + withTable("ctas1") { sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - }.getMessage - assert(message.contains("already exists")) - checkRelation("ctas1", true, defaultDataSource) - sql("DROP TABLE ctas1") + sql("CREATE TABLE IF NOT EXISTS ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + val message = intercept[AnalysisException] { + sql("CREATE TABLE ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + }.getMessage + assert(message.contains("already exists")) + checkRelation("ctas1", isDataSourceTable = true, defaultDataSource) + } // Specifying database name for query can be converted to data source write path // is not allowed right now. 
- sql("CREATE TABLE default.ctas1 AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true, defaultDataSource) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE default.ctas1 AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = true, defaultDataSource) + } - sql("CREATE TABLE ctas1 stored as textfile" + + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as textfile" + " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "text") - sql("DROP TABLE ctas1") + checkRelation("ctas1", isDataSourceTable = false, "text") + } - sql("CREATE TABLE ctas1 stored as sequencefile" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "sequence") - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as sequencefile" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = false, "sequence") + } - sql("CREATE TABLE ctas1 stored as rcfile AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "rcfile") - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as rcfile AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = false, "rcfile") + } - sql("CREATE TABLE ctas1 stored as orc AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "orc") - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql("CREATE TABLE ctas1 stored as orc AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation("ctas1", isDataSourceTable = false, "orc") + } - sql("CREATE TABLE ctas1 stored as parquet AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "parquet") - sql("DROP TABLE ctas1") - } finally { - setConf(SQLConf.CONVERT_CTAS, originalConf) - sql("DROP TABLE IF EXISTS ctas1") + withTable("ctas1") { + sql( + """ + |CREATE TABLE ctas1 stored as parquet + |AS SELECT key k, value FROM src ORDER BY k, value + """.stripMargin) + checkRelation("ctas1", isDataSourceTable = false, "parquet") + } } } @@ -539,30 +543,40 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { val defaultDataSource = sessionState.conf.defaultDataSourceName val tempLocation = dir.toURI.getPath.stripSuffix("/") - sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c1'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true, defaultDataSource, Some(s"file:$tempLocation/c1")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c1'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = true, defaultDataSource, Some(s"file:$tempLocation/c1")) + } - sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c2'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", true, defaultDataSource, Some(s"file:$tempLocation/c2")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 LOCATION 'file:$tempLocation/c2'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = true, defaultDataSource, Some(s"file:$tempLocation/c2")) + } - sql(s"CREATE TABLE ctas1 stored as textfile LOCATION 'file:$tempLocation/c3'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "text", 
Some(s"file:$tempLocation/c3")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 stored as textfile LOCATION 'file:$tempLocation/c3'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = false, "text", Some(s"file:$tempLocation/c3")) + } - sql(s"CREATE TABLE ctas1 stored as sequenceFile LOCATION 'file:$tempLocation/c4'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "sequence", Some(s"file:$tempLocation/c4")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 stored as sequenceFile LOCATION 'file:$tempLocation/c4'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = false, "sequence", Some(s"file:$tempLocation/c4")) + } - sql(s"CREATE TABLE ctas1 stored as rcfile LOCATION 'file:$tempLocation/c5'" + - " AS SELECT key k, value FROM src ORDER BY k, value") - checkRelation("ctas1", false, "rcfile", Some(s"file:$tempLocation/c5")) - sql("DROP TABLE ctas1") + withTable("ctas1") { + sql(s"CREATE TABLE ctas1 stored as rcfile LOCATION 'file:$tempLocation/c5'" + + " AS SELECT key k, value FROM src ORDER BY k, value") + checkRelation( + "ctas1", isDataSourceTable = false, "rcfile", Some(s"file:$tempLocation/c5")) + } } } } From a66fe36cee9363b01ee70e469f1c968f633c5713 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 3 Jan 2018 22:18:13 +0800 Subject: [PATCH 251/356] [SPARK-20236][SQL] dynamic partition overwrite ## What changes were proposed in this pull request? When overwriting a partitioned table with dynamic partition columns, the behavior is different between data source and hive tables. data source table: delete all partition directories that match the static partition values provided in the insert statement. hive table: only delete partition directories which have data written into it This PR adds a new config to make users be able to choose hive's behavior. ## How was this patch tested? new tests Author: Wenchen Fan Closes #18714 from cloud-fan/overwrite-partition. --- .../internal/io/FileCommitProtocol.scala | 25 ++++-- .../io/HadoopMapReduceCommitProtocol.scala | 75 ++++++++++++++---- .../apache/spark/sql/internal/SQLConf.scala | 21 +++++ .../InsertIntoHadoopFsRelationCommand.scala | 20 ++++- .../SQLHadoopMapReduceCommitProtocol.scala | 10 ++- .../spark/sql/sources/InsertSuite.scala | 78 +++++++++++++++++++ 6 files changed, 200 insertions(+), 29 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala index 50f51e1af4530..6d0059b6a0272 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/FileCommitProtocol.scala @@ -28,8 +28,9 @@ import org.apache.spark.util.Utils * * 1. Implementations must be serializable, as the committer instance instantiated on the driver * will be used for tasks on executors. - * 2. Implementations should have a constructor with 2 arguments: - * (jobId: String, path: String) + * 2. Implementations should have a constructor with 2 or 3 arguments: + * (jobId: String, path: String) or + * (jobId: String, path: String, dynamicPartitionOverwrite: Boolean) * 3. A committer should not be reused across multiple Spark jobs. 
* * The proper call sequence is: @@ -139,10 +140,22 @@ object FileCommitProtocol { /** * Instantiates a FileCommitProtocol using the given className. */ - def instantiate(className: String, jobId: String, outputPath: String) - : FileCommitProtocol = { + def instantiate( + className: String, + jobId: String, + outputPath: String, + dynamicPartitionOverwrite: Boolean = false): FileCommitProtocol = { val clazz = Utils.classForName(className).asInstanceOf[Class[FileCommitProtocol]] - val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) - ctor.newInstance(jobId, outputPath) + // First try the constructor with arguments (jobId: String, outputPath: String, + // dynamicPartitionOverwrite: Boolean). + // If that doesn't exist, try the one with (jobId: string, outputPath: String). + try { + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String], classOf[Boolean]) + ctor.newInstance(jobId, outputPath, dynamicPartitionOverwrite.asInstanceOf[java.lang.Boolean]) + } catch { + case _: NoSuchMethodException => + val ctor = clazz.getDeclaredConstructor(classOf[String], classOf[String]) + ctor.newInstance(jobId, outputPath) + } } } diff --git a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala index 95c99d29c3a9c..6d20ef1f98a3c 100644 --- a/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala +++ b/core/src/main/scala/org/apache/spark/internal/io/HadoopMapReduceCommitProtocol.scala @@ -39,8 +39,19 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil * * @param jobId the job's or stage's id * @param path the job's output path, or null if committer acts as a noop + * @param dynamicPartitionOverwrite If true, Spark will overwrite partition directories at runtime + * dynamically, i.e., we first write files under a staging + * directory with partition path, e.g. + * /path/to/staging/a=1/b=1/xxx.parquet. When committing the job, + * we first clean up the corresponding partition directories at + * destination path, e.g. /path/to/destination/a=1/b=1, and move + * files from staging directory to the corresponding partition + * directories under destination path. */ -class HadoopMapReduceCommitProtocol(jobId: String, path: String) +class HadoopMapReduceCommitProtocol( + jobId: String, + path: String, + dynamicPartitionOverwrite: Boolean = false) extends FileCommitProtocol with Serializable with Logging { import FileCommitProtocol._ @@ -67,9 +78,17 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) @transient private var addedAbsPathFiles: mutable.Map[String, String] = null /** - * The staging directory for all files committed with absolute output paths. + * Tracks partitions with default path that have new files written into them by this task, + * e.g. a=1/b=2. Files under these partitions will be saved into staging directory and moved to + * destination directory at the end, if `dynamicPartitionOverwrite` is true. */ - private def absPathStagingDir: Path = new Path(path, "_temporary-" + jobId) + @transient private var partitionPaths: mutable.Set[String] = null + + /** + * The staging directory of this write job. Spark uses it to deal with files with absolute output + * path, or writing data into partitioned directory with dynamicPartitionOverwrite=true. 
+ */ + private def stagingDir = new Path(path, ".spark-staging-" + jobId) protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val format = context.getOutputFormatClass.newInstance() @@ -85,11 +104,16 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { val filename = getFilename(taskContext, ext) - val stagingDir: String = committer match { + val stagingDir: Path = committer match { + case _ if dynamicPartitionOverwrite => + assert(dir.isDefined, + "The dataset to be written must be partitioned when dynamicPartitionOverwrite is true.") + partitionPaths += dir.get + this.stagingDir // For FileOutputCommitter it has its own staging path called "work path". case f: FileOutputCommitter => - Option(f.getWorkPath).map(_.toString).getOrElse(path) - case _ => path + new Path(Option(f.getWorkPath).map(_.toString).getOrElse(path)) + case _ => new Path(path) } dir.map { d => @@ -106,8 +130,7 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) // Include a UUID here to prevent file collisions for one task writing to different dirs. // In principle we could include hash(absoluteDir) instead but this is simpler. - val tmpOutputPath = new Path( - absPathStagingDir, UUID.randomUUID().toString() + "-" + filename).toString + val tmpOutputPath = new Path(stagingDir, UUID.randomUUID().toString() + "-" + filename).toString addedAbsPathFiles(tmpOutputPath) = absOutputPath tmpOutputPath @@ -141,23 +164,42 @@ class HadoopMapReduceCommitProtocol(jobId: String, path: String) override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { committer.commitJob(jobContext) - val filesToMove = taskCommits.map(_.obj.asInstanceOf[Map[String, String]]) - .foldLeft(Map[String, String]())(_ ++ _) - logDebug(s"Committing files staged for absolute locations $filesToMove") + if (hasValidPath) { - val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) + val (allAbsPathFiles, allPartitionPaths) = + taskCommits.map(_.obj.asInstanceOf[(Map[String, String], Set[String])]).unzip + val fs = stagingDir.getFileSystem(jobContext.getConfiguration) + + val filesToMove = allAbsPathFiles.foldLeft(Map[String, String]())(_ ++ _) + logDebug(s"Committing files staged for absolute locations $filesToMove") + if (dynamicPartitionOverwrite) { + val absPartitionPaths = filesToMove.values.map(new Path(_).getParent).toSet + logDebug(s"Clean up absolute partition directories for overwriting: $absPartitionPaths") + absPartitionPaths.foreach(fs.delete(_, true)) + } for ((src, dst) <- filesToMove) { fs.rename(new Path(src), new Path(dst)) } - fs.delete(absPathStagingDir, true) + + if (dynamicPartitionOverwrite) { + val partitionPaths = allPartitionPaths.foldLeft(Set[String]())(_ ++ _) + logDebug(s"Clean up default partition directories for overwriting: $partitionPaths") + for (part <- partitionPaths) { + val finalPartPath = new Path(path, part) + fs.delete(finalPartPath, true) + fs.rename(new Path(stagingDir, part), finalPartPath) + } + } + + fs.delete(stagingDir, true) } } override def abortJob(jobContext: JobContext): Unit = { committer.abortJob(jobContext, JobStatus.State.FAILED) if (hasValidPath) { - val fs = absPathStagingDir.getFileSystem(jobContext.getConfiguration) - fs.delete(absPathStagingDir, true) + val fs = stagingDir.getFileSystem(jobContext.getConfiguration) + fs.delete(stagingDir, true) } } @@ -165,13 +207,14 @@ class HadoopMapReduceCommitProtocol(jobId: String, 
path: String) committer = setupCommitter(taskContext) committer.setupTask(taskContext) addedAbsPathFiles = mutable.Map[String, String]() + partitionPaths = mutable.Set[String]() } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { val attemptId = taskContext.getTaskAttemptID SparkHadoopMapRedUtil.commitTask( committer, taskContext, attemptId.getJobID.getId, attemptId.getTaskID.getId) - new TaskCommitMessage(addedAbsPathFiles.toMap) + new TaskCommitMessage(addedAbsPathFiles.toMap -> partitionPaths.toSet) } override def abortTask(taskContext: TaskAttemptContext): Unit = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 80cdc61484c0f..5d6edf6b8abec 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1068,6 +1068,24 @@ object SQLConf { .timeConf(TimeUnit.MILLISECONDS) .createWithDefault(100) + object PartitionOverwriteMode extends Enumeration { + val STATIC, DYNAMIC = Value + } + + val PARTITION_OVERWRITE_MODE = + buildConf("spark.sql.sources.partitionOverwriteMode") + .doc("When INSERT OVERWRITE a partitioned data source table, we currently support 2 modes: " + + "static and dynamic. In static mode, Spark deletes all the partitions that match the " + + "partition specification(e.g. PARTITION(a=1,b)) in the INSERT statement, before " + + "overwriting. In dynamic mode, Spark doesn't delete partitions ahead, and only overwrite " + + "those partitions that have data written into it at runtime. By default we use static " + + "mode to keep the same behavior of Spark prior to 2.3. Note that this config doesn't " + + "affect Hive serde tables, as they are always overwritten with dynamic mode.") + .stringConf + .transform(_.toUpperCase(Locale.ROOT)) + .checkValues(PartitionOverwriteMode.values.map(_.toString)) + .createWithDefault(PartitionOverwriteMode.STATIC.toString) + object Deprecated { val MAPRED_REDUCE_TASKS = "mapred.reduce.tasks" } @@ -1394,6 +1412,9 @@ class SQLConf extends Serializable with Logging { def concatBinaryAsString: Boolean = getConf(CONCAT_BINARY_AS_STRING) + def partitionOverwriteMode: PartitionOverwriteMode.Value = + PartitionOverwriteMode.withName(getConf(PARTITION_OVERWRITE_MODE)) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala index ad24e280d942a..dd7ef0d15c140 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InsertIntoHadoopFsRelationCommand.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command._ +import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode import org.apache.spark.sql.util.SchemaUtils /** @@ -89,13 +90,19 @@ case class InsertIntoHadoopFsRelationCommand( } val pathExists = fs.exists(qualifiedOutputPath) - // If we are appending data to an existing dir. 
- val isAppend = pathExists && (mode == SaveMode.Append) + + val enableDynamicOverwrite = + sparkSession.sessionState.conf.partitionOverwriteMode == PartitionOverwriteMode.DYNAMIC + // This config only makes sense when we are overwriting a partitioned dataset with dynamic + // partition columns. + val dynamicPartitionOverwrite = enableDynamicOverwrite && mode == SaveMode.Overwrite && + staticPartitions.size < partitionColumns.length val committer = FileCommitProtocol.instantiate( sparkSession.sessionState.conf.fileCommitProtocolClass, jobId = java.util.UUID.randomUUID().toString, - outputPath = outputPath.toString) + outputPath = outputPath.toString, + dynamicPartitionOverwrite = dynamicPartitionOverwrite) val doInsertion = (mode, pathExists) match { case (SaveMode.ErrorIfExists, true) => @@ -103,6 +110,9 @@ case class InsertIntoHadoopFsRelationCommand( case (SaveMode.Overwrite, true) => if (ifPartitionNotExists && matchingPartitions.nonEmpty) { false + } else if (dynamicPartitionOverwrite) { + // For dynamic partition overwrite, do not delete partition directories ahead. + true } else { deleteMatchingPartitions(fs, qualifiedOutputPath, customPartitionLocations, committer) true @@ -126,7 +136,9 @@ case class InsertIntoHadoopFsRelationCommand( catalogTable.get.identifier, newPartitions.toSeq.map(p => (p, None)), ifNotExists = true).run(sparkSession) } - if (mode == SaveMode.Overwrite) { + // For dynamic partition overwrite, we never remove partitions but only update existing + // ones. + if (mode == SaveMode.Overwrite && !dynamicPartitionOverwrite) { val deletedPartitions = initialMatchingPartitions.toSet -- updatedPartitions if (deletedPartitions.nonEmpty) { AlterTableDropPartitionCommand( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala index 40825a1f724b1..39c594a9bc618 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SQLHadoopMapReduceCommitProtocol.scala @@ -29,11 +29,15 @@ import org.apache.spark.sql.internal.SQLConf * A variant of [[HadoopMapReduceCommitProtocol]] that allows specifying the actual * Hadoop output committer using an option specified in SQLConf. 
*/ -class SQLHadoopMapReduceCommitProtocol(jobId: String, path: String) - extends HadoopMapReduceCommitProtocol(jobId, path) with Serializable with Logging { +class SQLHadoopMapReduceCommitProtocol( + jobId: String, + path: String, + dynamicPartitionOverwrite: Boolean = false) + extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) + with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { - var committer = context.getOutputFormatClass.newInstance().getOutputCommitter(context) + var committer = super.setupCommitter(context) val configuration = context.getConfiguration val clazz = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index 8b7e2e5f45946..fef01c860db6e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -21,6 +21,8 @@ import java.io.File import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, Row} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.util.Utils @@ -442,4 +444,80 @@ class InsertSuite extends DataSourceTest with SharedSQLContext { assert(e.contains("Only Data Sources providing FileFormat are supported")) } } + + test("SPARK-20236: dynamic partition overwrite without catalog table") { + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { + withTempPath { path => + Seq((1, 1, 1)).toDF("i", "part1", "part2") + .write.partitionBy("part1", "part2").parquet(path.getAbsolutePath) + checkAnswer(spark.read.parquet(path.getAbsolutePath), Row(1, 1, 1)) + + Seq((2, 1, 1)).toDF("i", "part1", "part2") + .write.partitionBy("part1", "part2").mode("overwrite").parquet(path.getAbsolutePath) + checkAnswer(spark.read.parquet(path.getAbsolutePath), Row(2, 1, 1)) + + Seq((2, 2, 2)).toDF("i", "part1", "part2") + .write.partitionBy("part1", "part2").mode("overwrite").parquet(path.getAbsolutePath) + checkAnswer(spark.read.parquet(path.getAbsolutePath), Row(2, 1, 1) :: Row(2, 2, 2) :: Nil) + } + } + } + + test("SPARK-20236: dynamic partition overwrite") { + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { + withTable("t") { + sql( + """ + |create table t(i int, part1 int, part2 int) using parquet + |partitioned by (part1, part2) + """.stripMargin) + + sql("insert into t partition(part1=1, part2=1) select 1") + checkAnswer(spark.table("t"), Row(1, 1, 1)) + + sql("insert overwrite table t partition(part1=1, part2=1) select 2") + checkAnswer(spark.table("t"), Row(2, 1, 1)) + + sql("insert overwrite table t partition(part1=2, part2) select 2, 2") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Nil) + + sql("insert overwrite table t partition(part1=1, part2=2) select 3") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + + sql("insert overwrite table t partition(part1=1, part2) select 4, 1") + checkAnswer(spark.table("t"), Row(4, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + } + } + } + + test("SPARK-20236: dynamic partition overwrite with customer partition path") { + withSQLConf(SQLConf.PARTITION_OVERWRITE_MODE.key -> PartitionOverwriteMode.DYNAMIC.toString) { + withTable("t") { + sql( + 
""" + |create table t(i int, part1 int, part2 int) using parquet + |partitioned by (part1, part2) + """.stripMargin) + + val path1 = Utils.createTempDir() + sql(s"alter table t add partition(part1=1, part2=1) location '$path1'") + sql(s"insert into t partition(part1=1, part2=1) select 1") + checkAnswer(spark.table("t"), Row(1, 1, 1)) + + sql("insert overwrite table t partition(part1=1, part2=1) select 2") + checkAnswer(spark.table("t"), Row(2, 1, 1)) + + sql("insert overwrite table t partition(part1=2, part2) select 2, 2") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Nil) + + val path2 = Utils.createTempDir() + sql(s"alter table t add partition(part1=1, part2=2) location '$path2'") + sql("insert overwrite table t partition(part1=1, part2=2) select 3") + checkAnswer(spark.table("t"), Row(2, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + + sql("insert overwrite table t partition(part1=1, part2) select 4, 1") + checkAnswer(spark.table("t"), Row(4, 1, 1) :: Row(2, 2, 2) :: Row(3, 1, 2) :: Nil) + } + } + } } From 9a2b65a3c0c36316aae0a53aa0f61c5044c2ceff Mon Sep 17 00:00:00 2001 From: chetkhatri Date: Wed, 3 Jan 2018 11:31:32 -0600 Subject: [PATCH 252/356] [SPARK-22896] Improvement in String interpolation ## What changes were proposed in this pull request? * String interpolation in ml pipeline example has been corrected as per scala standard. ## How was this patch tested? * manually tested. Author: chetkhatri Closes #20070 from chetkhatri/mllib-chetan-contrib. --- .../spark/examples/ml/JavaQuantileDiscretizerExample.java | 2 +- .../apache/spark/examples/SimpleSkewedGroupByTest.scala | 4 ---- .../org/apache/spark/examples/graphx/Analytics.scala | 6 ++++-- .../org/apache/spark/examples/graphx/SynthBenchmark.scala | 6 +++--- .../apache/spark/examples/ml/ChiSquareTestExample.scala | 6 +++--- .../org/apache/spark/examples/ml/CorrelationExample.scala | 4 ++-- .../org/apache/spark/examples/ml/DataFrameExample.scala | 4 ++-- .../examples/ml/DecisionTreeClassificationExample.scala | 4 ++-- .../spark/examples/ml/DecisionTreeRegressionExample.scala | 4 ++-- .../apache/spark/examples/ml/DeveloperApiExample.scala | 6 +++--- .../examples/ml/EstimatorTransformerParamExample.scala | 6 +++--- .../ml/GradientBoostedTreeClassifierExample.scala | 4 ++-- .../examples/ml/GradientBoostedTreeRegressorExample.scala | 4 ++-- ...ulticlassLogisticRegressionWithElasticNetExample.scala | 2 +- .../ml/MultilayerPerceptronClassifierExample.scala | 2 +- .../org/apache/spark/examples/ml/NaiveBayesExample.scala | 2 +- .../spark/examples/ml/QuantileDiscretizerExample.scala | 4 ++-- .../spark/examples/ml/RandomForestClassifierExample.scala | 4 ++-- .../spark/examples/ml/RandomForestRegressorExample.scala | 4 ++-- .../apache/spark/examples/ml/VectorIndexerExample.scala | 4 ++-- .../spark/examples/mllib/AssociationRulesExample.scala | 6 +++--- .../mllib/BinaryClassificationMetricsExample.scala | 4 ++-- .../mllib/DecisionTreeClassificationExample.scala | 4 ++-- .../examples/mllib/DecisionTreeRegressionExample.scala | 4 ++-- .../org/apache/spark/examples/mllib/FPGrowthExample.scala | 2 +- .../mllib/GradientBoostingClassificationExample.scala | 4 ++-- .../mllib/GradientBoostingRegressionExample.scala | 4 ++-- .../spark/examples/mllib/HypothesisTestingExample.scala | 2 +- .../spark/examples/mllib/IsotonicRegressionExample.scala | 2 +- .../org/apache/spark/examples/mllib/KMeansExample.scala | 2 +- .../org/apache/spark/examples/mllib/LBFGSExample.scala | 2 +- .../examples/mllib/LatentDirichletAllocationExample.scala | 8 
+++++--- .../examples/mllib/LinearRegressionWithSGDExample.scala | 2 +- .../org/apache/spark/examples/mllib/PCAExample.scala | 4 ++-- .../spark/examples/mllib/PMMLModelExportExample.scala | 2 +- .../apache/spark/examples/mllib/PrefixSpanExample.scala | 4 ++-- .../mllib/RandomForestClassificationExample.scala | 4 ++-- .../examples/mllib/RandomForestRegressionExample.scala | 4 ++-- .../spark/examples/mllib/RecommendationExample.scala | 2 +- .../apache/spark/examples/mllib/SVMWithSGDExample.scala | 2 +- .../org/apache/spark/examples/mllib/SimpleFPGrowth.scala | 8 +++----- .../spark/examples/mllib/StratifiedSamplingExample.scala | 4 ++-- .../org/apache/spark/examples/mllib/TallSkinnyPCA.scala | 2 +- .../org/apache/spark/examples/mllib/TallSkinnySVD.scala | 2 +- .../apache/spark/examples/streaming/CustomReceiver.scala | 6 +++--- .../apache/spark/examples/streaming/RawNetworkGrep.scala | 2 +- .../examples/streaming/RecoverableNetworkWordCount.scala | 8 ++++---- .../streaming/clickstream/PageViewGenerator.scala | 4 ++-- .../examples/streaming/clickstream/PageViewStream.scala | 4 ++-- 49 files changed, 94 insertions(+), 96 deletions(-) diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java index dd20cac621102..43cc30c1a899b 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaQuantileDiscretizerExample.java @@ -66,7 +66,7 @@ public static void main(String[] args) { .setNumBuckets(3); Dataset result = discretizer.fit(df).transform(df); - result.show(); + result.show(false); // $example off$ spark.stop(); } diff --git a/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala b/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala index e64dcbd182d94..2332a661f26a0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala +++ b/examples/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala @@ -60,10 +60,6 @@ object SimpleSkewedGroupByTest { pairs1.count println(s"RESULT: ${pairs1.groupByKey(numReducers).count}") - // Print how many keys each reducer got (for debugging) - // println("RESULT: " + pairs1.groupByKey(numReducers) - // .map{case (k,v) => (k, v.size)} - // .collectAsMap) spark.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala b/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala index 92936bd30dbc0..815404d1218b7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala +++ b/examples/src/main/scala/org/apache/spark/examples/graphx/Analytics.scala @@ -145,9 +145,11 @@ object Analytics extends Logging { // TriangleCount requires the graph to be partitioned .partitionBy(partitionStrategy.getOrElse(RandomVertexCut)).cache() val triangles = TriangleCount.run(graph) - println("Triangles: " + triangles.vertices.map { + val triangleTypes = triangles.vertices.map { case (vid, data) => data.toLong - }.reduce(_ + _) / 3) + }.reduce(_ + _) / 3 + + println(s"Triangles: ${triangleTypes}") sc.stop() case _ => diff --git a/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala b/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala index 6d2228c8742aa..57b2edf992208 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala +++ b/examples/src/main/scala/org/apache/spark/examples/graphx/SynthBenchmark.scala @@ -52,7 +52,7 @@ object SynthBenchmark { arg => arg.dropWhile(_ == '-').split('=') match { case Array(opt, v) => (opt -> v) - case _ => throw new IllegalArgumentException("Invalid argument: " + arg) + case _ => throw new IllegalArgumentException(s"Invalid argument: $arg") } } @@ -76,7 +76,7 @@ object SynthBenchmark { case ("sigma", v) => sigma = v.toDouble case ("degFile", v) => degFile = v case ("seed", v) => seed = v.toInt - case (opt, _) => throw new IllegalArgumentException("Invalid option: " + opt) + case (opt, _) => throw new IllegalArgumentException(s"Invalid option: $opt") } val conf = new SparkConf() @@ -86,7 +86,7 @@ object SynthBenchmark { val sc = new SparkContext(conf) // Create the graph - println(s"Creating graph...") + println("Creating graph...") val unpartitionedGraph = GraphGenerators.logNormalGraph(sc, numVertices, numEPart.getOrElse(sc.defaultParallelism), mu, sigma, seed) // Repartition the graph diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala index dcee1e427ce58..5146fd0316467 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/ChiSquareTestExample.scala @@ -52,9 +52,9 @@ object ChiSquareTestExample { val df = data.toDF("label", "features") val chi = ChiSquareTest.test(df, "features", "label").head - println("pValues = " + chi.getAs[Vector](0)) - println("degreesOfFreedom = " + chi.getSeq[Int](1).mkString("[", ",", "]")) - println("statistics = " + chi.getAs[Vector](2)) + println(s"pValues = ${chi.getAs[Vector](0)}") + println(s"degreesOfFreedom ${chi.getSeq[Int](1).mkString("[", ",", "]")}") + println(s"statistics ${chi.getAs[Vector](2)}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala index 3f57dc342eb00..d7f1fc8ed74d7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/CorrelationExample.scala @@ -51,10 +51,10 @@ object CorrelationExample { val df = data.map(Tuple1.apply).toDF("features") val Row(coeff1: Matrix) = Correlation.corr(df, "features").head - println("Pearson correlation matrix:\n" + coeff1.toString) + println(s"Pearson correlation matrix:\n $coeff1") val Row(coeff2: Matrix) = Correlation.corr(df, "features", "spearman").head - println("Spearman correlation matrix:\n" + coeff2.toString) + println(s"Spearman correlation matrix:\n $coeff2") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala index 0658bddf16961..ee4469faab3a0 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala @@ -47,7 +47,7 @@ object DataFrameExample { val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") - .text(s"input path to dataframe") + .text("input path to dataframe") .action((x, c) => c.copy(input = 
x)) checkConfig { params => success @@ -93,7 +93,7 @@ object DataFrameExample { // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) - println(s"Schema from Parquet:") + println("Schema from Parquet:") newDF.printSchema() spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala index bc6d3275933ea..276cedab11abc 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeClassificationExample.scala @@ -83,10 +83,10 @@ object DecisionTreeClassificationExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test Error = " + (1.0 - accuracy)) + println(s"Test Error = ${(1.0 - accuracy)}") val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel] - println("Learned classification tree model:\n" + treeModel.toDebugString) + println(s"Learned classification tree model:\n ${treeModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala index ee61200ad1d0c..aaaecaea47081 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DecisionTreeRegressionExample.scala @@ -73,10 +73,10 @@ object DecisionTreeRegressionExample { .setPredictionCol("prediction") .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) - println("Root Mean Squared Error (RMSE) on test data = " + rmse) + println(s"Root Mean Squared Error (RMSE) on test data = $rmse") val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel] - println("Learned regression tree model:\n" + treeModel.toDebugString) + println(s"Learned regression tree model:\n ${treeModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index d94d837d10e96..2dc11b07d88ef 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -53,7 +53,7 @@ object DeveloperApiExample { // Create a LogisticRegression instance. This instance is an Estimator. val lr = new MyLogisticRegression() // Print out the parameters, documentation, and any default values. - println("MyLogisticRegression parameters:\n" + lr.explainParams() + "\n") + println(s"MyLogisticRegression parameters:\n ${lr.explainParams()}") // We may set parameters using setter methods. lr.setMaxIter(10) @@ -169,10 +169,10 @@ private class MyLogisticRegressionModel( Vectors.dense(-margin, margin) } - /** Number of classes the label can take. 2 indicates binary classification. */ + // Number of classes the label can take. 2 indicates binary classification. override val numClasses: Int = 2 - /** Number of features the model was trained on. */ + // Number of features the model was trained on. 
override val numFeatures: Int = coefficients.size /** diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala index f18d86e1a6921..e5d91f132a3f2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/EstimatorTransformerParamExample.scala @@ -46,7 +46,7 @@ object EstimatorTransformerParamExample { // Create a LogisticRegression instance. This instance is an Estimator. val lr = new LogisticRegression() // Print out the parameters, documentation, and any default values. - println("LogisticRegression parameters:\n" + lr.explainParams() + "\n") + println(s"LogisticRegression parameters:\n ${lr.explainParams()}\n") // We may set parameters using setter methods. lr.setMaxIter(10) @@ -58,7 +58,7 @@ object EstimatorTransformerParamExample { // we can view the parameters it used during fit(). // This prints the parameter (name: value) pairs, where names are unique IDs for this // LogisticRegression instance. - println("Model 1 was fit using parameters: " + model1.parent.extractParamMap) + println(s"Model 1 was fit using parameters: ${model1.parent.extractParamMap}") // We may alternatively specify parameters using a ParamMap, // which supports several methods for specifying parameters. @@ -73,7 +73,7 @@ object EstimatorTransformerParamExample { // Now learn a new model using the paramMapCombined parameters. // paramMapCombined overrides all parameters set earlier via lr.set* methods. val model2 = lr.fit(training, paramMapCombined) - println("Model 2 was fit using parameters: " + model2.parent.extractParamMap) + println(s"Model 2 was fit using parameters: ${model2.parent.extractParamMap}") // Prepare test data. 
val test = spark.createDataFrame(Seq( diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala index 3656773c8b817..ef78c0a1145ef 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeClassifierExample.scala @@ -86,10 +86,10 @@ object GradientBoostedTreeClassifierExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test Error = " + (1.0 - accuracy)) + println(s"Test Error = ${1.0 - accuracy}") val gbtModel = model.stages(2).asInstanceOf[GBTClassificationModel] - println("Learned classification GBT model:\n" + gbtModel.toDebugString) + println(s"Learned classification GBT model:\n ${gbtModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala index e53aab7f326d3..3feb2343f6a85 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/GradientBoostedTreeRegressorExample.scala @@ -73,10 +73,10 @@ object GradientBoostedTreeRegressorExample { .setPredictionCol("prediction") .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) - println("Root Mean Squared Error (RMSE) on test data = " + rmse) + println(s"Root Mean Squared Error (RMSE) on test data = $rmse") val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel] - println("Learned regression GBT model:\n" + gbtModel.toDebugString) + println(s"Learned regression GBT model:\n ${gbtModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala index 42f0ace7a353d..3e61dbe628c20 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MulticlassLogisticRegressionWithElasticNetExample.scala @@ -48,7 +48,7 @@ object MulticlassLogisticRegressionWithElasticNetExample { // Print the coefficients and intercept for multinomial logistic regression println(s"Coefficients: \n${lrModel.coefficientMatrix}") - println(s"Intercepts: ${lrModel.interceptVector}") + println(s"Intercepts: \n${lrModel.interceptVector}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala index 6fce82d294f8d..646f46a925062 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MultilayerPerceptronClassifierExample.scala @@ -66,7 +66,7 @@ object MultilayerPerceptronClassifierExample { val evaluator = new MulticlassClassificationEvaluator() .setMetricName("accuracy") - println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels)) + println(s"Test set accuracy = 
${evaluator.evaluate(predictionAndLabels)}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala index bd9fcc420a66c..50c70c626b128 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/NaiveBayesExample.scala @@ -52,7 +52,7 @@ object NaiveBayesExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test set accuracy = " + accuracy) + println(s"Test set accuracy = $accuracy") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala index aedb9e7d3bb70..0fe16fb6dfa9f 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/QuantileDiscretizerExample.scala @@ -36,7 +36,7 @@ object QuantileDiscretizerExample { // Output of QuantileDiscretizer for such small datasets can depend on the number of // partitions. Here we force a single partition to ensure consistent results. // Note this is not necessary for normal use cases - .repartition(1) + .repartition(1) // $example on$ val discretizer = new QuantileDiscretizer() @@ -45,7 +45,7 @@ object QuantileDiscretizerExample { .setNumBuckets(3) val result = discretizer.fit(df).transform(df) - result.show() + result.show(false) // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala index 5eafda8ce4285..6265f83902528 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestClassifierExample.scala @@ -85,10 +85,10 @@ object RandomForestClassifierExample { .setPredictionCol("prediction") .setMetricName("accuracy") val accuracy = evaluator.evaluate(predictions) - println("Test Error = " + (1.0 - accuracy)) + println(s"Test Error = ${(1.0 - accuracy)}") val rfModel = model.stages(2).asInstanceOf[RandomForestClassificationModel] - println("Learned classification forest model:\n" + rfModel.toDebugString) + println(s"Learned classification forest model:\n ${rfModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala index 9a0a001c26ef5..2679fcb353a8a 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomForestRegressorExample.scala @@ -72,10 +72,10 @@ object RandomForestRegressorExample { .setPredictionCol("prediction") .setMetricName("rmse") val rmse = evaluator.evaluate(predictions) - println("Root Mean Squared Error (RMSE) on test data = " + rmse) + println(s"Root Mean Squared Error (RMSE) on test data = $rmse") val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel] - println("Learned regression forest model:\n" + rfModel.toDebugString) + println(s"Learned regression forest model:\n 
${rfModel.toDebugString}") // $example off$ spark.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala index afa761aee0b98..96bb8ea2338af 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorIndexerExample.scala @@ -41,8 +41,8 @@ object VectorIndexerExample { val indexerModel = indexer.fit(data) val categoricalFeatures: Set[Int] = indexerModel.categoryMaps.keys.toSet - println(s"Chose ${categoricalFeatures.size} categorical features: " + - categoricalFeatures.mkString(", ")) + println(s"Chose ${categoricalFeatures.size} " + + s"categorical features: ${categoricalFeatures.mkString(", ")}") // Create new column "indexed" with categorical values transformed to indices val indexedData = indexerModel.transform(data) diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala index ff44de56839e5..a07535bb5a38d 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/AssociationRulesExample.scala @@ -42,9 +42,8 @@ object AssociationRulesExample { val results = ar.run(freqItemsets) results.collect().foreach { rule => - println("[" + rule.antecedent.mkString(",") - + "=>" - + rule.consequent.mkString(",") + "]," + rule.confidence) + println(s"[${rule.antecedent.mkString(",")}=>${rule.consequent.mkString(",")} ]" + + s" ${rule.confidence}") } // $example off$ @@ -53,3 +52,4 @@ object AssociationRulesExample { } // scalastyle:on println + diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala index b9263ac6fcff6..c6312d71cc912 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/BinaryClassificationMetricsExample.scala @@ -86,7 +86,7 @@ object BinaryClassificationMetricsExample { // AUPRC val auPRC = metrics.areaUnderPR - println("Area under precision-recall curve = " + auPRC) + println(s"Area under precision-recall curve = $auPRC") // Compute thresholds used in ROC and PR curves val thresholds = precision.map(_._1) @@ -96,7 +96,7 @@ object BinaryClassificationMetricsExample { // AUROC val auROC = metrics.areaUnderROC - println("Area under ROC = " + auROC) + println(s"Area under ROC = $auROC") // $example off$ sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala index b50b4592777ce..c2f89b72c9a2e 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala @@ -55,8 +55,8 @@ object DecisionTreeClassificationExample { (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / testData.count() - println("Test Error = " + testErr) - println("Learned classification tree model:\n" + model.toDebugString) + println(s"Test Error = 
$testErr") + println(s"Learned classification tree model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myDecisionTreeClassificationModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala index 2af45afae3d5b..1ecf6426e1f95 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala @@ -54,8 +54,8 @@ object DecisionTreeRegressionExample { (point.label, prediction) } val testMSE = labelsAndPredictions.map{ case (v, p) => math.pow(v - p, 2) }.mean() - println("Test Mean Squared Error = " + testMSE) - println("Learned regression tree model:\n" + model.toDebugString) + println(s"Test Mean Squared Error = $testMSE") + println(s"Learned regression tree model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myDecisionTreeRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala index 6435abc127752..f724ee1030f04 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/FPGrowthExample.scala @@ -74,7 +74,7 @@ object FPGrowthExample { println(s"Number of frequent itemsets: ${model.freqItemsets.count()}") model.freqItemsets.collect().foreach { itemset => - println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) + println(s"${itemset.items.mkString("[", ",", "]")}, ${itemset.freq}") } sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala index 00bb3348d2a36..3c56e1941aeca 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingClassificationExample.scala @@ -54,8 +54,8 @@ object GradientBoostingClassificationExample { (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() - println("Test Error = " + testErr) - println("Learned classification GBT model:\n" + model.toDebugString) + println(s"Test Error = $testErr") + println(s"Learned classification GBT model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myGradientBoostingClassificationModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala index d8c263460839b..c288bf29bf255 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/GradientBoostingRegressionExample.scala @@ -53,8 +53,8 @@ object GradientBoostingRegressionExample { (point.label, prediction) } val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() - println("Test Mean Squared Error = " + testMSE) - println("Learned regression GBT model:\n" + model.toDebugString) + println(s"Test Mean Squared Error = $testMSE") + 
println(s"Learned regression GBT model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myGradientBoostingRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala index 0d391a3637c07..add1719739539 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/HypothesisTestingExample.scala @@ -68,7 +68,7 @@ object HypothesisTestingExample { // against the label. val featureTestResults: Array[ChiSqTestResult] = Statistics.chiSqTest(obs) featureTestResults.zipWithIndex.foreach { case (k, v) => - println("Column " + (v + 1).toString + ":") + println(s"Column ${(v + 1)} :") println(k) } // summary of the test // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala index 4aee951f5b04c..a10d6f0dda880 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/IsotonicRegressionExample.scala @@ -56,7 +56,7 @@ object IsotonicRegressionExample { // Calculate mean squared error between predicted and real labels. val meanSquaredError = predictionAndLabel.map { case (p, l) => math.pow((p - l), 2) }.mean() - println("Mean Squared Error = " + meanSquaredError) + println(s"Mean Squared Error = $meanSquaredError") // Save and load model model.save(sc, "target/tmp/myIsotonicRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala index c4d71d862f375..b0a6f1671a898 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/KMeansExample.scala @@ -43,7 +43,7 @@ object KMeansExample { // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) - println("Within Set Sum of Squared Errors = " + WSSSE) + println(s"Within Set Sum of Squared Errors = $WSSSE") // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala index fedcefa098381..123782fa6b9cf 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LBFGSExample.scala @@ -82,7 +82,7 @@ object LBFGSExample { println("Loss of each step in training process") loss.foreach(println) - println("Area under ROC = " + auROC) + println(s"Area under ROC = $auROC") // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala index f2c8ec01439f1..d25962c5500ed 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LatentDirichletAllocationExample.scala @@ -42,11 +42,13 @@ object 
LatentDirichletAllocationExample { val ldaModel = new LDA().setK(3).run(corpus) // Output topics. Each is a distribution over words (matching word count vectors) - println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") + println(s"Learned topics (as distributions over vocab of ${ldaModel.vocabSize} words):") val topics = ldaModel.topicsMatrix for (topic <- Range(0, 3)) { - print("Topic " + topic + ":") - for (word <- Range(0, ldaModel.vocabSize)) { print(" " + topics(word, topic)); } + print(s"Topic $topic :") + for (word <- Range(0, ldaModel.vocabSize)) { + print(s"${topics(word, topic)}") + } println() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala index d399618094487..449b725d1d173 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/LinearRegressionWithSGDExample.scala @@ -52,7 +52,7 @@ object LinearRegressionWithSGDExample { (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() - println("training Mean Squared Error = " + MSE) + println(s"training Mean Squared Error $MSE") // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala index eb36697d94ba1..eff2393cc3abe 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PCAExample.scala @@ -65,8 +65,8 @@ object PCAExample { val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() - println("Mean Squared Error = " + MSE) - println("PCA Mean Squared Error = " + MSE_pca) + println(s"Mean Squared Error = $MSE") + println(s"PCA Mean Squared Error = $MSE_pca") // $example off$ sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala index d74d74a37fb11..96deafd469bc7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PMMLModelExportExample.scala @@ -41,7 +41,7 @@ object PMMLModelExportExample { val clusters = KMeans.train(parsedData, numClusters, numIterations) // Export to PMML to a String in PMML format - println("PMML Model:\n" + clusters.toPMML) + println(s"PMML Model:\n ${clusters.toPMML}") // Export the model to a local file in PMML format clusters.toPMML("/tmp/kmeans.xml") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala index 69c72c4336576..8b789277774af 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/PrefixSpanExample.scala @@ -42,8 +42,8 @@ object PrefixSpanExample { val model = prefixSpan.run(sequences) model.freqSequences.collect().foreach { freqSequence => println( - freqSequence.sequence.map(_.mkString("[", ", ", 
"]")).mkString("[", ", ", "]") + - ", " + freqSequence.freq) + s"${freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")}," + + s" ${freqSequence.freq}") } // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala index f1ebdf1a733ed..246e71de25615 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestClassificationExample.scala @@ -55,8 +55,8 @@ object RandomForestClassificationExample { (point.label, prediction) } val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() - println("Test Error = " + testErr) - println("Learned classification forest model:\n" + model.toDebugString) + println(s"Test Error = $testErr") + println(s"Learned classification forest model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myRandomForestClassificationModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala index 11d612e651b4b..770e30276bc30 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomForestRegressionExample.scala @@ -55,8 +55,8 @@ object RandomForestRegressionExample { (point.label, prediction) } val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 2)}.mean() - println("Test Mean Squared Error = " + testMSE) - println("Learned regression forest model:\n" + model.toDebugString) + println(s"Test Mean Squared Error = $testMSE") + println(s"Learned regression forest model:\n ${model.toDebugString}") // Save and load model model.save(sc, "target/tmp/myRandomForestRegressionModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala index 6df742d737e70..0bb2b8c8c2b43 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/RecommendationExample.scala @@ -56,7 +56,7 @@ object RecommendationExample { val err = (r1 - r2) err * err }.mean() - println("Mean Squared Error = " + MSE) + println(s"Mean Squared Error = $MSE") // Save and load model model.save(sc, "target/tmp/myCollaborativeFilter") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala index b73fe9b2b3faa..285e2ce512639 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala @@ -57,7 +57,7 @@ object SVMWithSGDExample { val metrics = new BinaryClassificationMetrics(scoreAndLabels) val auROC = metrics.areaUnderROC() - println("Area under ROC = " + auROC) + println(s"Area under ROC = $auROC") // Save and load model model.save(sc, "target/tmp/scalaSVMWithSGDModel") diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala 
b/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala index b5c3033bcba09..694c3bb18b045 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/SimpleFPGrowth.scala @@ -42,15 +42,13 @@ object SimpleFPGrowth { val model = fpg.run(transactions) model.freqItemsets.collect().foreach { itemset => - println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) + println(s"${itemset.items.mkString("[", ",", "]")},${itemset.freq}") } val minConfidence = 0.8 model.generateAssociationRules(minConfidence).collect().foreach { rule => - println( - rule.antecedent.mkString("[", ",", "]") - + " => " + rule.consequent .mkString("[", ",", "]") - + ", " + rule.confidence) + println(s"${rule.antecedent.mkString("[", ",", "]")}=> " + + s"${rule.consequent .mkString("[", ",", "]")},${rule.confidence}") } // $example off$ diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala index 16b074ef60699..3d41bef0af88c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/StratifiedSamplingExample.scala @@ -41,10 +41,10 @@ object StratifiedSamplingExample { val exactSample = data.sampleByKeyExact(withReplacement = false, fractions = fractions) // $example off$ - println("approxSample size is " + approxSample.collect().size.toString) + println(s"approxSample size is ${approxSample.collect().size}") approxSample.collect().foreach(println) - println("exactSample its size is " + exactSample.collect().size.toString) + println(s"exactSample its size is ${exactSample.collect().size}") exactSample.collect().foreach(println) sc.stop() diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala index 03bc675299c5a..071d341b81614 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnyPCA.scala @@ -54,7 +54,7 @@ object TallSkinnyPCA { // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) - println("Principal components are:\n" + pc) + println(s"Principal components are:\n $pc") sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala index 067e49b9599e7..8ae6de16d80e7 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/TallSkinnySVD.scala @@ -54,7 +54,7 @@ object TallSkinnySVD { // Compute SVD. 
val svd = mat.computeSVD(mat.numCols().toInt) - println("Singular values are " + svd.s) + println(s"Singular values are ${svd.s}") sc.stop() } diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala index 43044d01b1204..25c7bf2871972 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/CustomReceiver.scala @@ -82,9 +82,9 @@ class CustomReceiver(host: String, port: Int) var socket: Socket = null var userInput: String = null try { - logInfo("Connecting to " + host + ":" + port) + logInfo(s"Connecting to $host : $port") socket = new Socket(host, port) - logInfo("Connected to " + host + ":" + port) + logInfo(s"Connected to $host : $port") val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() @@ -98,7 +98,7 @@ class CustomReceiver(host: String, port: Int) restart("Trying to connect again") } catch { case e: java.net.ConnectException => - restart("Error connecting to " + host + ":" + port, e) + restart(s"Error connecting to $host : $port", e) case t: Throwable => restart("Error receiving data", t) } diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala index 5322929d177b4..437ccf0898d7c 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/RawNetworkGrep.scala @@ -54,7 +54,7 @@ object RawNetworkGrep { ssc.rawSocketStream[String](host, port, StorageLevel.MEMORY_ONLY_SER_2)).toArray val union = ssc.union(rawStreams) union.filter(_.contains("the")).count().foreachRDD(r => - println("Grep count: " + r.collect().mkString)) + println(s"Grep count: ${r.collect().mkString}")) ssc.start() ssc.awaitTermination() } diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala index 49c0427321133..f018f3a26d2e9 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/RecoverableNetworkWordCount.scala @@ -130,10 +130,10 @@ object RecoverableNetworkWordCount { true } }.collect().mkString("[", ", ", "]") - val output = "Counts at time " + time + " " + counts + val output = s"Counts at time $time $counts" println(output) - println("Dropped " + droppedWordsCounter.value + " word(s) totally") - println("Appending to " + outputFile.getAbsolutePath) + println(s"Dropped ${droppedWordsCounter.value} word(s) totally") + println(s"Appending to ${outputFile.getAbsolutePath}") Files.append(output + "\n", outputFile, Charset.defaultCharset()) } ssc @@ -141,7 +141,7 @@ object RecoverableNetworkWordCount { def main(args: Array[String]) { if (args.length != 4) { - System.err.println("Your arguments were " + args.mkString("[", ", ", "]")) + System.err.println(s"Your arguments were ${args.mkString("[", ", ", "]")}") System.err.println( """ |Usage: RecoverableNetworkWordCount diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala 
b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala index 0ddd065f0db2b..2108bc63edea2 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewGenerator.scala @@ -90,13 +90,13 @@ object PageViewGenerator { val viewsPerSecond = args(1).toFloat val sleepDelayMs = (1000.0 / viewsPerSecond).toInt val listener = new ServerSocket(port) - println("Listening on port: " + port) + println(s"Listening on port: $port") while (true) { val socket = listener.accept() new Thread() { override def run(): Unit = { - println("Got client connected from: " + socket.getInetAddress) + println(s"Got client connected from: ${socket.getInetAddress}") val out = new PrintWriter(socket.getOutputStream(), true) while (true) { diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala index 1ba093f57b32c..b8e7c7e9e9152 100644 --- a/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/clickstream/PageViewStream.scala @@ -104,8 +104,8 @@ object PageViewStream { .foreachRDD((rdd, time) => rdd.join(userList) .map(_._2._2) .take(10) - .foreach(u => println("Saw user %s at time %s".format(u, time)))) - case _ => println("Invalid metric entered: " + metric) + .foreach(u => println(s"Saw user $u at time $time"))) + case _ => println(s"Invalid metric entered: $metric") } ssc.start() From b297029130735316e1ac1144dee44761a12bfba7 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 4 Jan 2018 07:28:53 +0800 Subject: [PATCH 253/356] [SPARK-20960][SQL] make ColumnVector public ## What changes were proposed in this pull request? Move `ColumnVector` and related classes to `org.apache.spark.sql.vectorized`, and improve the documentation. ## How was this patch tested? Existing tests. Author: Wenchen Fan Closes #20116 from cloud-fan/column-vector.
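
For illustration, a minimal sketch (not part of this patch) of how a caller might consume the relocated public API. The helper name `sumFirstIntColumn` is hypothetical; it assumes the batch comes from a vectorized reader (such as the Parquet reader touched below) whose first column is an INT column. Only the package changes with this patch; the accessors used here (`column`, `numRows`, `isNullAt`, `getInt`) are the ones shown in the diff.

```
// After this patch the classes are imported from the new public package
// instead of org.apache.spark.sql.execution.vectorized.
import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch}

// Hypothetical helper: sum the first column of a batch, assuming it is an INT column.
def sumFirstIntColumn(batch: ColumnarBatch): Long = {
  val col: ColumnVector = batch.column(0)   // first column of the batch
  var sum = 0L
  var i = 0
  while (i < batch.numRows()) {             // row ids are batch-local and 0-based
    if (!col.isNullAt(i)) {
      sum += col.getInt(i)
    }
    i += 1
  }
  sum
}
```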
--- .../VectorizedParquetRecordReader.java | 7 ++- .../vectorized/ColumnVectorUtils.java | 2 + .../vectorized/MutableColumnarRow.java | 4 ++ .../vectorized/WritableColumnVector.java | 7 ++- .../vectorized/ArrowColumnVector.java | 62 +------------------ .../vectorized/ColumnVector.java | 31 ++++++---- .../vectorized/ColumnarArray.java | 7 +-- .../vectorized/ColumnarBatch.java | 34 +++------- .../vectorized/ColumnarRow.java | 7 +-- .../sql/execution/ColumnarBatchScan.scala | 4 +- .../aggregate/HashAggregateExec.scala | 2 +- .../VectorizedHashMapGenerator.scala | 3 +- .../sql/execution/arrow/ArrowConverters.scala | 2 +- .../columnar/InMemoryTableScanExec.scala | 1 + .../execution/datasources/FileScanRDD.scala | 2 +- .../execution/python/ArrowPythonRunner.scala | 2 +- .../execution/arrow/ArrowWriterSuite.scala | 2 +- .../vectorized/ArrowColumnVectorSuite.scala | 1 + .../vectorized/ColumnVectorSuite.scala | 2 +- .../vectorized/ColumnarBatchSuite.scala | 6 +- 20 files changed, 63 insertions(+), 125 deletions(-) rename sql/core/src/main/java/org/apache/spark/sql/{execution => }/vectorized/ArrowColumnVector.java (94%) rename sql/core/src/main/java/org/apache/spark/sql/{execution => }/vectorized/ColumnVector.java (79%) rename sql/core/src/main/java/org/apache/spark/sql/{execution => }/vectorized/ColumnarArray.java (95%) rename sql/core/src/main/java/org/apache/spark/sql/{execution => }/vectorized/ColumnarBatch.java (73%) rename sql/core/src/main/java/org/apache/spark/sql/{execution => }/vectorized/ColumnarRow.java (96%) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java index 6c157e85d411f..cd745b1f0e4e3 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java @@ -31,10 +31,10 @@ import org.apache.spark.memory.MemoryMode; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.execution.vectorized.ColumnVectorUtils; -import org.apache.spark.sql.execution.vectorized.ColumnarBatch; import org.apache.spark.sql.execution.vectorized.WritableColumnVector; import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector; import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -248,7 +248,10 @@ public void enableReturningBatches() { * Advances to the next batch of rows. Returns false if there are no more. 
*/ public boolean nextBatch() throws IOException { - columnarBatch.reset(); + for (WritableColumnVector vector : columnVectors) { + vector.reset(); + } + columnarBatch.setNumRows(0); if (rowsReturned >= totalRowCount) return false; checkEndOfRowGroup(); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java index bc62bc43484e5..b5cbe8e2839ba 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVectorUtils.java @@ -28,6 +28,8 @@ import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.util.DateTimeUtils; import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnarArray; +import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java index 06602c147dfe9..70057a9def6c0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/MutableColumnarRow.java @@ -23,6 +23,10 @@ import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnarArray; +import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.apache.spark.sql.vectorized.ColumnarRow; +import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.unsafe.types.CalendarInterval; import org.apache.spark.unsafe.types.UTF8String; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java index 5f6f125976e12..d2ae32b06f83b 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVector.java @@ -23,6 +23,7 @@ import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.unsafe.array.ByteArrayMethods; import org.apache.spark.unsafe.types.UTF8String; @@ -585,11 +586,11 @@ public final int appendArray(int length) { public final int appendStruct(boolean isNull) { if (isNull) { appendNull(); - for (ColumnVector c: childColumns) { + for (WritableColumnVector c: childColumns) { if (c.type instanceof StructType) { - ((WritableColumnVector) c).appendStruct(true); + c.appendStruct(true); } else { - ((WritableColumnVector) c).appendNull(); + c.appendNull(); } } } else { diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java similarity index 94% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java index af5673e26a501..708333213f3f1 100644 --- 
a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ArrowColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ArrowColumnVector.java @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.arrow.vector.*; import org.apache.arrow.vector.complex.*; @@ -34,11 +34,7 @@ public final class ArrowColumnVector extends ColumnVector { private ArrowColumnVector[] childColumns; private void ensureAccessible(int index) { - int valueCount = accessor.getValueCount(); - if (index < 0 || index >= valueCount) { - throw new IndexOutOfBoundsException( - String.format("index: %d, valueCount: %d", index, valueCount)); - } + ensureAccessible(index, 1); } private void ensureAccessible(int index, int count) { @@ -64,20 +60,12 @@ public void close() { accessor.close(); } - // - // APIs dealing with nulls - // - @Override public boolean isNullAt(int rowId) { ensureAccessible(rowId); return accessor.isNullAt(rowId); } - // - // APIs dealing with Booleans - // - @Override public boolean getBoolean(int rowId) { ensureAccessible(rowId); @@ -94,10 +82,6 @@ public boolean[] getBooleans(int rowId, int count) { return array; } - // - // APIs dealing with Bytes - // - @Override public byte getByte(int rowId) { ensureAccessible(rowId); @@ -114,10 +98,6 @@ public byte[] getBytes(int rowId, int count) { return array; } - // - // APIs dealing with Shorts - // - @Override public short getShort(int rowId) { ensureAccessible(rowId); @@ -134,10 +114,6 @@ public short[] getShorts(int rowId, int count) { return array; } - // - // APIs dealing with Ints - // - @Override public int getInt(int rowId) { ensureAccessible(rowId); @@ -154,10 +130,6 @@ public int[] getInts(int rowId, int count) { return array; } - // - // APIs dealing with Longs - // - @Override public long getLong(int rowId) { ensureAccessible(rowId); @@ -174,10 +146,6 @@ public long[] getLongs(int rowId, int count) { return array; } - // - // APIs dealing with floats - // - @Override public float getFloat(int rowId) { ensureAccessible(rowId); @@ -194,10 +162,6 @@ public float[] getFloats(int rowId, int count) { return array; } - // - // APIs dealing with doubles - // - @Override public double getDouble(int rowId) { ensureAccessible(rowId); @@ -214,10 +178,6 @@ public double[] getDoubles(int rowId, int count) { return array; } - // - // APIs dealing with Arrays - // - @Override public int getArrayLength(int rowId) { ensureAccessible(rowId); @@ -230,45 +190,27 @@ public int getArrayOffset(int rowId) { return accessor.getArrayOffset(rowId); } - // - // APIs dealing with Decimals - // - @Override public Decimal getDecimal(int rowId, int precision, int scale) { ensureAccessible(rowId); return accessor.getDecimal(rowId, precision, scale); } - // - // APIs dealing with UTF8Strings - // - @Override public UTF8String getUTF8String(int rowId) { ensureAccessible(rowId); return accessor.getUTF8String(rowId); } - // - // APIs dealing with Binaries - // - @Override public byte[] getBinary(int rowId) { ensureAccessible(rowId); return accessor.getBinary(rowId); } - /** - * Returns the data for the underlying array. - */ @Override public ArrowColumnVector arrayData() { return childColumns[0]; } - /** - * Returns the ordinal's child data column. 
- */ @Override public ArrowColumnVector getChildColumn(int ordinal) { return childColumns[ordinal]; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java similarity index 79% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java index dc7c1269bedd9..d1196e1299fee 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnVector.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.spark.sql.catalyst.util.MapData; import org.apache.spark.sql.types.DataType; @@ -22,24 +22,31 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * This class represents in-memory values of a column and provides the main APIs to access the data. - * It supports all the types and contains get APIs as well as their batched versions. The batched - * versions are considered to be faster and preferable whenever possible. + * An interface representing in-memory columnar data in Spark. This interface defines the main APIs + * to access the data, as well as their batched versions. The batched versions are considered to be + * faster and preferable whenever possible. * - * To handle nested schemas, ColumnVector has two types: Arrays and Structs. In both cases these - * columns have child columns. All of the data are stored in the child columns and the parent column - * only contains nullability. In the case of Arrays, the lengths and offsets are saved in the child - * column and are encoded identically to INTs. + * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values + * in this ColumnVector. * - * Maps are just a special case of a two field struct. + * ColumnVector supports all the data types including nested types. To handle nested types, + * ColumnVector can have children and is a tree structure. For struct type, it stores the actual + * data of each field in the corresponding child ColumnVector, and only stores null information in + * the parent ColumnVector. For array type, it stores the actual array elements in the child + * ColumnVector, and stores null information, array offsets and lengths in the parent ColumnVector. * - * Most of the APIs take the rowId as a parameter. This is the batch local 0-based row id for values - * in the current batch. + * ColumnVector is expected to be reused during the entire data loading process, to avoid allocating + * memory again and again. + * + * ColumnVector is meant to maximize CPU efficiency but not to minimize storage footprint. + * Implementations should prefer computing efficiency over storage efficiency when design the + * format. Since it is expected to reuse the ColumnVector instance while loading data, the storage + * footprint is negligible. */ public abstract class ColumnVector implements AutoCloseable { /** - * Returns the data type of this column. + * Returns the data type of this column vector. 
*/ public final DataType dataType() { return type; } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java similarity index 95% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java index cbc39d1d0aec2..0d89a52e7a4fe 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarArray.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarArray.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.spark.sql.catalyst.util.ArrayData; import org.apache.spark.sql.catalyst.util.MapData; @@ -23,8 +23,7 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Array abstraction in {@link ColumnVector}. The instance of this class is intended - * to be reused, callers should copy the data out if it needs to be stored. + * Array abstraction in {@link ColumnVector}. */ public final class ColumnarArray extends ArrayData { // The data for this array. This array contains elements from @@ -33,7 +32,7 @@ public final class ColumnarArray extends ArrayData { private final int offset; private final int length; - ColumnarArray(ColumnVector data, int offset, int length) { + public ColumnarArray(ColumnVector data, int offset, int length) { this.data = data; this.offset = offset; this.length = length; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java similarity index 73% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java index a9d09aa679726..9ae1c6d9993f0 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarBatch.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatch.java @@ -14,26 +14,18 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import java.util.*; import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.execution.vectorized.MutableColumnarRow; import org.apache.spark.sql.types.StructType; /** - * This class is the in memory representation of rows as they are streamed through operators. It - * is designed to maximize CPU efficiency and not storage footprint. Since it is expected that - * each operator allocates one of these objects, the storage footprint on the task is negligible. - * - * The layout is a columnar with values encoded in their native format. Each RowBatch contains - * a horizontal partitioning of the data, split into columns. - * - * The ColumnarBatch supports either on heap or offheap modes with (mostly) the identical API. - * - * TODO: - * - There are many TODOs for the existing APIs. They should throw a not implemented exception. - * - Compaction: The batch and columns should be able to compact based on a selection vector. + * This class wraps multiple ColumnVectors as a row-wise table. 
It provides a row view of this + * batch so that Spark can access the data row by row. Instance of it is meant to be reused during + * the entire data loading process. */ public final class ColumnarBatch { public static final int DEFAULT_BATCH_SIZE = 4 * 1024; @@ -57,7 +49,7 @@ public void close() { } /** - * Returns an iterator over the rows in this batch. This skips rows that are filtered out. + * Returns an iterator over the rows in this batch. */ public Iterator rowIterator() { final int maxRows = numRows; @@ -87,19 +79,7 @@ public void remove() { } /** - * Resets the batch for writing. - */ - public void reset() { - for (int i = 0; i < numCols(); ++i) { - if (columns[i] instanceof WritableColumnVector) { - ((WritableColumnVector) columns[i]).reset(); - } - } - this.numRows = 0; - } - - /** - * Sets the number of rows that are valid. + * Sets the number of rows in this batch. */ public void setNumRows(int numRows) { assert(numRows <= this.capacity); diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java similarity index 96% rename from sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java rename to sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java index 8bb33ed5b78c0..3c6656dec77cd 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/vectorized/ColumnarRow.java +++ b/sql/core/src/main/java/org/apache/spark/sql/vectorized/ColumnarRow.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.execution.vectorized; +package org.apache.spark.sql.vectorized; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.GenericInternalRow; @@ -24,8 +24,7 @@ import org.apache.spark.unsafe.types.UTF8String; /** - * Row abstraction in {@link ColumnVector}. The instance of this class is intended - * to be reused, callers should copy the data out if it needs to be stored. + * Row abstraction in {@link ColumnVector}. */ public final class ColumnarRow extends InternalRow { // The data for this row. @@ -34,7 +33,7 @@ public final class ColumnarRow extends InternalRow { private final int rowId; private final int numFields; - ColumnarRow(ColumnVector data, int rowId) { + public ColumnarRow(ColumnVector data, int rowId) { assert (data.dataType() instanceof StructType); this.data = data; this.rowId = rowId; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala index 782cec5e292ba..5617046e1396e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ColumnarBatchScan.scala @@ -20,13 +20,13 @@ package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, ColumnVector} import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} /** * Helper trait for abstracting scan functionality using - * [[org.apache.spark.sql.execution.vectorized.ColumnarBatch]]es. + * [[ColumnarBatch]]es. 
*/ private[sql] trait ColumnarBatchScan extends CodegenSupport { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala index 9a6f1c6dfa6a9..ce3c68810f3b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} -import org.apache.spark.sql.execution.vectorized.{ColumnarRow, MutableColumnarRow} +import org.apache.spark.sql.execution.vectorized.MutableColumnarRow import org.apache.spark.sql.types.{DecimalType, StringType, StructType} import org.apache.spark.unsafe.KVIterator import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala index 0380ee8b09d63..0cf9b53ce1d5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/VectorizedHashMapGenerator.scala @@ -20,8 +20,9 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext -import org.apache.spark.sql.execution.vectorized.{ColumnarBatch, MutableColumnarRow, OnHeapColumnVector} +import org.apache.spark.sql.execution.vectorized.{MutableColumnarRow, OnHeapColumnVector} import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarBatch /** * This is a helper class to generate an append-only vectorized hash map that can act as a 'cache' diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala index bcfc412430263..bcd1aa0890ba3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala @@ -32,8 +32,8 @@ import org.apache.spark.TaskContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala index 3e73393b12850..933b9753faa61 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partition import 
org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} import org.apache.spark.sql.execution.vectorized._ import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} case class InMemoryTableScanExec( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index 8731ee88f87f2..835ce98462477 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -26,7 +26,7 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{InputFileBlockHolder, RDD} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.vectorized.ColumnarBatch +import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.NextIterator /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala index 5cc8ed3535654..dc5ba96e69aec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/ArrowPythonRunner.scala @@ -30,8 +30,8 @@ import org.apache.spark._ import org.apache.spark.api.python._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.arrow.{ArrowUtils, ArrowWriter} -import org.apache.spark.sql.execution.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.util.Utils /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala index 508c116aae92e..c42bc60a59d67 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/arrow/ArrowWriterSuite.scala @@ -20,8 +20,8 @@ package org.apache.spark.sql.execution.arrow import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.ArrayData -import org.apache.spark.sql.execution.vectorized.ArrowColumnVector import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ArrowColumnVector import org.apache.spark.unsafe.types.UTF8String class ArrowWriterSuite extends SparkFunSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala index 03490ad15a655..7304803a092c0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ArrowColumnVectorSuite.scala @@ -23,6 +23,7 @@ import org.apache.arrow.vector.complex._ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.execution.arrow.ArrowUtils import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ArrowColumnVector import org.apache.spark.unsafe.types.UTF8String class 
ArrowColumnVectorSuite extends SparkFunSuite { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala index 54b31cee031f6..944240f3bade5 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnVectorSuite.scala @@ -21,10 +21,10 @@ import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.SpecificInternalRow -import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.execution.columnar.ColumnAccessor import org.apache.spark.sql.execution.columnar.compression.ColumnBuilderHelper import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarArray import org.apache.spark.unsafe.types.UTF8String class ColumnVectorSuite extends SparkFunSuite with BeforeAndAfterEach { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala index 7848ebdcab6d0..675f06b31b970 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/vectorized/ColumnarBatchSuite.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.{RandomDataGenerator, Row} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.arrow.ArrowUtils import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.types.CalendarInterval @@ -918,10 +919,7 @@ class ColumnarBatchSuite extends SparkFunSuite { assert(it.hasNext == false) // Reset and add 3 rows - batch.reset() - assert(batch.numRows() == 0) - assert(batch.rowIterator().hasNext == false) - + columns.foreach(_.reset()) // Add rows [NULL, 2.2, 2, "abc"], [3, NULL, 3, ""], [4, 4.4, 4, "world] columns(0).putNull(0) columns(1).putDouble(0, 2.2) From 7d045c5f00e2c7c67011830e2169a4e130c3ace8 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 4 Jan 2018 13:14:52 +0800 Subject: [PATCH 254/356] [SPARK-22944][SQL] improve FoldablePropagation ## What changes were proposed in this pull request? `FoldablePropagation` is a little tricky as it needs to handle attributes that are miss-derived from children, e.g. outer join outputs. This rule does a kind of stop-able tree transform, to skip to apply this rule when hit a node which may have miss-derived attributes. Logically we should be able to apply this rule above the unsupported nodes, by just treating the unsupported nodes as leaf nodes. This PR improves this rule to not stop the tree transformation, but reduce the foldable expressions that we want to propagate. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #20139 from cloud-fan/foldable. 
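For readers skimming the patch, here is a hedged, DataFrame-level sketch of the query shape that the improved rule can now optimize (the new test added below exercises the same shape at the logical-plan level; the session setup, datasets and column names here are illustrative only, not part of the patch). With the old rule the LeftOuter join stopped foldable propagation outright; with this change only the attributes coming from the null-padded side are treated as miss-derived, so the literal alias `b` from the preserved left side can still be folded into the projection above the join.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// Both sides carry a foldable alias (the literal 1).
val left  = Seq(1, 2).toDF("a").select($"a", lit(1).as("b"))
val right = Seq(1, 3).toDF("c").select($"c", lit(1).as("d"))

// `b` comes from the preserved (left) side of the outer join, so it remains a safe
// target for foldable propagation; `d` does not, because the join may null it out.
val q = left
  .join(right, $"a" === $"c" && $"b" === $"d", "left_outer")
  .select(($"b" + 3).as("res"))

// In the optimized plan, `b` in the projection is replaced by the literal 1 and
// constant folding then reduces `res` to the constant 4.
q.explain(true)
```
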
--- .../sql/catalyst/optimizer/expressions.scala | 65 +++++++++++-------- .../optimizer/FoldablePropagationSuite.scala | 23 ++++++- 2 files changed, 58 insertions(+), 30 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index 7d830bbb7dc32..1c0b7bd806801 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -506,18 +506,21 @@ object NullPropagation extends Rule[LogicalPlan] { /** - * Propagate foldable expressions: * Replace attributes with aliases of the original foldable expressions if possible. - * Other optimizations will take advantage of the propagated foldable expressions. - * + * Other optimizations will take advantage of the propagated foldable expressions. For example, + * this rule can optimize * {{{ * SELECT 1.0 x, 'abc' y, Now() z ORDER BY x, y, 3 - * ==> SELECT 1.0 x, 'abc' y, Now() z ORDER BY 1.0, 'abc', Now() * }}} + * to + * {{{ + * SELECT 1.0 x, 'abc' y, Now() z ORDER BY 1.0, 'abc', Now() + * }}} + * and other rules can further optimize it and remove the ORDER BY operator. */ object FoldablePropagation extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = { - val foldableMap = AttributeMap(plan.flatMap { + var foldableMap = AttributeMap(plan.flatMap { case Project(projectList, _) => projectList.collect { case a: Alias if a.child.foldable => (a.toAttribute, a) } @@ -530,38 +533,44 @@ object FoldablePropagation extends Rule[LogicalPlan] { if (foldableMap.isEmpty) { plan } else { - var stop = false CleanupAliases(plan.transformUp { - // A leaf node should not stop the folding process (note that we are traversing up the - // tree, starting at the leaf nodes); so we are allowing it. - case l: LeafNode => - l - // We can only propagate foldables for a subset of unary nodes. - case u: UnaryNode if !stop && canPropagateFoldables(u) => + case u: UnaryNode if foldableMap.nonEmpty && canPropagateFoldables(u) => u.transformExpressions(replaceFoldable) - // Allow inner joins. We do not allow outer join, although its output attributes are - // derived from its children, they are actually different attributes: the output of outer - // join is not always picked from its children, but can also be null. + // Join derives the output attributes from its child while they are actually not the + // same attributes. For example, the output of outer join is not always picked from its + // children, but can also be null. We should exclude these miss-derived attributes when + // propagating the foldable expressions. // TODO(cloud-fan): It seems more reasonable to use new attributes as the output attributes // of outer join. - case j @ Join(_, _, Inner, _) if !stop => - j.transformExpressions(replaceFoldable) - - // We can fold the projections an expand holds. However expand changes the output columns - // and often reuses the underlying attributes; so we cannot assume that a column is still - // foldable after the expand has been applied. - // TODO(hvanhovell): Expand should use new attributes as the output attributes. 
- case expand: Expand if !stop => - val newExpand = expand.copy(projections = expand.projections.map { projection => + case j @ Join(left, right, joinType, _) if foldableMap.nonEmpty => + val newJoin = j.transformExpressions(replaceFoldable) + val missDerivedAttrsSet: AttributeSet = AttributeSet(joinType match { + case _: InnerLike | LeftExistence(_) => Nil + case LeftOuter => right.output + case RightOuter => left.output + case FullOuter => left.output ++ right.output + }) + foldableMap = AttributeMap(foldableMap.baseMap.values.filterNot { + case (attr, _) => missDerivedAttrsSet.contains(attr) + }.toSeq) + newJoin + + // We can not replace the attributes in `Expand.output`. If there are other non-leaf + // operators that have the `output` field, we should put them here too. + case expand: Expand if foldableMap.nonEmpty => + expand.copy(projections = expand.projections.map { projection => projection.map(_.transform(replaceFoldable)) }) - stop = true - newExpand - case other => - stop = true + // For other plans, they are not safe to apply foldable propagation, and they should not + // propagate foldable expressions from children. + case other if foldableMap.nonEmpty => + val childrenOutputSet = AttributeSet(other.children.flatMap(_.output)) + foldableMap = AttributeMap(foldableMap.baseMap.values.filterNot { + case (attr, _) => childrenOutputSet.contains(attr) + }.toSeq) other }) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala index dccb32f0379a8..c28844642aed0 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FoldablePropagationSuite.scala @@ -147,8 +147,8 @@ class FoldablePropagationSuite extends PlanTest { test("Propagate in expand") { val c1 = Literal(1).as('a) val c2 = Literal(2).as('b) - val a1 = c1.toAttribute.withNullability(true) - val a2 = c2.toAttribute.withNullability(true) + val a1 = c1.toAttribute.newInstance().withNullability(true) + val a2 = c2.toAttribute.newInstance().withNullability(true) val expand = Expand( Seq(Seq(Literal(null), 'b), Seq('a, Literal(null))), Seq(a1, a2), @@ -161,4 +161,23 @@ class FoldablePropagationSuite extends PlanTest { val correctAnswer = correctExpand.where(a1.isNotNull).select(a1, a2).analyze comparePlans(optimized, correctAnswer) } + + test("Propagate above outer join") { + val left = LocalRelation('a.int).select('a, Literal(1).as('b)) + val right = LocalRelation('c.int).select('c, Literal(1).as('d)) + + val join = left.join( + right, + joinType = LeftOuter, + condition = Some('a === 'c && 'b === 'd)) + val query = join.select(('b + 3).as('res)).analyze + val optimized = Optimize.execute(query) + + val correctAnswer = left.join( + right, + joinType = LeftOuter, + condition = Some('a === 'c && Literal(1) === Literal(1))) + .select((Literal(1) + 3).as('res)).analyze + comparePlans(optimized, correctAnswer) + } } From df95a908baf78800556636a76d58bba9b3dd943f Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Wed, 3 Jan 2018 21:43:14 -0800 Subject: [PATCH 255/356] [SPARK-22933][SPARKR] R Structured Streaming API for withWatermark, trigger, partitionBy ## What changes were proposed in this pull request? R Structured Streaming API for withWatermark, trigger, partitionBy ## How was this patch tested? 
manual, unit tests Author: Felix Cheung Closes #20129 from felixcheung/rwater. --- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 96 +++++++++++++++- R/pkg/R/SQLContext.R | 4 +- R/pkg/R/generics.R | 6 + R/pkg/tests/fulltests/test_streaming.R | 107 ++++++++++++++++++ python/pyspark/sql/streaming.py | 4 + .../sql/execution/streaming/Triggers.scala | 2 +- 7 files changed, 214 insertions(+), 6 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 3219c6f0cc47b..c51eb0f39c4b1 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -179,6 +179,7 @@ exportMethods("arrange", "with", "withColumn", "withColumnRenamed", + "withWatermark", "write.df", "write.jdbc", "write.json", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index fe238f6dd4eb0..9956f7eda91e6 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3661,7 +3661,8 @@ setMethod("getNumPartitions", #' isStreaming #' #' Returns TRUE if this SparkDataFrame contains one or more sources that continuously return data -#' as it arrives. +#' as it arrives. A dataset that reads data from a streaming source must be executed as a +#' \code{StreamingQuery} using \code{write.stream}. #' #' @param x A SparkDataFrame #' @return TRUE if this SparkDataFrame is from a streaming source @@ -3707,7 +3708,17 @@ setMethod("isStreaming", #' @param df a streaming SparkDataFrame. #' @param source a name for external data source. #' @param outputMode one of 'append', 'complete', 'update'. -#' @param ... additional argument(s) passed to the method. +#' @param partitionBy a name or a list of names of columns to partition the output by on the file +#' system. If specified, the output is laid out on the file system similar to Hive's +#' partitioning scheme. +#' @param trigger.processingTime a processing time interval as a string, e.g. '5 seconds', +#' '1 minute'. This is a trigger that runs a query periodically based on the processing +#' time. If value is '0 seconds', the query will run as fast as possible, this is the +#' default. Only one trigger can be set. +#' @param trigger.once a logical, must be set to \code{TRUE}. This is a trigger that processes only +#' one batch of data in a streaming query then terminates the query. Only one trigger can be +#' set. +#' @param ... additional external data source specific named options. #' #' @family SparkDataFrame functions #' @seealso \link{read.stream} @@ -3725,7 +3736,8 @@ setMethod("isStreaming", #' # console #' q <- write.stream(wordCounts, "console", outputMode = "complete") #' # text stream -#' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp") +#' q <- write.stream(df, "text", path = "/home/user/out", checkpointLocation = "/home/user/cp" +#' partitionBy = c("year", "month"), trigger.processingTime = "30 seconds") #' # memory stream #' q <- write.stream(wordCounts, "memory", queryName = "outs", outputMode = "complete") #' head(sql("SELECT * from outs")) @@ -3737,7 +3749,8 @@ setMethod("isStreaming", #' @note experimental setMethod("write.stream", signature(df = "SparkDataFrame"), - function(df, source = NULL, outputMode = NULL, ...) { + function(df, source = NULL, outputMode = NULL, partitionBy = NULL, + trigger.processingTime = NULL, trigger.once = NULL, ...) { if (!is.null(source) && !is.character(source)) { stop("source should be character, NULL or omitted. 
It is the data source specified ", "in 'spark.sql.sources.default' configuration by default.") @@ -3748,12 +3761,43 @@ setMethod("write.stream", if (is.null(source)) { source <- getDefaultSqlSource() } + cols <- NULL + if (!is.null(partitionBy)) { + if (!all(sapply(partitionBy, function(c) { is.character(c) }))) { + stop("All partitionBy column names should be characters.") + } + cols <- as.list(partitionBy) + } + jtrigger <- NULL + if (!is.null(trigger.processingTime) && !is.na(trigger.processingTime)) { + if (!is.null(trigger.once)) { + stop("Multiple triggers not allowed.") + } + interval <- as.character(trigger.processingTime) + if (nchar(interval) == 0) { + stop("Value for trigger.processingTime must be a non-empty string.") + } + jtrigger <- handledCallJStatic("org.apache.spark.sql.streaming.Trigger", + "ProcessingTime", + interval) + } else if (!is.null(trigger.once) && !is.na(trigger.once)) { + if (!is.logical(trigger.once) || !trigger.once) { + stop("Value for trigger.once must be TRUE.") + } + jtrigger <- callJStatic("org.apache.spark.sql.streaming.Trigger", "Once") + } options <- varargsToStrEnv(...) write <- handledCallJMethod(df@sdf, "writeStream") write <- callJMethod(write, "format", source) if (!is.null(outputMode)) { write <- callJMethod(write, "outputMode", outputMode) } + if (!is.null(cols)) { + write <- callJMethod(write, "partitionBy", cols) + } + if (!is.null(jtrigger)) { + write <- callJMethod(write, "trigger", jtrigger) + } write <- callJMethod(write, "options", options) ssq <- handledCallJMethod(write, "start") streamingQuery(ssq) @@ -3967,3 +4011,47 @@ setMethod("broadcast", sdf <- callJStatic("org.apache.spark.sql.functions", "broadcast", x@sdf) dataFrame(sdf) }) + +#' withWatermark +#' +#' Defines an event time watermark for this streaming SparkDataFrame. A watermark tracks a point in +#' time before which we assume no more late data is going to arrive. +#' +#' Spark will use this watermark for several purposes: +#' \itemize{ +#' \item{-} To know when a given time window aggregation can be finalized and thus can be emitted +#' when using output modes that do not allow updates. +#' \item{-} To minimize the amount of state that we need to keep for on-going aggregations. +#' } +#' The current watermark is computed by looking at the \code{MAX(eventTime)} seen across +#' all of the partitions in the query minus a user specified \code{delayThreshold}. Due to the cost +#' of coordinating this value across partitions, the actual watermark used is only guaranteed +#' to be at least \code{delayThreshold} behind the actual event time. In some cases we may still +#' process records that arrive more than \code{delayThreshold} late. +#' +#' @param x a streaming SparkDataFrame +#' @param eventTime a string specifying the name of the Column that contains the event time of the +#' row. +#' @param delayThreshold a string specifying the minimum delay to wait to data to arrive late, +#' relative to the latest record that has been processed in the form of an +#' interval (e.g. "1 minute" or "5 hours"). NOTE: This should not be negative. +#' @return a SparkDataFrame. 
+#' @aliases withWatermark,SparkDataFrame,character,character-method +#' @family SparkDataFrame functions +#' @rdname withWatermark +#' @name withWatermark +#' @export +#' @examples +#' \dontrun{ +#' sparkR.session() +#' schema <- structType(structField("time", "timestamp"), structField("value", "double")) +#' df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1) +#' df <- withWatermark(df, "time", "10 minutes") +#' } +#' @note withWatermark since 2.3.0 +setMethod("withWatermark", + signature(x = "SparkDataFrame", eventTime = "character", delayThreshold = "character"), + function(x, eventTime, delayThreshold) { + sdf <- callJMethod(x@sdf, "withWatermark", eventTime, delayThreshold) + dataFrame(sdf) + }) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 3b7f71bbbffb8..9d0a2d5e074e4 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -727,7 +727,9 @@ read.jdbc <- function(url, tableName, #' @param schema The data schema defined in structType or a DDL-formatted string, this is #' required for file-based streaming data source #' @param ... additional external data source specific named options, for instance \code{path} for -#' file-based streaming data source +#' file-based streaming data source. \code{timeZone} to indicate a timezone to be used to +#' parse timestamps in the JSON/CSV data sources or partition values; If it isn't set, it +#' uses the default value, session local timezone. #' @return SparkDataFrame #' @rdname read.stream #' @name read.stream diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 5369c32544e5e..e0dde3339fabc 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -799,6 +799,12 @@ setGeneric("withColumn", function(x, colName, col) { standardGeneric("withColumn setGeneric("withColumnRenamed", function(x, existingCol, newCol) { standardGeneric("withColumnRenamed") }) +#' @rdname withWatermark +#' @export +setGeneric("withWatermark", function(x, eventTime, delayThreshold) { + standardGeneric("withWatermark") +}) + #' @rdname write.df #' @export setGeneric("write.df", function(df, path = NULL, ...) 
{ standardGeneric("write.df") }) diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R index 54f40bbd5f517..a354d50c6b54e 100644 --- a/R/pkg/tests/fulltests/test_streaming.R +++ b/R/pkg/tests/fulltests/test_streaming.R @@ -172,6 +172,113 @@ test_that("Terminated by error", { stopQuery(q) }) +test_that("PartitionBy", { + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + checkpointPath <- tempfile(pattern = "sparkr-test", fileext = ".checkpoint") + textPath <- tempfile(pattern = "sparkr-test", fileext = ".text") + df <- read.df(jsonPath, "json", stringSchema) + write.df(df, parquetPath, "parquet", "overwrite") + + df <- read.stream(path = parquetPath, schema = stringSchema) + + expect_error(write.stream(df, "json", path = textPath, checkpointLocation = "append", + partitionBy = c(1, 2)), + "All partitionBy column names should be characters") + + q <- write.stream(df, "json", path = textPath, checkpointLocation = "append", + partitionBy = "name") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + dirs <- list.files(textPath) + expect_equal(length(dirs[substring(dirs, 1, nchar("name=")) == "name="]), 3) + + unlink(checkpointPath) + unlink(textPath) + unlink(parquetPath) +}) + +test_that("Watermark", { + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + schema <- structType(structField("value", "string")) + t <- Sys.time() + df <- as.DataFrame(lapply(list(t), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + df <- read.stream(path = parquetPath, schema = "value STRING") + df <- withColumn(df, "eventTime", cast(df$value, "timestamp")) + df <- withWatermark(df, "eventTime", "10 seconds") + counts <- count(group_by(df, "eventTime")) + q <- write.stream(counts, "memory", queryName = "times", outputMode = "append") + + # first events + df <- as.DataFrame(lapply(list(t + 1, t, t + 2), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + # advance watermark to 15 + df <- as.DataFrame(lapply(list(t + 25), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + # old events, should be dropped + df <- as.DataFrame(lapply(list(t), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + # evict events less than previous watermark + df <- as.DataFrame(lapply(list(t + 25), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + times <- collect(sql("SELECT * FROM times")) + # looks like write timing can affect the first bucket; but it should be t + expect_equal(times[order(times$eventTime),][1, 2], 2) + + stopQuery(q) + unlink(parquetPath) +}) + +test_that("Trigger", { + parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet") + schema <- structType(structField("value", "string")) + df <- as.DataFrame(lapply(list(Sys.time()), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + df <- read.stream(path = parquetPath, schema = "value STRING") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.processingTime = "", trigger.once = ""), "Multiple triggers not allowed.") + + expect_error(write.stream(df, "memory", 
queryName = "times", outputMode = "append", + trigger.processingTime = ""), + "Value for trigger.processingTime must be a non-empty string.") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.processingTime = "invalid"), "illegal argument") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.once = ""), "Value for trigger.once must be TRUE.") + + expect_error(write.stream(df, "memory", queryName = "times", outputMode = "append", + trigger.once = FALSE), "Value for trigger.once must be TRUE.") + + q <- write.stream(df, "memory", queryName = "times", outputMode = "append", trigger.once = TRUE) + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + df <- as.DataFrame(lapply(list(Sys.time()), as.character), schema) + write.df(df, parquetPath, "parquet", "append") + awaitTermination(q, 5 * 1000) + callJMethod(q@ssq, "processAllAvailable") + + expect_equal(nrow(collect(sql("SELECT * FROM times"))), 1) + + stopQuery(q) + unlink(parquetPath) +}) + unlink(jsonPath) unlink(jsonPathNa) diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index fb228f99ba7ab..24ae3776a217b 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -793,6 +793,10 @@ def trigger(self, processingTime=None, once=None): .. note:: Evolving. :param processingTime: a processing time interval as a string, e.g. '5 seconds', '1 minute'. + Set a trigger that runs a query periodically based on the processing + time. Only one trigger can be set. + :param once: if set to True, set a trigger that processes only one batch of data in a + streaming query then terminates the query. Only one trigger can be set. >>> # trigger the query for execution every 5 seconds >>> writer = sdf.writeStream.trigger(processingTime='5 seconds') diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala index 271bc4da99c08..19e3e55cb2829 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/Triggers.scala @@ -21,7 +21,7 @@ import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.streaming.Trigger /** - * A [[Trigger]] that process only one batch of data in a streaming query then terminates + * A [[Trigger]] that processes only one batch of data in a streaming query then terminates * the query. */ @Experimental From 9fa703e89318922393bae03c0db4575f4f4b4c56 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Thu, 4 Jan 2018 19:10:10 +0800 Subject: [PATCH 256/356] [SPARK-22950][SQL] Handle ChildFirstURLClassLoader's parent ## What changes were proposed in this pull request? ChildFirstClassLoader's parent is set to null, so we can't get jars from its parent. This will cause ClassNotFoundException during HiveClient initialization with builtin hive jars, where we may should use spark context loader instead. ## How was this patch tested? add new ut cc cloud-fan gatorsmile Author: Kent Yao Closes #20145 from yaooqinn/SPARK-22950. 
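As a reading aid, the classloader walk this patch changes can be summarized by the sketch below. It restates the patched `allJars` logic in isolation; `Utils` and `ChildFirstURLClassLoader` are Spark-internal helpers, so treat this as an illustration of the control flow rather than code expected to compile outside the `org.apache.spark` namespace.

```scala
import java.net.{URL, URLClassLoader}

import org.apache.spark.util.{ChildFirstURLClassLoader, Utils}

// Collect the jar URLs visible from a classloader, walking up the parent chain.
def allJars(classLoader: ClassLoader): Array[URL] = classLoader match {
  case null => Array.empty[URL]
  case childFirst: ChildFirstURLClassLoader =>
    // ChildFirstURLClassLoader reports getParent == null (its real parent is kept
    // in a private field), so instead of stopping here we take the URLs it holds
    // and continue the walk from Spark's own class loader.
    childFirst.getURLs ++ allJars(Utils.getSparkClassLoader)
  case urlClassLoader: URLClassLoader =>
    urlClassLoader.getURLs ++ allJars(urlClassLoader.getParent)
  case other =>
    allJars(other.getParent)
}
```
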
--- .../org/apache/spark/sql/hive/HiveUtils.scala | 4 +++- .../spark/sql/hive/HiveUtilsSuite.scala | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala index c489690af8cd1..c7717d70c996f 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveUtils.scala @@ -47,7 +47,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf._ import org.apache.spark.sql.internal.StaticSQLConf.{CATALOG_IMPLEMENTATION, WAREHOUSE_PATH} import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils +import org.apache.spark.util.{ChildFirstURLClassLoader, Utils} private[spark] object HiveUtils extends Logging { @@ -312,6 +312,8 @@ private[spark] object HiveUtils extends Logging { // starting from the given classLoader. def allJars(classLoader: ClassLoader): Array[URL] = classLoader match { case null => Array.empty[URL] + case childFirst: ChildFirstURLClassLoader => + childFirst.getURLs() ++ allJars(Utils.getSparkClassLoader) case urlClassLoader: URLClassLoader => urlClassLoader.getURLs ++ allJars(urlClassLoader.getParent) case other => allJars(other.getParent) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala index fdbfcf1a68440..8697d47e89e89 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveUtilsSuite.scala @@ -17,11 +17,16 @@ package org.apache.spark.sql.hive +import java.net.URL + import org.apache.hadoop.hive.conf.HiveConf.ConfVars +import org.apache.spark.SparkConf +import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader} class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { @@ -42,4 +47,19 @@ class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton assert(hiveConf("foo") === "bar") } } + + test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") { + val conf = new SparkConf + val contextClassLoader = Thread.currentThread().getContextClassLoader + val loader = new ChildFirstURLClassLoader(Array(), contextClassLoader) + try { + Thread.currentThread().setContextClassLoader(loader) + HiveUtils.newClientForMetadata( + conf, + SparkHadoopUtil.newConfiguration(conf), + HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true)) + } finally { + Thread.currentThread().setContextClassLoader(contextClassLoader) + } + } } From d5861aba9d80ca15ad3f22793b79822e470d6913 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 4 Jan 2018 19:17:22 +0800 Subject: [PATCH 257/356] [SPARK-22945][SQL] add java UDF APIs in the functions object ## What changes were proposed in this pull request? Currently Scala users can use UDF like ``` val foo = udf((i: Int) => Math.random() + i).asNondeterministic df.select(foo('a)) ``` Python users can also do it with similar APIs. However Java users can't do it, we should add Java UDF APIs in the functions object. ## How was this patch tested? new tests Author: Wenchen Fan Closes #20141 from cloud-fan/udf. 
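For illustration, the snippet below sketches how the new `functions.udf(...)` overloads for the `org.apache.spark.sql.api.java.UDF*` interfaces are used. It is shown in Scala for consistency with the other examples; the equivalent Java call simply passes a lambda cast to `UDF1` plus the return type. The session setup and column name are illustrative only.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.api.java.UDF1
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.DataTypes

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.range(5).toDF("a")

// Wrap a java.api UDF1 with an explicit return type (no automatic input type
// coercion is performed for these overloads) and mark it nondeterministic.
val foo = udf(new UDF1[java.lang.Long, java.lang.Double] {
  override def call(i: java.lang.Long): java.lang.Double = Math.random() + i
}, DataTypes.DoubleType).asNondeterministic()

df.select(foo(col("a"))).show()
```
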
--- .../apache/spark/sql/UDFRegistration.scala | 90 ++--- .../sql/expressions/UserDefinedFunction.scala | 1 + .../org/apache/spark/sql/functions.scala | 313 ++++++++++++++---- .../apache/spark/sql/JavaDataFrameSuite.java | 11 + .../scala/org/apache/spark/sql/UDFSuite.scala | 12 +- 5 files changed, 315 insertions(+), 112 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala index dc2468a721e41..f94baef39dfad 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/UDFRegistration.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql -import java.lang.reflect.{ParameterizedType, Type} +import java.lang.reflect.ParameterizedType import scala.reflect.runtime.universe.TypeTag import scala.util.Try @@ -110,29 +110,29 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends /* register 0-22 were generated by this script - (0 to 22).map { x => + (0 to 22).foreach { x => val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) - val typeTags = (1 to x).map(i => s"A${i}: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) + val typeTags = (1 to x).map(i => s"A$i: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) val inputTypes = (1 to x).foldRight("Nil")((i, s) => {s"ScalaReflection.schemaFor[A$i].dataType :: $s"}) println(s""" - /** - * Registers a deterministic Scala closure of ${x} arguments as user-defined function (UDF). - * @tparam RT return type of UDF. - * @since 1.3.0 - */ - def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { - val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputTypes = Try($inputTypes).toOption - def builder(e: Seq[Expression]) = if (e.length == $x) { - ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable, udfDeterministic = true) - } else { - throw new AnalysisException("Invalid number of arguments for function " + name + - ". Expected: $x; Found: " + e.length) - } - functionRegistry.createOrReplaceTempFunction(name, builder) - val udf = UserDefinedFunction(func, dataType, inputTypes).withName(name) - if (nullable) udf else udf.asNonNullable() - }""") + |/** + | * Registers a deterministic Scala closure of $x arguments as user-defined function (UDF). + | * @tparam RT return type of UDF. + | * @since 1.3.0 + | */ + |def register[$typeTags](name: String, func: Function$x[$types]): UserDefinedFunction = { + | val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] + | val inputTypes = Try($inputTypes).toOption + | def builder(e: Seq[Expression]) = if (e.length == $x) { + | ScalaUDF(func, dataType, e, inputTypes.getOrElse(Nil), Some(name), nullable, udfDeterministic = true) + | } else { + | throw new AnalysisException("Invalid number of arguments for function " + name + + | ". Expected: $x; Found: " + e.length) + | } + | functionRegistry.createOrReplaceTempFunction(name, builder) + | val udf = UserDefinedFunction(func, dataType, inputTypes).withName(name) + | if (nullable) udf else udf.asNonNullable() + |}""".stripMargin) } (0 to 22).foreach { i => @@ -144,7 +144,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends val funcCall = if (i == 0) "() => func" else "func" println(s""" |/** - | * Register a user-defined function with ${i} arguments. + | * Register a deterministic Java UDF$i instance as user-defined function (UDF). 
| * @since $version | */ |def register(name: String, f: UDF$i[$extTypeArgs], returnType: DataType): Unit = { @@ -689,7 +689,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 0 arguments. + * Register a deterministic Java UDF0 instance as user-defined function (UDF). * @since 2.3.0 */ def register(name: String, f: UDF0[_], returnType: DataType): Unit = { @@ -704,7 +704,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 1 arguments. + * Register a deterministic Java UDF1 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF1[_, _], returnType: DataType): Unit = { @@ -719,7 +719,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 2 arguments. + * Register a deterministic Java UDF2 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF2[_, _, _], returnType: DataType): Unit = { @@ -734,7 +734,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 3 arguments. + * Register a deterministic Java UDF3 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF3[_, _, _, _], returnType: DataType): Unit = { @@ -749,7 +749,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 4 arguments. + * Register a deterministic Java UDF4 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF4[_, _, _, _, _], returnType: DataType): Unit = { @@ -764,7 +764,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 5 arguments. + * Register a deterministic Java UDF5 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF5[_, _, _, _, _, _], returnType: DataType): Unit = { @@ -779,7 +779,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 6 arguments. + * Register a deterministic Java UDF6 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF6[_, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -794,7 +794,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 7 arguments. + * Register a deterministic Java UDF7 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -809,7 +809,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 8 arguments. + * Register a deterministic Java UDF8 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -824,7 +824,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 9 arguments. + * Register a deterministic Java UDF9 instance as user-defined function (UDF). 
* @since 1.3.0 */ def register(name: String, f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -839,7 +839,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 10 arguments. + * Register a deterministic Java UDF10 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -854,7 +854,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 11 arguments. + * Register a deterministic Java UDF11 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF11[_, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -869,7 +869,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 12 arguments. + * Register a deterministic Java UDF12 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -884,7 +884,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 13 arguments. + * Register a deterministic Java UDF13 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -899,7 +899,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 14 arguments. + * Register a deterministic Java UDF14 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -914,7 +914,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 15 arguments. + * Register a deterministic Java UDF15 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -929,7 +929,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 16 arguments. + * Register a deterministic Java UDF16 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -944,7 +944,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 17 arguments. + * Register a deterministic Java UDF17 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -959,7 +959,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 18 arguments. + * Register a deterministic Java UDF18 instance as user-defined function (UDF). 
* @since 1.3.0 */ def register(name: String, f: UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -974,7 +974,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 19 arguments. + * Register a deterministic Java UDF19 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -989,7 +989,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 20 arguments. + * Register a deterministic Java UDF20 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -1004,7 +1004,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 21 arguments. + * Register a deterministic Java UDF21 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { @@ -1019,7 +1019,7 @@ class UDFRegistration private[sql] (functionRegistry: FunctionRegistry) extends } /** - * Register a user-defined function with 22 arguments. + * Register a deterministic Java UDF22 instance as user-defined function (UDF). * @since 1.3.0 */ def register(name: String, f: UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _], returnType: DataType): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala index 03b654f830520..40a058d2cadd2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -66,6 +66,7 @@ case class UserDefinedFunction protected[sql] ( * * @since 1.3.0 */ + @scala.annotation.varargs def apply(exprs: Column*): Column = { Column(ScalaUDF( f, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 530a525a01dec..0d11682d80a3c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -24,6 +24,7 @@ import scala.util.Try import scala.util.control.NonFatal import org.apache.spark.annotation.InterfaceStability +import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedFunction} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder @@ -32,7 +33,6 @@ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, ResolvedHint} import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.expressions.UserDefinedFunction -import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -3254,42 +3254,66 @@ object functions { */ def map_values(e: Column): Column = withExpr { MapValues(e.expr) } - 
////////////////////////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////////////////////////// - // scalastyle:off line.size.limit // scalastyle:off parameter.number /* Use the following code to generate: - (0 to 10).map { x => + + (0 to 10).foreach { x => val types = (1 to x).foldRight("RT")((i, s) => {s"A$i, $s"}) val typeTags = (1 to x).map(i => s"A$i: TypeTag").foldLeft("RT: TypeTag")(_ + ", " + _) val inputTypes = (1 to x).foldRight("Nil")((i, s) => {s"ScalaReflection.schemaFor(typeTag[A$i]).dataType :: $s"}) println(s""" - /** - * Defines a deterministic user-defined function of ${x} arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. - * - * @group udf_funcs - * @since 1.3.0 - */ - def udf[$typeTags](f: Function$x[$types]): UserDefinedFunction = { - val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] - val inputTypes = Try($inputTypes).toOption - val udf = UserDefinedFunction(f, dataType, inputTypes) - if (nullable) udf else udf.asNonNullable() - }""") + |/** + | * Defines a Scala closure of $x arguments as user-defined function (UDF). + | * The data types are automatically inferred based on the Scala closure's + | * signature. By default the returned UDF is deterministic. To change it to + | * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + | * + | * @group udf_funcs + | * @since 1.3.0 + | */ + |def udf[$typeTags](f: Function$x[$types]): UserDefinedFunction = { + | val ScalaReflection.Schema(dataType, nullable) = ScalaReflection.schemaFor[RT] + | val inputTypes = Try($inputTypes).toOption + | val udf = UserDefinedFunction(f, dataType, inputTypes) + | if (nullable) udf else udf.asNonNullable() + |}""".stripMargin) + } + + (0 to 10).foreach { i => + val extTypeArgs = (0 to i).map(_ => "_").mkString(", ") + val anyTypeArgs = (0 to i).map(_ => "Any").mkString(", ") + val anyCast = s".asInstanceOf[UDF$i[$anyTypeArgs]]" + val anyParams = (1 to i).map(_ => "_: Any").mkString(", ") + val funcCall = if (i == 0) "() => func" else "func" + println(s""" + |/** + | * Defines a Java UDF$i instance as user-defined function (UDF). + | * The caller must specify the output data type, and there is no automatic input type coercion. + | * By default the returned UDF is deterministic. To change it to nondeterministic, call the + | * API `UserDefinedFunction.asNondeterministic()`. + | * + | * @group udf_funcs + | * @since 2.3.0 + | */ + |def udf(f: UDF$i[$extTypeArgs], returnType: DataType): UserDefinedFunction = { + | val func = f$anyCast.call($anyParams) + | UserDefinedFunction($funcCall, returnType, inputTypes = None) + |}""".stripMargin) } */ + ////////////////////////////////////////////////////////////////////////////////////////////// + // Scala UDF functions + ////////////////////////////////////////////////////////////////////////////////////////////// + /** - * Defines a deterministic user-defined function of 0 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 0 arguments as user-defined function (UDF). 
+ * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3302,10 +3326,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 1 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 1 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3318,10 +3342,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 2 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 2 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3334,10 +3358,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 3 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 3 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3350,10 +3374,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 4 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 4 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3366,10 +3390,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 5 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 5 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. 
To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3382,10 +3406,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 6 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 6 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3398,10 +3422,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 7 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 7 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3414,10 +3438,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 8 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 8 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3430,10 +3454,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 9 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 9 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. * * @group udf_funcs * @since 1.3.0 @@ -3446,10 +3470,10 @@ object functions { } /** - * Defines a deterministic user-defined function of 10 arguments as user-defined - * function (UDF). The data types are automatically inferred based on the function's - * signature. To change a UDF to nondeterministic, call the API - * `UserDefinedFunction.asNondeterministic()`. + * Defines a Scala closure of 10 arguments as user-defined function (UDF). + * The data types are automatically inferred based on the Scala closure's + * signature. By default the returned UDF is deterministic. To change it to + * nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. 
* * @group udf_funcs * @since 1.3.0 @@ -3461,13 +3485,172 @@ object functions { if (nullable) udf else udf.asNonNullable() } + ////////////////////////////////////////////////////////////////////////////////////////////// + // Java UDF functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Defines a Java UDF0 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF0[_], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF0[Any]].call() + UserDefinedFunction(() => func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF1 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF1[_, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF1[Any, Any]].call(_: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF2 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF2[_, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF2[Any, Any, Any]].call(_: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF3 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF3[_, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF3[Any, Any, Any, Any]].call(_: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF4 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF4[_, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF4[Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF5 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. 
+ * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF5[_, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF5[Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF6 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF6[_, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF6[Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF7 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF7[_, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF7[Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF8 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF8[_, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF8[Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF9 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF9[_, _, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF9[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + + /** + * Defines a Java UDF10 instance as user-defined function (UDF). + * The caller must specify the output data type, and there is no automatic input type coercion. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. 
+ * + * @group udf_funcs + * @since 2.3.0 + */ + def udf(f: UDF10[_, _, _, _, _, _, _, _, _, _, _], returnType: DataType): UserDefinedFunction = { + val func = f.asInstanceOf[UDF10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]].call(_: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any, _: Any) + UserDefinedFunction(func, returnType, inputTypes = None) + } + // scalastyle:on parameter.number // scalastyle:on line.size.limit /** * Defines a deterministic user-defined function (UDF) using a Scala closure. For this variant, * the caller must specify the output data type, and there is no automatic input type coercion. - * To change a UDF to nondeterministic, call the API `UserDefinedFunction.asNondeterministic()`. + * By default the returned UDF is deterministic. To change it to nondeterministic, call the + * API `UserDefinedFunction.asNondeterministic()`. * * @param f A closure in Scala * @param dataType The output data type of the UDF diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index b007093dad84b..4f8a31f185724 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -36,6 +36,7 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.expressions.UserDefinedFunction; import org.apache.spark.sql.test.TestSparkSession; import org.apache.spark.sql.types.*; import org.apache.spark.util.sketch.BloomFilter; @@ -455,4 +456,14 @@ public void testCircularReferenceBean() { CircularReference1Bean bean = new CircularReference1Bean(); spark.createDataFrame(Arrays.asList(bean), CircularReference1Bean.class); } + + @Test + public void testUDF() { + UserDefinedFunction foo = udf((Integer i, String s) -> i.toString() + s, DataTypes.StringType); + Dataset df = spark.table("testData").select(foo.apply(col("key"), col("value"))); + String[] result = df.collectAsList().stream().map(row -> row.getString(0)).toArray(String[]::new); + String[] expected = spark.table("testData").collectAsList().stream() + .map(row -> row.get(0).toString() + row.getString(1)).toArray(String[]::new); + Assert.assertArrayEquals(expected, result); + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala index 7f1c009ca6e7a..db37be68e42e6 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UDFSuite.scala @@ -17,12 +17,13 @@ package org.apache.spark.sql +import org.apache.spark.sql.api.java._ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.command.ExplainCommand -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.functions.udf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData._ -import org.apache.spark.sql.types.DataTypes +import org.apache.spark.sql.types.{DataTypes, DoubleType} private case class FunctionResult(f1: String, f2: String) @@ -128,6 +129,13 @@ class UDFSuite extends QueryTest with SharedSQLContext { val df2 = testData.select(bar()) assert(df2.logicalPlan.asInstanceOf[Project].projectList.forall(!_.deterministic)) assert(df2.head().getDouble(0) >= 0.0) + + val javaUdf = udf(new UDF0[Double] { + override def call(): 
Double = Math.random() + }, DoubleType).asNondeterministic() + val df3 = testData.select(javaUdf()) + assert(df3.logicalPlan.asInstanceOf[Project].projectList.forall(!_.deterministic)) + assert(df3.head().getDouble(0) >= 0.0) } test("TwoArgument UDF") { From 5aadbc929cb194e06dbd3bab054a161569289af5 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 4 Jan 2018 21:07:31 +0800 Subject: [PATCH 258/356] [SPARK-22939][PYSPARK] Support Spark UDF in registerFunction ## What changes were proposed in this pull request? ```Python import random from pyspark.sql.functions import udf from pyspark.sql.types import IntegerType, StringType random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic() spark.catalog.registerFunction("random_udf", random_udf, StringType()) spark.sql("SELECT random_udf()").collect() ``` We will get the following error. ``` Py4JError: An error occurred while calling o29.__getnewargs__. Trace: py4j.Py4JException: Method __getnewargs__([]) does not exist at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318) at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326) at py4j.Gateway.invoke(Gateway.java:274) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:214) at java.lang.Thread.run(Thread.java:745) ``` This PR is to support it. ## How was this patch tested? WIP Author: gatorsmile Closes #20137 from gatorsmile/registerFunction. --- python/pyspark/sql/catalog.py | 27 +++++++++++++++---- python/pyspark/sql/context.py | 16 +++++++++--- python/pyspark/sql/tests.py | 49 +++++++++++++++++++++++++---------- python/pyspark/sql/udf.py | 21 ++++++++++----- 4 files changed, 84 insertions(+), 29 deletions(-) diff --git a/python/pyspark/sql/catalog.py b/python/pyspark/sql/catalog.py index 659bc65701a0c..156603128d063 100644 --- a/python/pyspark/sql/catalog.py +++ b/python/pyspark/sql/catalog.py @@ -227,15 +227,15 @@ def dropGlobalTempView(self, viewName): @ignore_unicode_prefix @since(2.0) def registerFunction(self, name, f, returnType=StringType()): - """Registers a python function (including lambda function) as a UDF - so it can be used in SQL statements. + """Registers a Python function (including lambda function) or a :class:`UserDefinedFunction` + as a UDF. The registered UDF can be used in SQL statement. In addition to a name and the function itself, the return type can be optionally specified. When the return type is not given it default to a string and conversion will automatically be done. For any other return type, the produced object must match the specified type. 
:param name: name of the UDF - :param f: python function + :param f: a Python function, or a wrapped/native UserDefinedFunction :param returnType: a :class:`pyspark.sql.types.DataType` object :return: a wrapped :class:`UserDefinedFunction` @@ -255,9 +255,26 @@ def registerFunction(self, name, f, returnType=StringType()): >>> _ = spark.udf.register("stringLengthInt", len, IntegerType()) >>> spark.sql("SELECT stringLengthInt('test')").collect() [Row(stringLengthInt(test)=4)] + + >>> import random + >>> from pyspark.sql.functions import udf + >>> from pyspark.sql.types import IntegerType, StringType + >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() + >>> newRandom_udf = spark.catalog.registerFunction("random_udf", random_udf, StringType()) + >>> spark.sql("SELECT random_udf()").collect() # doctest: +SKIP + [Row(random_udf()=u'82')] + >>> spark.range(1).select(newRandom_udf()).collect() # doctest: +SKIP + [Row(random_udf()=u'62')] """ - udf = UserDefinedFunction(f, returnType=returnType, name=name, - evalType=PythonEvalType.SQL_BATCHED_UDF) + + # This is to check whether the input function is a wrapped/native UserDefinedFunction + if hasattr(f, 'asNondeterministic'): + udf = UserDefinedFunction(f.func, returnType=returnType, name=name, + evalType=PythonEvalType.SQL_BATCHED_UDF, + deterministic=f.deterministic) + else: + udf = UserDefinedFunction(f, returnType=returnType, name=name, + evalType=PythonEvalType.SQL_BATCHED_UDF) self._jsparkSession.udf().registerPython(name, udf._judf) return udf._wrapped() diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index b1e723cdecef3..b8d86cc098e94 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -175,15 +175,15 @@ def range(self, start, end=None, step=1, numPartitions=None): @ignore_unicode_prefix @since(1.2) def registerFunction(self, name, f, returnType=StringType()): - """Registers a python function (including lambda function) as a UDF - so it can be used in SQL statements. + """Registers a Python function (including lambda function) or a :class:`UserDefinedFunction` + as a UDF. The registered UDF can be used in SQL statement. In addition to a name and the function itself, the return type can be optionally specified. When the return type is not given it default to a string and conversion will automatically be done. For any other return type, the produced object must match the specified type. 
:param name: name of the UDF - :param f: python function + :param f: a Python function, or a wrapped/native UserDefinedFunction :param returnType: a :class:`pyspark.sql.types.DataType` object :return: a wrapped :class:`UserDefinedFunction` @@ -203,6 +203,16 @@ def registerFunction(self, name, f, returnType=StringType()): >>> _ = sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType()) >>> sqlContext.sql("SELECT stringLengthInt('test')").collect() [Row(stringLengthInt(test)=4)] + + >>> import random + >>> from pyspark.sql.functions import udf + >>> from pyspark.sql.types import IntegerType, StringType + >>> random_udf = udf(lambda: random.randint(0, 100), IntegerType()).asNondeterministic() + >>> newRandom_udf = sqlContext.registerFunction("random_udf", random_udf, StringType()) + >>> sqlContext.sql("SELECT random_udf()").collect() # doctest: +SKIP + [Row(random_udf()=u'82')] + >>> sqlContext.range(1).select(newRandom_udf()).collect() # doctest: +SKIP + [Row(random_udf()=u'62')] """ return self.sparkSession.catalog.registerFunction(name, f, returnType) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 67bdb3d72d93b..6dc767f9ec46e 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -378,6 +378,41 @@ def test_udf2(self): [res] = self.spark.sql("SELECT strlen(a) FROM test WHERE strlen(a) > 1").collect() self.assertEqual(4, res[0]) + def test_udf3(self): + twoargs = self.spark.catalog.registerFunction( + "twoArgs", UserDefinedFunction(lambda x, y: len(x) + y), IntegerType()) + self.assertEqual(twoargs.deterministic, True) + [row] = self.spark.sql("SELECT twoArgs('test', 1)").collect() + self.assertEqual(row[0], 5) + + def test_nondeterministic_udf(self): + from pyspark.sql.functions import udf + import random + udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() + self.assertEqual(udf_random_col.deterministic, False) + df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND')) + udf_add_ten = udf(lambda rand: rand + 10, IntegerType()) + [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect() + self.assertEqual(row[0] + 10, row[1]) + + def test_nondeterministic_udf2(self): + import random + from pyspark.sql.functions import udf + random_udf = udf(lambda: random.randint(6, 6), IntegerType()).asNondeterministic() + self.assertEqual(random_udf.deterministic, False) + random_udf1 = self.spark.catalog.registerFunction("randInt", random_udf, StringType()) + self.assertEqual(random_udf1.deterministic, False) + [row] = self.spark.sql("SELECT randInt()").collect() + self.assertEqual(row[0], "6") + [row] = self.spark.range(1).select(random_udf1()).collect() + self.assertEqual(row[0], "6") + [row] = self.spark.range(1).select(random_udf()).collect() + self.assertEqual(row[0], 6) + # render_doc() reproduces the help() exception without printing output + pydoc.render_doc(udf(lambda: random.randint(6, 6), IntegerType())) + pydoc.render_doc(random_udf) + pydoc.render_doc(random_udf1) + def test_chained_udf(self): self.spark.catalog.registerFunction("double", lambda x: x + x, IntegerType()) [row] = self.spark.sql("SELECT double(1)").collect() @@ -435,15 +470,6 @@ def test_udf_with_array_type(self): self.assertEqual(list(range(3)), l1) self.assertEqual(1, l2) - def test_nondeterministic_udf(self): - from pyspark.sql.functions import udf - import random - udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() - df = 
self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND')) - udf_add_ten = udf(lambda rand: rand + 10, IntegerType()) - [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect() - self.assertEqual(row[0] + 10, row[1]) - def test_broadcast_in_udf(self): bar = {"a": "aa", "b": "bb", "c": "abc"} foo = self.sc.broadcast(bar) @@ -567,7 +593,6 @@ def test_read_multiple_orc_file(self): def test_udf_with_input_file_name(self): from pyspark.sql.functions import udf, input_file_name - from pyspark.sql.types import StringType sourceFile = udf(lambda path: path, StringType()) filePath = "python/test_support/sql/people1.json" row = self.spark.read.json(filePath).select(sourceFile(input_file_name())).first() @@ -575,7 +600,6 @@ def test_udf_with_input_file_name(self): def test_udf_with_input_file_name_for_hadooprdd(self): from pyspark.sql.functions import udf, input_file_name - from pyspark.sql.types import StringType def filename(path): return path @@ -635,7 +659,6 @@ def test_udf_with_string_return_type(self): def test_udf_shouldnt_accept_noncallable_object(self): from pyspark.sql.functions import UserDefinedFunction - from pyspark.sql.types import StringType non_callable = None self.assertRaises(TypeError, UserDefinedFunction, non_callable, StringType()) @@ -1299,7 +1322,6 @@ def test_between_function(self): df.filter(df.a.between(df.b, df.c)).collect()) def test_struct_type(self): - from pyspark.sql.types import StructType, StringType, StructField struct1 = StructType().add("f1", StringType(), True).add("f2", StringType(), True, None) struct2 = StructType([StructField("f1", StringType(), True), StructField("f2", StringType(), True, None)]) @@ -1368,7 +1390,6 @@ def test_parse_datatype_string(self): _parse_datatype_string("a INT, c DOUBLE")) def test_metadata_null(self): - from pyspark.sql.types import StructType, StringType, StructField schema = StructType([StructField("f1", StringType(), True, None), StructField("f2", StringType(), True, {'a': None})]) rdd = self.sc.parallelize([["a", "b"], ["c", "d"]]) diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 54b5a8656e1c8..5e75eb6545333 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -56,7 +56,8 @@ def _create_udf(f, returnType, evalType): ) # Set the name of the UserDefinedFunction object to be the name of function f - udf_obj = UserDefinedFunction(f, returnType=returnType, name=None, evalType=evalType) + udf_obj = UserDefinedFunction( + f, returnType=returnType, name=None, evalType=evalType, deterministic=True) return udf_obj._wrapped() @@ -67,8 +68,10 @@ class UserDefinedFunction(object): .. 
versionadded:: 1.3 """ def __init__(self, func, - returnType=StringType(), name=None, - evalType=PythonEvalType.SQL_BATCHED_UDF): + returnType=StringType(), + name=None, + evalType=PythonEvalType.SQL_BATCHED_UDF, + deterministic=True): if not callable(func): raise TypeError( "Invalid function: not a function or callable (__call__ is not defined): " @@ -92,7 +95,7 @@ def __init__(self, func, func.__name__ if hasattr(func, '__name__') else func.__class__.__name__) self.evalType = evalType - self._deterministic = True + self.deterministic = deterministic @property def returnType(self): @@ -130,7 +133,7 @@ def _create_judf(self): wrapped_func = _wrap_function(sc, self.func, self.returnType) jdt = spark._jsparkSession.parseDataType(self.returnType.json()) judf = sc._jvm.org.apache.spark.sql.execution.python.UserDefinedPythonFunction( - self._name, wrapped_func, jdt, self.evalType, self._deterministic) + self._name, wrapped_func, jdt, self.evalType, self.deterministic) return judf def __call__(self, *cols): @@ -138,6 +141,9 @@ def __call__(self, *cols): sc = SparkContext._active_spark_context return Column(judf.apply(_to_seq(sc, cols, _to_java_column))) + # This function is for improving the online help system in the interactive interpreter. + # For example, the built-in help / pydoc.help. It wraps the UDF with the docstring and + # argument annotation. (See: SPARK-19161) def _wrapped(self): """ Wrap this udf with a function and attach docstring from func @@ -162,7 +168,8 @@ def wrapper(*args): wrapper.func = self.func wrapper.returnType = self.returnType wrapper.evalType = self.evalType - wrapper.asNondeterministic = self.asNondeterministic + wrapper.deterministic = self.deterministic + wrapper.asNondeterministic = lambda: self.asNondeterministic()._wrapped() return wrapper @@ -172,5 +179,5 @@ def asNondeterministic(self): .. versionadded:: 2.3 """ - self._deterministic = False + self.deterministic = False return self From 6f68316e98fad72b171df422566e1fc9a7bbfcde Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Thu, 4 Jan 2018 21:15:10 +0800 Subject: [PATCH 259/356] [SPARK-22771][SQL] Add a missing return statement in Concat.checkInputDataTypes ## What changes were proposed in this pull request? This pr is a follow-up to fix a bug left in #19977. ## How was this patch tested? Added tests in `StringExpressionsSuite`. Author: Takeshi Yamamuro Closes #20149 from maropu/SPARK-22771-FOLLOWUP. 
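For readers unfamiliar with the pitfall this one-line fix addresses, the sketch below is a minimal standalone illustration (the `CheckResult` types and method names are made up for the example and are not Spark's classes): in Scala, an `if` with no `else` inside a method body is just an expression whose value is discarded, so without an explicit `return` the failure result computed in the branch never reaches the caller and the method's final expression wins.

```scala
// Minimal sketch of the missing-return bug and its fix; illustrative names only.
object MissingReturnSketch {
  sealed trait CheckResult
  case object CheckSuccess extends CheckResult
  final case class CheckFailure(message: String) extends CheckResult

  // Buggy shape: the CheckFailure value is computed and then silently dropped.
  def checkBuggy(inputsValid: Boolean): CheckResult = {
    if (!inputsValid) {
      CheckFailure("unsupported input types") // value of the `if`, ignored
    }
    CheckSuccess // always returned, masking the failure
  }

  // Fixed shape, mirroring the patch: an explicit `return` short-circuits the method.
  def checkFixed(inputsValid: Boolean): CheckResult = {
    if (!inputsValid) {
      return CheckFailure("unsupported input types")
    }
    CheckSuccess
  }

  def main(args: Array[String]): Unit = {
    assert(checkBuggy(inputsValid = false) == CheckSuccess) // the bug in action
    assert(checkFixed(inputsValid = false) == CheckFailure("unsupported input types"))
    assert(checkFixed(inputsValid = true) == CheckSuccess)
  }
}
```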
--- .../sql/catalyst/expressions/stringExpressions.scala | 2 +- .../expressions/StringExpressionsSuite.scala | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index b0da55a4a961b..41dc762154a4c 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -58,7 +58,7 @@ case class Concat(children: Seq[Expression]) extends Expression { } else { val childTypes = children.map(_.dataType) if (childTypes.exists(tpe => !Seq(StringType, BinaryType).contains(tpe))) { - TypeCheckResult.TypeCheckFailure( + return TypeCheckResult.TypeCheckFailure( s"input to function $prettyName should have StringType or BinaryType, but it's " + childTypes.map(_.simpleString).mkString("[", ", ", "]")) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 54cde77176e27..97ddbeba2c5ca 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -51,6 +51,18 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(Concat(strs.map(Literal.create(_, StringType))), strs.mkString, EmptyRow) } + test("SPARK-22771 Check Concat.checkInputDataTypes results") { + assert(Concat(Seq.empty[Expression]).checkInputDataTypes().isSuccess) + assert(Concat(Literal.create("a") :: Literal.create("b") :: Nil) + .checkInputDataTypes().isSuccess) + assert(Concat(Literal.create("a".getBytes) :: Literal.create("b".getBytes) :: Nil) + .checkInputDataTypes().isSuccess) + assert(Concat(Literal.create(1) :: Literal.create(2) :: Nil) + .checkInputDataTypes().isFailure) + assert(Concat(Literal.create("a") :: Literal.create("b".getBytes) :: Nil) + .checkInputDataTypes().isFailure) + } + test("concat_ws") { def testConcatWs(expected: String, sep: String, inputs: Any*): Unit = { val inputExprs = inputs.map { From 93f92c0ed7442a4382e97254307309977ff676f8 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Thu, 4 Jan 2018 11:39:42 -0800 Subject: [PATCH 260/356] [SPARK-21475][CORE][2ND ATTEMPT] Change to use NIO's Files API for external shuffle service ## What changes were proposed in this pull request? This PR is the second attempt of #18684 , NIO's Files API doesn't override `skip` method for `InputStream`, so it will bring in performance issue (mentioned in #20119). But using `FileInputStream`/`FileOutputStream` will also bring in memory issue (https://dzone.com/articles/fileinputstream-fileoutputstream-considered-harmful), which is severe for long running external shuffle service. So here in this proposal, only fixing the external shuffle service related code. ## How was this patch tested? Existing tests. Author: jerryshao Closes #20144 from jerryshao/SPARK-21475-v2. 
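As background for the change below, here is a small standalone sketch using plain JDK APIs (not Spark code; the file contents and object name are invented for the example) that contrasts the stream-style and channel-style opens the patch switches to. `java.nio.file.Files` and `FileChannel.open` avoid the finalizer overhead of `FileInputStream`/`FileOutputStream` cited in the commit message, which matters for a long-running external shuffle service.

```scala
import java.io.File
import java.nio.channels.FileChannel
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, StandardOpenOption}

object NioOpenSketch {
  def main(args: Array[String]): Unit = {
    val file = File.createTempFile("nio-sketch", ".bin")
    Files.write(file.toPath, "index".getBytes(StandardCharsets.UTF_8))

    // NIO stream open, the style ShuffleIndexInformation now uses instead of
    // new FileInputStream(indexFile).
    val in = Files.newInputStream(file.toPath)
    try {
      val buf = new Array[Byte](5)
      var read = 0
      while (read < buf.length) read += in.read(buf, read, buf.length - read)
      assert(new String(buf, StandardCharsets.UTF_8) == "index")
    } finally {
      in.close()
    }

    // NIO channel open, the style FileSegmentManagedBuffer now uses instead of
    // new FileInputStream(file).getChannel.
    val channel = FileChannel.open(file.toPath, StandardOpenOption.READ)
    try {
      assert(channel.size() == 5L)
    } finally {
      channel.close()
    }

    file.delete()
  }
}
```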
--- .../apache/spark/network/buffer/FileSegmentManagedBuffer.java | 3 ++- .../apache/spark/network/shuffle/ShuffleIndexInformation.java | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java b/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java index c20fab83c3460..8b8f9892847c3 100644 --- a/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java +++ b/common/network-common/src/main/java/org/apache/spark/network/buffer/FileSegmentManagedBuffer.java @@ -24,6 +24,7 @@ import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.nio.file.StandardOpenOption; import com.google.common.base.Objects; import com.google.common.io.ByteStreams; @@ -132,7 +133,7 @@ public Object convertToNetty() throws IOException { if (conf.lazyFileDescriptor()) { return new DefaultFileRegion(file, offset, length); } else { - FileChannel fileChannel = new FileInputStream(file).getChannel(); + FileChannel fileChannel = FileChannel.open(file.toPath(), StandardOpenOption.READ); return new DefaultFileRegion(fileChannel, offset, length); } } diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java index eacf485344b76..386738ece51a6 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/ShuffleIndexInformation.java @@ -19,10 +19,10 @@ import java.io.DataInputStream; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.LongBuffer; +import java.nio.file.Files; /** * Keeps the index information for a particular map output @@ -39,7 +39,7 @@ public ShuffleIndexInformation(File indexFile) throws IOException { offsets = buffer.asLongBuffer(); DataInputStream dis = null; try { - dis = new DataInputStream(new FileInputStream(indexFile)); + dis = new DataInputStream(Files.newInputStream(indexFile.toPath())); dis.readFully(buffer.array()); } finally { if (dis != null) { From d2cddc88eac32f26b18ec26bb59e85c6f09a8c88 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 4 Jan 2018 16:19:00 -0600 Subject: [PATCH 261/356] [SPARK-22850][CORE] Ensure queued events are delivered to all event queues. The code in LiveListenerBus was queueing events before start in the queues themselves; so in situations like the following: bus.post(someEvent) bus.addToEventLogQueue(listener) bus.start() "someEvent" would not be delivered to "listener" if that was the first listener in the queue, because the queue wouldn't exist when the event was posted. This change buffers the events before starting the bus in the bus itself, so that they can be delivered to all registered queues when the bus is started. Also tweaked the unit tests to cover the behavior above. Author: Marcelo Vanzin Closes #20039 from vanzin/SPARK-22850. 
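The buffering scheme described above can be summarized with the toy bus below; it is a deliberately simplified standalone sketch (single-threaded apart from coarse locking, strings for events, no per-listener queues), not Spark's `LiveListenerBus`. Events posted before `start()` are held in the bus itself and replayed to every registered listener when the bus starts, so a listener added after the post, the failing sequence in the commit message, still receives the event.

```scala
import scala.collection.mutable

object BufferingBusSketch {
  class Bus {
    private val listeners = mutable.ListBuffer[String => Unit]()
    // Non-null only until start(); mirrors the queuedEvents buffer added by the patch.
    private var queuedEvents: mutable.ListBuffer[String] = mutable.ListBuffer()

    def addListener(listener: String => Unit): Unit = synchronized {
      listeners += listener
    }

    def post(event: String): Unit = synchronized {
      if (queuedEvents != null) queuedEvents += event  // not started yet: buffer in the bus
      else listeners.foreach(_(event))                 // started: deliver directly
    }

    def start(): Unit = synchronized {
      queuedEvents.foreach(e => listeners.foreach(_(e))) // replay to all current listeners
      queuedEvents = null
    }
  }

  def main(args: Array[String]): Unit = {
    val seen = mutable.ListBuffer[String]()
    val bus = new Bus
    bus.post("someEvent")          // posted before any listener is registered
    bus.addListener(e => seen += e) // listener added afterwards, as in the commit message
    bus.start()
    assert(seen.toList == List("someEvent"))
  }
}
```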
--- .../spark/scheduler/LiveListenerBus.scala | 45 ++++++++++++++++--- .../spark/scheduler/SparkListenerSuite.scala | 21 +++++---- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala index 23121402b1025..ba6387a8f08ad 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/LiveListenerBus.scala @@ -62,6 +62,9 @@ private[spark] class LiveListenerBus(conf: SparkConf) { private val queues = new CopyOnWriteArrayList[AsyncEventQueue]() + // Visible for testing. + @volatile private[scheduler] var queuedEvents = new mutable.ListBuffer[SparkListenerEvent]() + /** Add a listener to queue shared by all non-internal listeners. */ def addToSharedQueue(listener: SparkListenerInterface): Unit = { addToQueue(listener, SHARED_QUEUE) @@ -125,13 +128,39 @@ private[spark] class LiveListenerBus(conf: SparkConf) { /** Post an event to all queues. */ def post(event: SparkListenerEvent): Unit = { - if (!stopped.get()) { - metrics.numEventsPosted.inc() - val it = queues.iterator() - while (it.hasNext()) { - it.next().post(event) + if (stopped.get()) { + return + } + + metrics.numEventsPosted.inc() + + // If the event buffer is null, it means the bus has been started and we can avoid + // synchronization and post events directly to the queues. This should be the most + // common case during the life of the bus. + if (queuedEvents == null) { + postToQueues(event) + return + } + + // Otherwise, need to synchronize to check whether the bus is started, to make sure the thread + // calling start() picks up the new event. + synchronized { + if (!started.get()) { + queuedEvents += event + return } } + + // If the bus was already started when the check above was made, just post directly to the + // queues. + postToQueues(event) + } + + private def postToQueues(event: SparkListenerEvent): Unit = { + val it = queues.iterator() + while (it.hasNext()) { + it.next().post(event) + } } /** @@ -149,7 +178,11 @@ private[spark] class LiveListenerBus(conf: SparkConf) { } this.sparkContext = sc - queues.asScala.foreach(_.start(sc)) + queues.asScala.foreach { q => + q.start(sc) + queuedEvents.foreach(q.post) + } + queuedEvents = null metricsSystem.registerSource(metrics) } diff --git a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala index 1beb36afa95f0..da6ecb82c7e42 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/SparkListenerSuite.scala @@ -48,7 +48,7 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match bus.metrics.metricRegistry.counter(s"queue.$SHARED_QUEUE.numDroppedEvents").getCount } - private def queueSize(bus: LiveListenerBus): Int = { + private def sharedQueueSize(bus: LiveListenerBus): Int = { bus.metrics.metricRegistry.getGauges().get(s"queue.$SHARED_QUEUE.size").getValue() .asInstanceOf[Int] } @@ -73,12 +73,11 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match val conf = new SparkConf() val counter = new BasicJobCounter val bus = new LiveListenerBus(conf) - bus.addToSharedQueue(counter) // Metrics are initially empty. 
assert(bus.metrics.numEventsPosted.getCount === 0) assert(numDroppedEvents(bus) === 0) - assert(queueSize(bus) === 0) + assert(bus.queuedEvents.size === 0) assert(eventProcessingTimeCount(bus) === 0) // Post five events: @@ -87,7 +86,10 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // Five messages should be marked as received and queued, but no messages should be posted to // listeners yet because the the listener bus hasn't been started. assert(bus.metrics.numEventsPosted.getCount === 5) - assert(queueSize(bus) === 5) + assert(bus.queuedEvents.size === 5) + + // Add the counter to the bus after messages have been queued for later delivery. + bus.addToSharedQueue(counter) assert(counter.count === 0) // Starting listener bus should flush all buffered events @@ -95,9 +97,12 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match Mockito.verify(mockMetricsSystem).registerSource(bus.metrics) bus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) assert(counter.count === 5) - assert(queueSize(bus) === 0) + assert(sharedQueueSize(bus) === 0) assert(eventProcessingTimeCount(bus) === 5) + // After the bus is started, there should be no more queued events. + assert(bus.queuedEvents === null) + // After listener bus has stopped, posting events should not increment counter bus.stop() (1 to 5).foreach { _ => bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) } @@ -188,18 +193,18 @@ class SparkListenerSuite extends SparkFunSuite with LocalSparkContext with Match // Post a message to the listener bus and wait for processing to begin: bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) listenerStarted.acquire() - assert(queueSize(bus) === 0) + assert(sharedQueueSize(bus) === 0) assert(numDroppedEvents(bus) === 0) // If we post an additional message then it should remain in the queue because the listener is // busy processing the first event: bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) - assert(queueSize(bus) === 1) + assert(sharedQueueSize(bus) === 1) assert(numDroppedEvents(bus) === 0) // The queue is now full, so any additional events posted to the listener will be dropped: bus.post(SparkListenerJobEnd(0, jobCompletionTime, JobSucceeded)) - assert(queueSize(bus) === 1) + assert(sharedQueueSize(bus) === 1) assert(numDroppedEvents(bus) === 1) // Allow the the remaining events to be processed so we can stop the listener bus: From 95f9659abe8845f9f3f42fd7ababd79e55c52489 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 4 Jan 2018 15:00:09 -0800 Subject: [PATCH 262/356] [SPARK-22948][K8S] Move SparkPodInitContainer to correct package. Author: Marcelo Vanzin Closes #20156 from vanzin/SPARK-22948. 
--- dev/sparktestsupport/modules.py | 2 +- .../spark/deploy/{rest => }/k8s/SparkPodInitContainer.scala | 2 +- .../deploy/{rest => }/k8s/SparkPodInitContainerSuite.scala | 2 +- .../docker/src/main/dockerfiles/init-container/Dockerfile | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) rename resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/{rest => }/k8s/SparkPodInitContainer.scala (99%) rename resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/{rest => }/k8s/SparkPodInitContainerSuite.scala (98%) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index f834563da9dda..7164180a6a7b0 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -539,7 +539,7 @@ def __hash__(self): kubernetes = Module( name="kubernetes", dependencies=[], - source_file_regexes=["resource-managers/kubernetes/core"], + source_file_regexes=["resource-managers/kubernetes"], build_profile_flags=["-Pkubernetes"], sbt_test_goals=["kubernetes/test"] ) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala similarity index 99% rename from resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala rename to resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala index 4a4b628aedbbf..c0f08786b76a1 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainer.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.deploy.rest.k8s +package org.apache.spark.deploy.k8s import java.io.File import java.util.concurrent.TimeUnit diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala similarity index 98% rename from resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala rename to resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala index 6c557ec4a7c9a..e0f29ecd0fb53 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/rest/k8s/SparkPodInitContainerSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
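To make the split easier to follow, below is a minimal standalone sketch of the idea using toy case classes; it does not use the fabric8 `Pod`/`Container` builders, and the names in `SecretMountSketch` only mirror the shape of the change. Volume addition happens once per pod, while volume mounting happens once per container, so mounting the same secrets into both the main container and the init-container no longer duplicates the pod-level volumes.

```scala
object SecretMountSketch {
  final case class Container(name: String, volumeMounts: List[String] = Nil)
  final case class Pod(volumes: List[String] = Nil)

  class MountSecretsBootstrap(secretNamesToMountPaths: Map[String, String]) {
    // Called once per pod: adds one secret volume per secret name.
    def addSecretVolumes(pod: Pod): Pod =
      pod.copy(volumes = pod.volumes ++ secretNamesToMountPaths.keys.map(n => s"$n-volume"))

    // Called once per container (main container, init-container, ...): mounts the volumes.
    def mountSecrets(container: Container): Container =
      container.copy(volumeMounts = container.volumeMounts ++
        secretNamesToMountPaths.map { case (n, path) => s"$n-volume -> $path" })
  }

  def main(args: Array[String]): Unit = {
    val bootstrap = new MountSecretsBootstrap(Map("spark-secret" -> "/etc/secrets"))
    val pod = bootstrap.addSecretVolumes(Pod())                  // volumes added exactly once
    val mainContainer = bootstrap.mountSecrets(Container("driver"))
    val initContainer = bootstrap.mountSecrets(Container("init"))
    assert(pod.volumes == List("spark-secret-volume"))           // no duplicated volumes
    assert(mainContainer.volumeMounts == initContainer.volumeMounts)
  }
}
```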
*/ -package org.apache.spark.deploy.rest.k8s +package org.apache.spark.deploy.k8s import java.io.File import java.util.UUID diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile index 055493188fcb7..047056ab2633b 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile @@ -21,4 +21,4 @@ FROM spark-base # command should be invoked from the top level directory of the Spark distribution. E.g.: # docker build -t spark-init:latest -f kubernetes/dockerfiles/init-container/Dockerfile . -ENTRYPOINT [ "/opt/entrypoint.sh", "/opt/spark/bin/spark-class", "org.apache.spark.deploy.rest.k8s.SparkPodInitContainer" ] +ENTRYPOINT [ "/opt/entrypoint.sh", "/opt/spark/bin/spark-class", "org.apache.spark.deploy.k8s.SparkPodInitContainer" ] From e288fc87a027ec1e1a21401d1f151df20dbfecf3 Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Thu, 4 Jan 2018 15:35:20 -0800 Subject: [PATCH 263/356] [SPARK-22953][K8S] Avoids adding duplicated secret volumes when init-container is used ## What changes were proposed in this pull request? User-specified secrets are mounted into both the main container and init-container (when it is used) in a Spark driver/executor pod, using the `MountSecretsBootstrap`. Because `MountSecretsBootstrap` always adds new secret volumes for the secrets to the pod, the same secret volumes get added twice, one when mounting the secrets to the main container, and the other when mounting the secrets to the init-container. This PR fixes the issue by separating `MountSecretsBootstrap.mountSecrets` out into two methods: `addSecretVolumes` for adding secret volumes to a pod and `mountSecrets` for mounting secret volumes to a container, respectively. `addSecretVolumes` is only called once for each pod, whereas `mountSecrets` is called individually for the main container and the init-container (if it is used). Ref: https://github.com/apache-spark-on-k8s/spark/issues/594. ## How was this patch tested? Unit tested and manually tested. vanzin This replaces https://github.com/apache/spark/pull/20148. hex108 foxish kimoonkim Author: Yinan Li Closes #20159 from liyinan926/master. 
--- .../deploy/k8s/MountSecretsBootstrap.scala | 30 ++++++++++++------- .../k8s/submit/DriverConfigOrchestrator.scala | 16 +++++----- .../steps/BasicDriverConfigurationStep.scala | 2 +- .../submit/steps/DriverMountSecretsStep.scala | 4 +-- .../InitContainerMountSecretsStep.scala | 11 +++---- .../cluster/k8s/ExecutorPodFactory.scala | 6 ++-- .../k8s/{submit => }/SecretVolumeUtils.scala | 18 +++++------ .../BasicDriverConfigurationStepSuite.scala | 4 +-- .../steps/DriverMountSecretsStepSuite.scala | 4 +-- .../InitContainerMountSecretsStepSuite.scala | 7 +---- .../cluster/k8s/ExecutorPodFactorySuite.scala | 14 +++++---- 11 files changed, 61 insertions(+), 55 deletions(-) rename resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/{submit => }/SecretVolumeUtils.scala (71%) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala index 8286546ce0641..c35e7db51d407 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala @@ -24,26 +24,36 @@ import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, Pod, PodBui private[spark] class MountSecretsBootstrap(secretNamesToMountPaths: Map[String, String]) { /** - * Mounts Kubernetes secrets as secret volumes into the given container in the given pod. + * Add new secret volumes for the secrets specified in secretNamesToMountPaths into the given pod. * * @param pod the pod into which the secret volumes are being added. - * @param container the container into which the secret volumes are being mounted. - * @return the updated pod and container with the secrets mounted. + * @return the updated pod with the secret volumes added. */ - def mountSecrets(pod: Pod, container: Container): (Pod, Container) = { + def addSecretVolumes(pod: Pod): Pod = { var podBuilder = new PodBuilder(pod) secretNamesToMountPaths.keys.foreach { name => podBuilder = podBuilder .editOrNewSpec() .addNewVolume() - .withName(secretVolumeName(name)) - .withNewSecret() - .withSecretName(name) - .endSecret() - .endVolume() + .withName(secretVolumeName(name)) + .withNewSecret() + .withSecretName(name) + .endSecret() + .endVolume() .endSpec() } + podBuilder.build() + } + + /** + * Mounts Kubernetes secret volumes of the secrets specified in secretNamesToMountPaths into the + * given container. + * + * @param container the container into which the secret volumes are being mounted. + * @return the updated container with the secrets mounted. 
+ */ + def mountSecrets(container: Container): Container = { var containerBuilder = new ContainerBuilder(container) secretNamesToMountPaths.foreach { case (name, path) => containerBuilder = containerBuilder @@ -53,7 +63,7 @@ private[spark] class MountSecretsBootstrap(secretNamesToMountPaths: Map[String, .endVolumeMount() } - (podBuilder.build(), containerBuilder.build()) + containerBuilder.build() } private def secretVolumeName(secretName: String): String = { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala index 00c9c4ee49177..c9cc300d65569 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala @@ -127,6 +127,12 @@ private[spark] class DriverConfigOrchestrator( Nil } + val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { + Seq(new DriverMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) + } else { + Nil + } + val initContainerBootstrapStep = if (existNonContainerLocalFiles(sparkJars ++ sparkFiles)) { val orchestrator = new InitContainerConfigOrchestrator( sparkJars, @@ -147,19 +153,13 @@ private[spark] class DriverConfigOrchestrator( Nil } - val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { - Seq(new DriverMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) - } else { - Nil - } - Seq( initialSubmissionStep, serviceBootstrapStep, kubernetesCredentialsStep) ++ dependencyResolutionStep ++ - initContainerBootstrapStep ++ - mountSecretsStep + mountSecretsStep ++ + initContainerBootstrapStep } private def existNonContainerLocalFiles(files: Seq[String]): Boolean = { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala index b7a69a7dfd472..eca46b84c6066 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala @@ -119,7 +119,7 @@ private[spark] class BasicDriverConfigurationStep( .endEnv() .addNewEnv() .withName(ENV_DRIVER_ARGS) - .withValue(appArgs.map(arg => "\"" + arg + "\"").mkString(" ")) + .withValue(appArgs.mkString(" ")) .endEnv() .addNewEnv() .withName(ENV_DRIVER_BIND_ADDRESS) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala index f872e0f4b65d1..91e9a9f211335 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala @@ -28,8 +28,8 @@ private[spark] class DriverMountSecretsStep( bootstrap: MountSecretsBootstrap) extends DriverConfigurationStep { override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - 
val (pod, container) = bootstrap.mountSecrets( - driverSpec.driverPod, driverSpec.driverContainer) + val pod = bootstrap.addSecretVolumes(driverSpec.driverPod) + val container = bootstrap.mountSecrets(driverSpec.driverContainer) driverSpec.copy( driverPod = pod, driverContainer = container diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala index c0e7bb20cce8c..0daa7b95e8aae 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala @@ -28,12 +28,9 @@ private[spark] class InitContainerMountSecretsStep( bootstrap: MountSecretsBootstrap) extends InitContainerConfigurationStep { override def configureInitContainer(spec: InitContainerSpec) : InitContainerSpec = { - val (driverPod, initContainer) = bootstrap.mountSecrets( - spec.driverPod, - spec.initContainer) - spec.copy( - driverPod = driverPod, - initContainer = initContainer - ) + // Mount the secret volumes given that the volumes have already been added to the driver pod + // when mounting the secrets into the main driver container. + val initContainer = bootstrap.mountSecrets(spec.initContainer) + spec.copy(initContainer = initContainer) } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index ba5d891f4c77e..066d7e9f70ca5 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -214,7 +214,7 @@ private[spark] class ExecutorPodFactory( val (maybeSecretsMountedPod, maybeSecretsMountedContainer) = mountSecretsBootstrap.map { bootstrap => - bootstrap.mountSecrets(executorPod, containerWithLimitCores) + (bootstrap.addSecretVolumes(executorPod), bootstrap.mountSecrets(containerWithLimitCores)) }.getOrElse((executorPod, containerWithLimitCores)) val (bootstrappedPod, bootstrappedContainer) = @@ -227,7 +227,9 @@ private[spark] class ExecutorPodFactory( val (pod, mayBeSecretsMountedInitContainer) = initContainerMountSecretsBootstrap.map { bootstrap => - bootstrap.mountSecrets(podWithInitContainer.pod, podWithInitContainer.initContainer) + // Mount the secret volumes given that the volumes have already been added to the + // executor pod when mounting the secrets into the main executor container. 
+ (podWithInitContainer.pod, bootstrap.mountSecrets(podWithInitContainer.initContainer)) }.getOrElse((podWithInitContainer.pod, podWithInitContainer.initContainer)) val bootstrappedPod = KubernetesUtils.appendInitContainer( diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala similarity index 71% rename from resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala rename to resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala index 8388c16ded268..16780584a674a 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/SecretVolumeUtils.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.deploy.k8s.submit +package org.apache.spark.deploy.k8s import scala.collection.JavaConverters._ @@ -22,15 +22,15 @@ import io.fabric8.kubernetes.api.model.{Container, Pod} private[spark] object SecretVolumeUtils { - def podHasVolume(driverPod: Pod, volumeName: String): Boolean = { - driverPod.getSpec.getVolumes.asScala.exists(volume => volume.getName == volumeName) + def podHasVolume(pod: Pod, volumeName: String): Boolean = { + pod.getSpec.getVolumes.asScala.exists { volume => + volume.getName == volumeName + } } - def containerHasVolume( - driverContainer: Container, - volumeName: String, - mountPath: String): Boolean = { - driverContainer.getVolumeMounts.asScala.exists(volumeMount => - volumeMount.getName == volumeName && volumeMount.getMountPath == mountPath) + def containerHasVolume(container: Container, volumeName: String, mountPath: String): Boolean = { + container.getVolumeMounts.asScala.exists { volumeMount => + volumeMount.getName == volumeName && volumeMount.getMountPath == mountPath + } } } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala index e864c6a16eeb1..8ee629ac8ddc1 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala @@ -33,7 +33,7 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { private val CONTAINER_IMAGE_PULL_POLICY = "IfNotPresent" private val APP_NAME = "spark-test" private val MAIN_CLASS = "org.apache.spark.examples.SparkPi" - private val APP_ARGS = Array("arg1", "arg2", "arg 3") + private val APP_ARGS = Array("arg1", "arg2", "\"arg 3\"") private val CUSTOM_ANNOTATION_KEY = "customAnnotation" private val CUSTOM_ANNOTATION_VALUE = "customAnnotationValue" private val DRIVER_CUSTOM_ENV_KEY1 = "customDriverEnv1" @@ -82,7 +82,7 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { assert(envs(ENV_SUBMIT_EXTRA_CLASSPATH) === "/opt/spark/spark-examples.jar") assert(envs(ENV_DRIVER_MEMORY) === "256M") assert(envs(ENV_DRIVER_MAIN_CLASS) === MAIN_CLASS) - assert(envs(ENV_DRIVER_ARGS) === "\"arg1\" \"arg2\" 
\"arg 3\"") + assert(envs(ENV_DRIVER_ARGS) === "arg1 arg2 \"arg 3\"") assert(envs(DRIVER_CUSTOM_ENV_KEY1) === "customDriverEnv1") assert(envs(DRIVER_CUSTOM_ENV_KEY2) === "customDriverEnv2") diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala index 9ec0cb55de5aa..960d0bda1d011 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala @@ -17,8 +17,8 @@ package org.apache.spark.deploy.k8s.submit.steps import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.MountSecretsBootstrap -import org.apache.spark.deploy.k8s.submit.{KubernetesDriverSpec, SecretVolumeUtils} +import org.apache.spark.deploy.k8s.{MountSecretsBootstrap, SecretVolumeUtils} +import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec class DriverMountSecretsStepSuite extends SparkFunSuite { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala index eab4e17659456..7ac0bde80dfe6 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala @@ -19,8 +19,7 @@ package org.apache.spark.deploy.k8s.submit.steps.initcontainer import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder} import org.apache.spark.SparkFunSuite -import org.apache.spark.deploy.k8s.MountSecretsBootstrap -import org.apache.spark.deploy.k8s.submit.SecretVolumeUtils +import org.apache.spark.deploy.k8s.{MountSecretsBootstrap, SecretVolumeUtils} class InitContainerMountSecretsStepSuite extends SparkFunSuite { @@ -44,12 +43,8 @@ class InitContainerMountSecretsStepSuite extends SparkFunSuite { val initContainerMountSecretsStep = new InitContainerMountSecretsStep(mountSecretsBootstrap) val configuredInitContainerSpec = initContainerMountSecretsStep.configureInitContainer( baseInitContainerSpec) - - val podWithSecretsMounted = configuredInitContainerSpec.driverPod val initContainerWithSecretsMounted = configuredInitContainerSpec.initContainer - Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach(volumeName => - assert(SecretVolumeUtils.podHasVolume(podWithSecretsMounted, volumeName))) Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach(volumeName => assert(SecretVolumeUtils.containerHasVolume( initContainerWithSecretsMounted, volumeName, SECRET_MOUNT_PATH))) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala index 7121a802c69c1..884da8aabd880 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala +++ 
b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -25,7 +25,7 @@ import org.mockito.Mockito._ import org.scalatest.{BeforeAndAfter, BeforeAndAfterEach} import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.{InitContainerBootstrap, MountSecretsBootstrap, PodWithDetachedInitContainer} +import org.apache.spark.deploy.k8s.{InitContainerBootstrap, MountSecretsBootstrap, PodWithDetachedInitContainer, SecretVolumeUtils} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ @@ -165,17 +165,19 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef val factory = new ExecutorPodFactory( conf, - None, + Some(secretsBootstrap), Some(initContainerBootstrap), Some(secretsBootstrap)) val executor = factory.createExecutorPod( "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) + assert(executor.getSpec.getVolumes.size() === 1) + assert(SecretVolumeUtils.podHasVolume(executor, "secret1-volume")) + assert(SecretVolumeUtils.containerHasVolume( + executor.getSpec.getContainers.get(0), "secret1-volume", "/var/secret1")) assert(executor.getSpec.getInitContainers.size() === 1) - assert(executor.getSpec.getInitContainers.get(0).getVolumeMounts.get(0).getName - === "secret1-volume") - assert(executor.getSpec.getInitContainers.get(0).getVolumeMounts.get(0) - .getMountPath === "/var/secret1") + assert(SecretVolumeUtils.containerHasVolume( + executor.getSpec.getInitContainers.get(0), "secret1-volume", "/var/secret1")) checkOwnerReferences(executor, driverPodUid) } From 0428368c2c5e135f99f62be20877bbbda43be310 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 4 Jan 2018 16:34:56 -0800 Subject: [PATCH 264/356] [SPARK-22960][K8S] Make build-push-docker-images.sh more dev-friendly. - Make it possible to build images from a git clone. - Make it easy to use minikube to test things. Also fixed what seemed like a bug: the base image wasn't getting the tag provided in the command line. Adding the tag allows users to use multiple Spark builds in the same kubernetes cluster. Tested by deploying images on minikube and running spark-submit from a dev environment; also by building the images with different tags and verifying "docker images" in minikube. Author: Marcelo Vanzin Closes #20154 from vanzin/SPARK-22960. --- docs/running-on-kubernetes.md | 9 +- .../src/main/dockerfiles/driver/Dockerfile | 3 +- .../src/main/dockerfiles/executor/Dockerfile | 3 +- .../dockerfiles/init-container/Dockerfile | 3 +- .../main/dockerfiles/spark-base/Dockerfile | 7 +- sbin/build-push-docker-images.sh | 120 +++++++++++++++--- 6 files changed, 117 insertions(+), 28 deletions(-) diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index e491329136a3c..2d69f636472ae 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -16,6 +16,9 @@ Kubernetes scheduler that has been added to Spark. you may setup a test cluster on your local machine using [minikube](https://kubernetes.io/docs/getting-started-guides/minikube/). * We recommend using the latest release of minikube with the DNS addon enabled. + * Be aware that the default minikube configuration is not enough for running Spark applications. + We recommend 3 CPUs and 4g of memory to be able to start a simple Spark application with a single + executor. 
* You must have appropriate permissions to list, create, edit and delete [pods](https://kubernetes.io/docs/user-guide/pods/) in your cluster. You can verify that you can list these resources by running `kubectl auth can-i pods`. @@ -197,7 +200,7 @@ kubectl port-forward 4040:4040 Then, the Spark driver UI can be accessed on `http://localhost:4040`. -### Debugging +### Debugging There may be several kinds of failures. If the Kubernetes API server rejects the request made from spark-submit, or the connection is refused for a different reason, the submission logic should indicate the error encountered. However, if there @@ -215,8 +218,8 @@ If the pod has encountered a runtime error, the status can be probed further usi kubectl logs ``` -Status and logs of failed executor pods can be checked in similar ways. Finally, deleting the driver pod will clean up the entire spark -application, includling all executors, associated service, etc. The driver pod can be thought of as the Kubernetes representation of +Status and logs of failed executor pods can be checked in similar ways. Finally, deleting the driver pod will clean up the entire spark +application, including all executors, associated service, etc. The driver pod can be thought of as the Kubernetes representation of the Spark application. ## Kubernetes Features diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile index 45fbcd9cd0deb..ff5289e10c21e 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile @@ -15,7 +15,8 @@ # limitations under the License. # -FROM spark-base +ARG base_image +FROM ${base_image} # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile index 0f806cf7e148e..3eabb42d4d852 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile @@ -15,7 +15,8 @@ # limitations under the License. # -FROM spark-base +ARG base_image +FROM ${base_image} # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile index 047056ab2633b..e0a249e0ac71f 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile @@ -15,7 +15,8 @@ # limitations under the License. # -FROM spark-base +ARG base_image +FROM ${base_image} # If this docker file is being used in the context of building your images from a Spark distribution, the docker build # command should be invoked from the top level directory of the Spark distribution. 
E.g.: diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile index 222e777db3a82..da1d6b9e161cc 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile @@ -17,6 +17,9 @@ FROM openjdk:8-alpine +ARG spark_jars +ARG img_path + # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. # If this docker file is being used in the context of building your images from a Spark @@ -34,11 +37,11 @@ RUN set -ex && \ ln -sv /bin/bash /bin/sh && \ chgrp root /etc/passwd && chmod ug+rw /etc/passwd -COPY jars /opt/spark/jars +COPY ${spark_jars} /opt/spark/jars COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin COPY conf /opt/spark/conf -COPY kubernetes/dockerfiles/spark-base/entrypoint.sh /opt/ +COPY ${img_path}/spark-base/entrypoint.sh /opt/ ENV SPARK_HOME /opt/spark diff --git a/sbin/build-push-docker-images.sh b/sbin/build-push-docker-images.sh index b3137598692d8..bb8806dd33f37 100755 --- a/sbin/build-push-docker-images.sh +++ b/sbin/build-push-docker-images.sh @@ -19,29 +19,94 @@ # This script builds and pushes docker images when run from a release of Spark # with Kubernetes support. -declare -A path=( [spark-driver]=kubernetes/dockerfiles/driver/Dockerfile \ - [spark-executor]=kubernetes/dockerfiles/executor/Dockerfile \ - [spark-init]=kubernetes/dockerfiles/init-container/Dockerfile ) +function error { + echo "$@" 1>&2 + exit 1 +} + +# Detect whether this is a git clone or a Spark distribution and adjust paths +# accordingly. +if [ -z "${SPARK_HOME}" ]; then + SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +fi +. "${SPARK_HOME}/bin/load-spark-env.sh" + +if [ -f "$SPARK_HOME/RELEASE" ]; then + IMG_PATH="kubernetes/dockerfiles" + SPARK_JARS="jars" +else + IMG_PATH="resource-managers/kubernetes/docker/src/main/dockerfiles" + SPARK_JARS="assembly/target/scala-$SPARK_SCALA_VERSION/jars" +fi + +if [ ! -d "$IMG_PATH" ]; then + error "Cannot find docker images. This script must be run from a runnable distribution of Apache Spark." +fi + +declare -A path=( [spark-driver]="$IMG_PATH/driver/Dockerfile" \ + [spark-executor]="$IMG_PATH/executor/Dockerfile" \ + [spark-init]="$IMG_PATH/init-container/Dockerfile" ) + +function image_ref { + local image="$1" + local add_repo="${2:-1}" + if [ $add_repo = 1 ] && [ -n "$REPO" ]; then + image="$REPO/$image" + fi + if [ -n "$TAG" ]; then + image="$image:$TAG" + fi + echo "$image" +} function build { - docker build -t spark-base -f kubernetes/dockerfiles/spark-base/Dockerfile . + local base_image="$(image_ref spark-base 0)" + docker build --build-arg "spark_jars=$SPARK_JARS" \ + --build-arg "img_path=$IMG_PATH" \ + -t "$base_image" \ + -f "$IMG_PATH/spark-base/Dockerfile" . for image in "${!path[@]}"; do - docker build -t ${REPO}/$image:${TAG} -f ${path[$image]} . + docker build --build-arg "base_image=$base_image" -t "$(image_ref $image)" -f ${path[$image]} . done } - function push { for image in "${!path[@]}"; do - docker push ${REPO}/$image:${TAG} + docker push "$(image_ref $image)" done } function usage { - echo "This script must be run from a runnable distribution of Apache Spark." 
- echo "Usage: ./sbin/build-push-docker-images.sh -r -t build" - echo " ./sbin/build-push-docker-images.sh -r -t push" - echo "for example: ./sbin/build-push-docker-images.sh -r docker.io/myrepo -t v2.3.0 push" + cat </dev/null; then + error "Cannot find minikube." + fi + eval $(minikube docker-env) + ;; esac done -if [ -z "$REPO" ] || [ -z "$TAG" ]; then +case "${@: -1}" in + build) + build + ;; + push) + if [ -z "$REPO" ]; then + usage + exit 1 + fi + push + ;; + *) usage -else - case "${@: -1}" in - build) build;; - push) push;; - *) usage;; - esac -fi + exit 1 + ;; +esac From df7fc3ef3899cadd252d2837092bebe3442d6523 Mon Sep 17 00:00:00 2001 From: Juliusz Sompolski Date: Fri, 5 Jan 2018 10:16:34 +0800 Subject: [PATCH 265/356] [SPARK-22957] ApproxQuantile breaks if the number of rows exceeds MaxInt ## What changes were proposed in this pull request? 32bit Int was used for row rank. That overflowed in a dataframe with more than 2B rows. ## How was this patch tested? Added test, but ignored, as it takes 4 minutes. Author: Juliusz Sompolski Closes #20152 from juliuszsompolski/SPARK-22957. --- .../aggregate/ApproximatePercentile.scala | 12 ++++++------ .../spark/sql/catalyst/util/QuantileSummaries.scala | 8 ++++---- .../org/apache/spark/sql/DataFrameStatSuite.scala | 8 ++++++++ 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala index 149ac265e6ed5..a45854a3b5146 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/ApproximatePercentile.scala @@ -296,8 +296,8 @@ object ApproximatePercentile { Ints.BYTES + Doubles.BYTES + Longs.BYTES + // length of summary.sampled Ints.BYTES + - // summary.sampled, Array[Stat(value: Double, g: Int, delta: Int)] - summaries.sampled.length * (Doubles.BYTES + Ints.BYTES + Ints.BYTES) + // summary.sampled, Array[Stat(value: Double, g: Long, delta: Long)] + summaries.sampled.length * (Doubles.BYTES + Longs.BYTES + Longs.BYTES) } final def serialize(obj: PercentileDigest): Array[Byte] = { @@ -312,8 +312,8 @@ object ApproximatePercentile { while (i < summary.sampled.length) { val stat = summary.sampled(i) buffer.putDouble(stat.value) - buffer.putInt(stat.g) - buffer.putInt(stat.delta) + buffer.putLong(stat.g) + buffer.putLong(stat.delta) i += 1 } buffer.array() @@ -330,8 +330,8 @@ object ApproximatePercentile { var i = 0 while (i < sampledLength) { val value = buffer.getDouble() - val g = buffer.getInt() - val delta = buffer.getInt() + val g = buffer.getLong() + val delta = buffer.getLong() sampled(i) = Stats(value, g, delta) i += 1 } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala index eb7941cf9e6af..b013add9c9778 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/QuantileSummaries.scala @@ -105,7 +105,7 @@ class QuantileSummaries( if (newSamples.isEmpty || (sampleIdx == sampled.length && opsIdx == sorted.length - 1)) { 0 } else { - math.floor(2 * relativeError * currentCount).toInt + math.floor(2 * relativeError * 
currentCount).toLong } val tuple = Stats(currentSample, 1, delta) @@ -192,10 +192,10 @@ class QuantileSummaries( } // Target rank - val rank = math.ceil(quantile * count).toInt + val rank = math.ceil(quantile * count).toLong val targetError = relativeError * count // Minimum rank at current sample - var minRank = 0 + var minRank = 0L var i = 0 while (i < sampled.length - 1) { val curSample = sampled(i) @@ -235,7 +235,7 @@ object QuantileSummaries { * @param g the minimum rank jump from the previous value's minimum rank * @param delta the maximum span of the rank. */ - case class Stats(value: Double, g: Int, delta: Int) + case class Stats(value: Double, g: Long, delta: Long) private def compressImmut( currentSamples: IndexedSeq[Stats], diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 46b21c3b64a2e..5169d2b5fc6b2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -260,6 +260,14 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { assert(res2(1).isEmpty) } + // SPARK-22957: check for 32bit overflow when computing rank. + // ignored - takes 4 minutes to run. + ignore("approx quantile 4: test for Int overflow") { + val res = spark.range(3000000000L).stat.approxQuantile("id", Array(0.8, 0.9), 0.05) + assert(res(0) > 2200000000.0) + assert(res(1) > 2200000000.0) + } + test("crosstab") { withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> "false") { val rng = new Random() From 52fc5c17d9d784b846149771b398e741621c0b5c Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Fri, 5 Jan 2018 14:02:21 +0800 Subject: [PATCH 266/356] [SPARK-22825][SQL] Fix incorrect results of Casting Array to String ## What changes were proposed in this pull request? This pr fixed the issue when casting arrays into strings; ``` scala> val df = spark.range(10).select('id.cast("integer")).agg(collect_list('id).as('ids)) scala> df.write.saveAsTable("t") scala> sql("SELECT cast(ids as String) FROM t").show(false) +------------------------------------------------------------------+ |ids | +------------------------------------------------------------------+ |org.apache.spark.sql.catalyst.expressions.UnsafeArrayData8bc285df| +------------------------------------------------------------------+ ``` This pr modified the result into; ``` +------------------------------+ |ids | +------------------------------+ |[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]| +------------------------------+ ``` ## How was this patch tested? Added tests in `CastSuite` and `SQLQuerySuite`. Author: Takeshi Yamamuro Closes #20024 from maropu/SPARK-22825. 
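As a side note on the formatting rule exercised by the new `CastSuite` cases, it can be sketched outside Catalyst with plain Scala collections. `ArrayCastFormatSketch` and its `format` helper below are illustrative names only, not part of the patch, and `None` stands in for a SQL NULL element.

```scala
// Minimal sketch of the element-joining rule, assuming plain Scala collections instead of
// Catalyst's ArrayData: the first element is written bare, later elements are prefixed with
// ", ", and a null element leaves nothing after its comma.
object ArrayCastFormatSketch {
  def format(elements: Seq[Option[String]]): String = {
    val sb = new StringBuilder("[")
    elements.zipWithIndex.foreach { case (e, i) =>
      if (i > 0) sb.append(",")
      e.foreach { v =>
        if (i > 0) sb.append(" ")
        sb.append(v)
      }
    }
    sb.append("]").toString
  }

  def main(args: Array[String]): Unit = {
    // Mirrors the expectations added in CastSuite below.
    assert(format(Seq(Some("1"), Some("2"), Some("3"))) == "[1, 2, 3]")
    assert(format(Seq(Some("ab"), None, Some("c"))) == "[ab,, c]")
  }
}
```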
--- .../codegen/UTF8StringBuilder.java | 78 +++++++++++++++++++ .../spark/sql/catalyst/expressions/Cast.scala | 68 ++++++++++++++++ .../sql/catalyst/expressions/CastSuite.scala | 25 ++++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 2 - 4 files changed, 171 insertions(+), 2 deletions(-) create mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java new file mode 100644 index 0000000000000..f0f66bae245fd --- /dev/null +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UTF8StringBuilder.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions.codegen; + +import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.array.ByteArrayMethods; +import org.apache.spark.unsafe.types.UTF8String; + +/** + * A helper class to write {@link UTF8String}s to an internal buffer and build the concatenated + * {@link UTF8String} at the end. + */ +public class UTF8StringBuilder { + + private static final int ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH; + + private byte[] buffer; + private int cursor = Platform.BYTE_ARRAY_OFFSET; + + public UTF8StringBuilder() { + // Since initial buffer size is 16 in `StringBuilder`, we set the same size here + this.buffer = new byte[16]; + } + + // Grows the buffer by at least `neededSize` + private void grow(int neededSize) { + if (neededSize > ARRAY_MAX - totalSize()) { + throw new UnsupportedOperationException( + "Cannot grow internal buffer by size " + neededSize + " because the size after growing " + + "exceeds size limitation " + ARRAY_MAX); + } + final int length = totalSize() + neededSize; + if (buffer.length < length) { + int newLength = length < ARRAY_MAX / 2 ? 
length * 2 : ARRAY_MAX; + final byte[] tmp = new byte[newLength]; + Platform.copyMemory( + buffer, + Platform.BYTE_ARRAY_OFFSET, + tmp, + Platform.BYTE_ARRAY_OFFSET, + totalSize()); + buffer = tmp; + } + } + + private int totalSize() { + return cursor - Platform.BYTE_ARRAY_OFFSET; + } + + public void append(UTF8String value) { + grow(value.numBytes()); + value.writeToMemory(buffer, cursor); + cursor += value.numBytes(); + } + + public void append(String value) { + append(UTF8String.fromString(value)); + } + + public UTF8String build() { + return UTF8String.fromBytes(buffer, 0, totalSize()); + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 274d8813f16db..d4fc5e0f168a7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -206,6 +206,28 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String case DateType => buildCast[Int](_, d => UTF8String.fromString(DateTimeUtils.dateToString(d))) case TimestampType => buildCast[Long](_, t => UTF8String.fromString(DateTimeUtils.timestampToString(t, timeZone))) + case ArrayType(et, _) => + buildCast[ArrayData](_, array => { + val builder = new UTF8StringBuilder + builder.append("[") + if (array.numElements > 0) { + val toUTF8String = castToString(et) + if (!array.isNullAt(0)) { + builder.append(toUTF8String(array.get(0, et)).asInstanceOf[UTF8String]) + } + var i = 1 + while (i < array.numElements) { + builder.append(",") + if (!array.isNullAt(i)) { + builder.append(" ") + builder.append(toUTF8String(array.get(i, et)).asInstanceOf[UTF8String]) + } + i += 1 + } + } + builder.append("]") + builder.build() + }) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } @@ -597,6 +619,41 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String """ } + private def writeArrayToStringBuilder( + et: DataType, + array: String, + buffer: String, + ctx: CodegenContext): String = { + val elementToStringCode = castToStringCode(et, ctx) + val funcName = ctx.freshName("elementToString") + val elementToStringFunc = ctx.addNewFunction(funcName, + s""" + |private UTF8String $funcName(${ctx.javaType(et)} element) { + | UTF8String elementStr = null; + | ${elementToStringCode("element", "elementStr", null /* resultIsNull won't be used */)} + | return elementStr; + |} + """.stripMargin) + + val loopIndex = ctx.freshName("loopIndex") + s""" + |$buffer.append("["); + |if ($array.numElements() > 0) { + | if (!$array.isNullAt(0)) { + | $buffer.append($elementToStringFunc(${ctx.getValue(array, et, "0")})); + | } + | for (int $loopIndex = 1; $loopIndex < $array.numElements(); $loopIndex++) { + | $buffer.append(","); + | if (!$array.isNullAt($loopIndex)) { + | $buffer.append(" "); + | $buffer.append($elementToStringFunc(${ctx.getValue(array, et, loopIndex)})); + | } + | } + |} + |$buffer.append("]"); + """.stripMargin + } + private[this] def castToStringCode(from: DataType, ctx: CodegenContext): CastFunction = { from match { case BinaryType => @@ -608,6 +665,17 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String val tz = ctx.addReferenceObj("timeZone", timeZone) (c, evPrim, evNull) => s"""$evPrim = UTF8String.fromString( org.apache.spark.sql.catalyst.util.DateTimeUtils.timestampToString($c, $tz));""" + case ArrayType(et, _) 
=> + (c, evPrim, evNull) => { + val buffer = ctx.freshName("buffer") + val bufferClass = classOf[UTF8StringBuilder].getName + val writeArrayElemCode = writeArrayToStringBuilder(et, c, buffer, ctx) + s""" + |$bufferClass $buffer = new $bufferClass(); + |$writeArrayElemCode; + |$evPrim = $buffer.build(); + """.stripMargin + } case _ => (c, evPrim, evNull) => s"$evPrim = UTF8String.fromString(String.valueOf($c));" } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 1dd040e4696a1..e3ed7171defd8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -853,4 +853,29 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { cast("2", LongType).genCode(ctx) assert(ctx.inlinedMutableStates.length == 0) } + + test("SPARK-22825 Cast array to string") { + val ret1 = cast(Literal.create(Array(1, 2, 3, 4, 5)), StringType) + checkEvaluation(ret1, "[1, 2, 3, 4, 5]") + val ret2 = cast(Literal.create(Array("ab", "cde", "f")), StringType) + checkEvaluation(ret2, "[ab, cde, f]") + val ret3 = cast(Literal.create(Array("ab", null, "c")), StringType) + checkEvaluation(ret3, "[ab,, c]") + val ret4 = cast(Literal.create(Array("ab".getBytes, "cde".getBytes, "f".getBytes)), StringType) + checkEvaluation(ret4, "[ab, cde, f]") + val ret5 = cast( + Literal.create(Array("2014-12-03", "2014-12-04", "2014-12-06").map(Date.valueOf)), + StringType) + checkEvaluation(ret5, "[2014-12-03, 2014-12-04, 2014-12-06]") + val ret6 = cast( + Literal.create(Array("2014-12-03 13:01:00", "2014-12-04 15:05:00").map(Timestamp.valueOf)), + StringType) + checkEvaluation(ret6, "[2014-12-03 13:01:00, 2014-12-04 15:05:00]") + val ret7 = cast(Literal.create(Array(Array(1, 2, 3), Array(4, 5))), StringType) + checkEvaluation(ret7, "[[1, 2, 3], [4, 5]]") + val ret8 = cast( + Literal.create(Array(Array(Array("a"), Array("b", "c")), Array(Array("d")))), + StringType) + checkEvaluation(ret8, "[[[a], [b, c]], [[d]]]") + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 5e077285ade55..96bf65fce9c4a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -28,8 +28,6 @@ import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart} import org.apache.spark.sql.catalyst.util.StringUtils import org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, SortAggregateExec} -import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} -import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, CartesianProductExec, SortMergeJoinExec} import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf From cf0aa65576acbe0209c67f04c029058fd73555c1 Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Thu, 4 Jan 2018 22:45:15 -0800 Subject: [PATCH 267/356] [SPARK-22949][ML] Apply CrossValidator approach to Driver/Distributed memory tradeoff for TrainValidationSplit ## What changes were proposed in this pull request? Avoid holding all models in memory for `TrainValidationSplit`. 
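The pattern being adopted can be sketched in isolation; `fit`, `evaluate` and `FitAndScoreSketch` below are stand-ins rather than the ml.tuning API. Fitting and scoring inside a single `Future` per parameter setting means each model becomes unreachable as soon as its metric is produced, instead of all models being held until training finishes.

```scala
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

object FitAndScoreSketch {
  case class Model(param: Double)               // stand-in for Model[_]
  def fit(param: Double): Model = Model(param)  // stand-in for est.fit(trainingDataset, paramMap)
  def evaluate(m: Model): Double = m.param      // stand-in for eval.evaluate(model.transform(...))

  def main(args: Array[String]): Unit = {
    implicit val ec: ExecutionContext = ExecutionContext.global
    val paramMaps = Seq(0.01, 0.1, 1.0)
    val metricFutures = paramMaps.map { p =>
      // Fit and evaluate in one Future; only the Double metric escapes the closure,
      // so the fitted model can be garbage-collected once the Future completes.
      Future { evaluate(fit(p)) }
    }
    val metrics = metricFutures.map(f => Await.result(f, Duration.Inf))
    println(metrics.mkString(", "))
  }
}
```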
## How was this patch tested? Existing tests. Author: Bago Amirbekian Closes #20143 from MrBago/trainValidMemoryFix. --- .../spark/ml/tuning/CrossValidator.scala | 4 +++- .../spark/ml/tuning/TrainValidationSplit.scala | 18 ++++-------------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala index 095b54c0fe83f..a0b507d2e718c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/CrossValidator.scala @@ -160,8 +160,10 @@ class CrossValidator @Since("1.2.0") (@Since("1.4.0") override val uid: String) } (executionContext) } - // Wait for metrics to be calculated before unpersisting validation dataset + // Wait for metrics to be calculated val foldMetrics = foldMetricFutures.map(ThreadUtils.awaitResult(_, Duration.Inf)) + + // Unpersist training & validation set once all metrics have been produced trainingDataset.unpersist() validationDataset.unpersist() foldMetrics diff --git a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala index c73bd18475475..8826ef3271bc1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tuning/TrainValidationSplit.scala @@ -143,24 +143,13 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St // Fit models in a Future for training in parallel logDebug(s"Train split with multiple sets of parameters.") - val modelFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) => - Future[Model[_]] { + val metricFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) => + Future[Double] { val model = est.fit(trainingDataset, paramMap).asInstanceOf[Model[_]] if (collectSubModelsParam) { subModels.get(paramIndex) = model } - model - } (executionContext) - } - - // Unpersist training data only when all models have trained - Future.sequence[Model[_], Iterable](modelFutures)(implicitly, executionContext) - .onComplete { _ => trainingDataset.unpersist() } (executionContext) - - // Evaluate models in a Future that will calulate a metric and allow model to be cleaned up - val metricFutures = modelFutures.zip(epm).map { case (modelFuture, paramMap) => - modelFuture.map { model => // TODO: duplicate evaluator to take extra params from input val metric = eval.evaluate(model.transform(validationDataset, paramMap)) logDebug(s"Got metric $metric for model trained with $paramMap.") @@ -171,7 +160,8 @@ class TrainValidationSplit @Since("1.5.0") (@Since("1.5.0") override val uid: St // Wait for all metrics to be calculated val metrics = metricFutures.map(ThreadUtils.awaitResult(_, Duration.Inf)) - // Unpersist validation set once all metrics have been produced + // Unpersist training & validation set once all metrics have been produced + trainingDataset.unpersist() validationDataset.unpersist() logInfo(s"Train validation split metrics: ${metrics.toSeq}") From 6cff7d19f6a905fe425bd6892fe7ca014c0e696b Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Thu, 4 Jan 2018 23:23:41 -0800 Subject: [PATCH 268/356] [SPARK-22757][K8S] Enable spark.jars and spark.files in KUBERNETES mode ## What changes were proposed in this pull request? We missed enabling `spark.files` and `spark.jars` in https://github.com/apache/spark/pull/19954. 
The result is that remote dependencies specified through `spark.files` or `spark.jars` are not included in the list of remote dependencies to be downloaded by the init-container. This PR fixes it. ## How was this patch tested? Manual tests. vanzin This replaces https://github.com/apache/spark/pull/20157. foxish Author: Yinan Li Closes #20160 from liyinan926/SPARK-22757. --- .../src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index cbe1f2c3e08a1..1e381965c52ba 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -584,10 +584,11 @@ object SparkSubmit extends CommandLineUtils with Logging { confKey = "spark.executor.memory"), OptionAssigner(args.totalExecutorCores, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.cores.max"), - OptionAssigner(args.files, LOCAL | STANDALONE | MESOS, ALL_DEPLOY_MODES, + OptionAssigner(args.files, LOCAL | STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.files"), OptionAssigner(args.jars, LOCAL, CLIENT, confKey = "spark.jars"), - OptionAssigner(args.jars, STANDALONE | MESOS, ALL_DEPLOY_MODES, confKey = "spark.jars"), + OptionAssigner(args.jars, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, + confKey = "spark.jars"), OptionAssigner(args.driverMemory, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER, confKey = "spark.driver.memory"), OptionAssigner(args.driverCores, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER, From 51c33bd0d402af9e0284c6cbc0111f926446bfba Mon Sep 17 00:00:00 2001 From: Adrian Ionescu Date: Fri, 5 Jan 2018 21:32:39 +0800 Subject: [PATCH 269/356] [SPARK-22961][REGRESSION] Constant columns should generate QueryPlanConstraints ## What changes were proposed in this pull request? #19201 introduced the following regression: given something like `df.withColumn("c", lit(2))`, we're no longer picking up `c === 2` as a constraint and infer filters from it when joins are involved, which may lead to noticeable performance degradation. This patch re-enables this optimization by picking up Aliases of Literals in Projection lists as constraints and making sure they're not treated as aliased columns. ## How was this patch tested? Unit test was added. Author: Adrian Ionescu Closes #20155 from adrian-ionescu/constant_constraints. 
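A quick way to observe the regression and the fix, assuming a live SparkSession such as the one spark-shell provides; the column names and sizes below are made up for illustration, and the plan output is paraphrased rather than verbatim.

```scala
import org.apache.spark.sql.functions.lit

val left  = spark.range(100).toDF("a")
val right = spark.range(100).toDF("b").withColumn("two", lit(2))

// Join on the constant column. With the constraint two = 2 restored for the aliased literal,
// InferFiltersFromConstraints can also derive a = 2 (plus IsNotNull) for the left side.
val joined = left.join(right, left("a") === right("two"))
joined.explain(true)  // compare the optimized plans with and without this patch
```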
--- .../sql/catalyst/plans/logical/LogicalPlan.scala | 2 ++ .../plans/logical/QueryPlanConstraints.scala | 2 +- .../InferFiltersFromConstraintsSuite.scala | 13 +++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index a38458add7b5e..ff2a0ec588567 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -247,6 +247,8 @@ abstract class UnaryNode extends LogicalPlan { protected def getAliasedConstraints(projectList: Seq[NamedExpression]): Set[Expression] = { var allConstraints = child.constraints.asInstanceOf[Set[Expression]] projectList.foreach { + case a @ Alias(l: Literal, _) => + allConstraints += EqualTo(a.toAttribute, l) case a @ Alias(e, _) => // For every alias in `projectList`, replace the reference in constraints by its attribute. allConstraints ++= allConstraints.map(_ transform { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala index b0f611fd38dea..9c0a30a47f839 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/QueryPlanConstraints.scala @@ -98,7 +98,7 @@ trait QueryPlanConstraints { self: LogicalPlan => // we may avoid producing recursive constraints. private lazy val aliasMap: AttributeMap[Expression] = AttributeMap( expressions.collect { - case a: Alias => (a.toAttribute, a.child) + case a: Alias if !a.child.isInstanceOf[Literal] => (a.toAttribute, a.child) } ++ children.flatMap(_.asInstanceOf[QueryPlanConstraints].aliasMap)) // Note: the explicit cast is necessary, since Scala compiler fails to infer the type. 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala index 5580f8604ec72..a0708bf7eee9a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/InferFiltersFromConstraintsSuite.scala @@ -236,4 +236,17 @@ class InferFiltersFromConstraintsSuite extends PlanTest { comparePlans(optimized, originalQuery) } } + + test("constraints should be inferred from aliased literals") { + val originalLeft = testRelation.subquery('left).as("left") + val optimizedLeft = testRelation.subquery('left).where(IsNotNull('a) && 'a === 2).as("left") + + val right = Project(Seq(Literal(2).as("two")), testRelation.subquery('right)).as("right") + val condition = Some("left.a".attr === "right.two".attr) + + val original = originalLeft.join(right, Inner, condition) + val correct = optimizedLeft.join(right, Inner, condition) + + comparePlans(Optimize.execute(original.analyze), correct.analyze) + } } From c0b7424ecacb56d3e7a18acc11ba3d5e7be57c43 Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Fri, 5 Jan 2018 09:58:28 -0800 Subject: [PATCH 270/356] [SPARK-22940][SQL] HiveExternalCatalogVersionsSuite should succeed on platforms that don't have wget ## What changes were proposed in this pull request? Modified HiveExternalCatalogVersionsSuite.scala to use Utils.doFetchFile to download different versions of Spark binaries rather than launching wget as an external process. On platforms that don't have wget installed, this suite fails with an error. cloud-fan : would you like to check this change? ## How was this patch tested? 1) test-only of HiveExternalCatalogVersionsSuite on several platforms. Tested bad mirror, read timeout, and redirects. 2) ./dev/run-tests Author: Bruce Robbins Closes #20147 from bersprockets/SPARK-22940-alt. --- .../HiveExternalCatalogVersionsSuite.scala | 48 ++++++++++++++++--- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index a3d5b941a6761..ae4aeb7b4ce4a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -18,11 +18,14 @@ package org.apache.spark.sql.hive import java.io.File -import java.nio.file.Files +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Paths} import scala.sys.process._ -import org.apache.spark.TestUtils +import org.apache.hadoop.conf.Configuration + +import org.apache.spark.{SecurityManager, SparkConf, TestUtils} import org.apache.spark.sql.{QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTableType @@ -55,14 +58,19 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { private def tryDownloadSpark(version: String, path: String): Unit = { // Try mirrors a few times until one succeeds for (i <- 0 until 3) { + // we don't retry on a failure to get mirror url. 
If we can't get a mirror url, + // the test fails (getStringFromUrl will throw an exception) val preferredMirror = - Seq("wget", "https://www.apache.org/dyn/closer.lua?preferred=true", "-q", "-O", "-").!!.trim - val url = s"$preferredMirror/spark/spark-$version/spark-$version-bin-hadoop2.7.tgz" + getStringFromUrl("https://www.apache.org/dyn/closer.lua?preferred=true") + val filename = s"spark-$version-bin-hadoop2.7.tgz" + val url = s"$preferredMirror/spark/spark-$version/$filename" logInfo(s"Downloading Spark $version from $url") - if (Seq("wget", url, "-q", "-P", path).! == 0) { + try { + getFileFromUrl(url, path, filename) return + } catch { + case ex: Exception => logWarning(s"Failed to download Spark $version from $url", ex) } - logWarning(s"Failed to download Spark $version from $url") } fail(s"Unable to download Spark $version") } @@ -85,6 +93,34 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { new File(tmpDataDir, name).getCanonicalPath } + private def getFileFromUrl(urlString: String, targetDir: String, filename: String): Unit = { + val conf = new SparkConf + // if the caller passes the name of an existing file, we want doFetchFile to write over it with + // the contents from the specified url. + conf.set("spark.files.overwrite", "true") + val securityManager = new SecurityManager(conf) + val hadoopConf = new Configuration + + val outDir = new File(targetDir) + if (!outDir.exists()) { + outDir.mkdirs() + } + + // propagate exceptions up to the caller of getFileFromUrl + Utils.doFetchFile(urlString, outDir, filename, conf, securityManager, hadoopConf) + } + + private def getStringFromUrl(urlString: String): String = { + val contentFile = File.createTempFile("string-", ".txt") + contentFile.deleteOnExit() + + // exceptions will propagate to the caller of getStringFromUrl + getFileFromUrl(urlString, contentFile.getParent, contentFile.getName) + + val contentPath = Paths.get(contentFile.toURI) + new String(Files.readAllBytes(contentPath), StandardCharsets.UTF_8) + } + override def beforeAll(): Unit = { super.beforeAll() From 930b90a84871e2504b57ed50efa7b8bb52d3ba44 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Fri, 5 Jan 2018 11:51:25 -0800 Subject: [PATCH 271/356] [SPARK-13030][ML] Follow-up cleanups for OneHotEncoderEstimator ## What changes were proposed in this pull request? Follow-up cleanups for the OneHotEncoderEstimator PR. See some discussion in the original PR: https://github.com/apache/spark/pull/19527 or read below for what this PR includes: * configedCategorySize: I reverted this to return an Array. I realized the original setup (which I had recommended in the original PR) caused the whole model to be serialized in the UDF. * encoder: I reorganized the logic to show what I meant in the comment in the previous PR. I think it's simpler but am open to suggestions. I also made some small style cleanups based on IntelliJ warnings. ## How was this patch tested? Existing unit tests Author: Joseph K. Bradley Closes #20132 from jkbradley/viirya-SPARK-13030. 
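The closure-capture point in the first bullet can be illustrated in isolation; the class and member names below are invented for the sketch, not taken from the ML code. Referencing a field inside a `udf` drags the entire enclosing instance into the serialized closure, whereas copying the field to a local `val` first captures only the array.

```scala
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf

class Encoderish(val sizes: Array[Int]) extends Serializable {
  // References this.sizes, so the whole Encoderish instance rides along with the UDF.
  def heavyEncoder: UserDefinedFunction = udf { (colIdx: Int) => sizes(colIdx) }

  // Copies the field to a local val; only the Array[Int] is captured and serialized.
  def lightEncoder: UserDefinedFunction = {
    val localSizes = sizes
    udf { (colIdx: Int) => localSizes(colIdx) }
  }
}
```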
--- .../ml/feature/OneHotEncoderEstimator.scala | 92 ++++++++++--------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala index 074622d41e28d..bd1e3426c8780 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoderEstimator.scala @@ -30,24 +30,27 @@ import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.{col, lit, udf} -import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType} +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} /** Private trait for params and common methods for OneHotEncoderEstimator and OneHotEncoderModel */ private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid with HasInputCols with HasOutputCols { /** - * Param for how to handle invalid data. + * Param for how to handle invalid data during transform(). * Options are 'keep' (invalid data presented as an extra categorical feature) or * 'error' (throw an error). + * Note that this Param is only used during transform; during fitting, invalid data + * will result in an error. * Default: "error" * @group param */ @Since("2.3.0") override val handleInvalid: Param[String] = new Param[String](this, "handleInvalid", - "How to handle invalid data " + + "How to handle invalid data during transform(). " + "Options are 'keep' (invalid data presented as an extra categorical feature) " + - "or error (throw an error).", + "or error (throw an error). Note that this Param is only used during transform; " + + "during fitting, invalid data will result in an error.", ParamValidators.inArray(OneHotEncoderEstimator.supportedHandleInvalids)) setDefault(handleInvalid, OneHotEncoderEstimator.ERROR_INVALID) @@ -66,10 +69,11 @@ private[ml] trait OneHotEncoderBase extends Params with HasHandleInvalid def getDropLast: Boolean = $(dropLast) protected def validateAndTransformSchema( - schema: StructType, dropLast: Boolean, keepInvalid: Boolean): StructType = { + schema: StructType, + dropLast: Boolean, + keepInvalid: Boolean): StructType = { val inputColNames = $(inputCols) val outputColNames = $(outputCols) - val existingFields = schema.fields require(inputColNames.length == outputColNames.length, s"The number of input columns ${inputColNames.length} must be the same as the number of " + @@ -197,6 +201,10 @@ object OneHotEncoderEstimator extends DefaultParamsReadable[OneHotEncoderEstimat override def load(path: String): OneHotEncoderEstimator = super.load(path) } +/** + * @param categorySizes Original number of categories for each feature being encoded. + * The array contains one value for each input column, in order. + */ @Since("2.3.0") class OneHotEncoderModel private[ml] ( @Since("2.3.0") override val uid: String, @@ -205,60 +213,58 @@ class OneHotEncoderModel private[ml] ( import OneHotEncoderModel._ - // Returns the category size for a given index with `dropLast` and `handleInvalid` + // Returns the category size for each index with `dropLast` and `handleInvalid` // taken into account. 
- private def configedCategorySize(orgCategorySize: Int, idx: Int): Int = { + private def getConfigedCategorySizes: Array[Int] = { val dropLast = getDropLast val keepInvalid = getHandleInvalid == OneHotEncoderEstimator.KEEP_INVALID if (!dropLast && keepInvalid) { // When `handleInvalid` is "keep", an extra category is added as last category // for invalid data. - orgCategorySize + 1 + categorySizes.map(_ + 1) } else if (dropLast && !keepInvalid) { // When `dropLast` is true, the last category is removed. - orgCategorySize - 1 + categorySizes.map(_ - 1) } else { // When `dropLast` is true and `handleInvalid` is "keep", the extra category for invalid // data is removed. Thus, it is the same as the plain number of categories. - orgCategorySize + categorySizes } } private def encoder: UserDefinedFunction = { - val oneValue = Array(1.0) - val emptyValues = Array.empty[Double] - val emptyIndices = Array.empty[Int] - val dropLast = getDropLast - val handleInvalid = getHandleInvalid - val keepInvalid = handleInvalid == OneHotEncoderEstimator.KEEP_INVALID + val keepInvalid = getHandleInvalid == OneHotEncoderEstimator.KEEP_INVALID + val configedSizes = getConfigedCategorySizes + val localCategorySizes = categorySizes // The udf performed on input data. The first parameter is the input value. The second - // parameter is the index of input. - udf { (label: Double, idx: Int) => - val plainNumCategories = categorySizes(idx) - val size = configedCategorySize(plainNumCategories, idx) - - if (label < 0) { - throw new SparkException(s"Negative value: $label. Input can't be negative.") - } else if (label == size && dropLast && !keepInvalid) { - // When `dropLast` is true and `handleInvalid` is not "keep", - // the last category is removed. - Vectors.sparse(size, emptyIndices, emptyValues) - } else if (label >= plainNumCategories && keepInvalid) { - // When `handleInvalid` is "keep", encodes invalid data to last category (and removed - // if `dropLast` is true) - if (dropLast) { - Vectors.sparse(size, emptyIndices, emptyValues) + // parameter is the index in inputCols of the column being encoded. + udf { (label: Double, colIdx: Int) => + val origCategorySize = localCategorySizes(colIdx) + // idx: index in vector of the single 1-valued element + val idx = if (label >= 0 && label < origCategorySize) { + label + } else { + if (keepInvalid) { + origCategorySize } else { - Vectors.sparse(size, Array(size - 1), oneValue) + if (label < 0) { + throw new SparkException(s"Negative value: $label. Input can't be negative. " + + s"To handle invalid values, set Param handleInvalid to " + + s"${OneHotEncoderEstimator.KEEP_INVALID}") + } else { + throw new SparkException(s"Unseen value: $label. To handle unseen values, " + + s"set Param handleInvalid to ${OneHotEncoderEstimator.KEEP_INVALID}.") + } } - } else if (label < plainNumCategories) { - Vectors.sparse(size, Array(label.toInt), oneValue) + } + + val size = configedSizes(colIdx) + if (idx < size) { + Vectors.sparse(size, Array(idx.toInt), Array(1.0)) } else { - assert(handleInvalid == OneHotEncoderEstimator.ERROR_INVALID) - throw new SparkException(s"Unseen value: $label. 
To handle unseen values, " + - s"set Param handleInvalid to ${OneHotEncoderEstimator.KEEP_INVALID}.") + Vectors.sparse(size, Array.empty[Int], Array.empty[Double]) } } } @@ -282,7 +288,6 @@ class OneHotEncoderModel private[ml] ( @Since("2.3.0") override def transformSchema(schema: StructType): StructType = { val inputColNames = $(inputCols) - val outputColNames = $(outputCols) require(inputColNames.length == categorySizes.length, s"The number of input columns ${inputColNames.length} must be the same as the number of " + @@ -300,6 +305,7 @@ class OneHotEncoderModel private[ml] ( * account. Mismatched numbers will cause exception. */ private def verifyNumOfValues(schema: StructType): StructType = { + val configedSizes = getConfigedCategorySizes $(outputCols).zipWithIndex.foreach { case (outputColName, idx) => val inputColName = $(inputCols)(idx) val attrGroup = AttributeGroup.fromStructField(schema(outputColName)) @@ -308,9 +314,9 @@ class OneHotEncoderModel private[ml] ( // comparing with expected category number with `handleInvalid` and // `dropLast` taken into account. if (attrGroup.attributes.nonEmpty) { - val numCategories = configedCategorySize(categorySizes(idx), idx) + val numCategories = configedSizes(idx) require(attrGroup.size == numCategories, "OneHotEncoderModel expected " + - s"$numCategories categorical values for input column ${inputColName}, " + + s"$numCategories categorical values for input column $inputColName, " + s"but the input column had metadata specifying ${attrGroup.size} values.") } } @@ -322,7 +328,7 @@ class OneHotEncoderModel private[ml] ( val transformedSchema = transformSchema(dataset.schema, logging = true) val keepInvalid = $(handleInvalid) == OneHotEncoderEstimator.KEEP_INVALID - val encodedColumns = (0 until $(inputCols).length).map { idx => + val encodedColumns = $(inputCols).indices.map { idx => val inputColName = $(inputCols)(idx) val outputColName = $(outputCols)(idx) From ea956833017fcbd8ed2288368bfa2e417a2251c5 Mon Sep 17 00:00:00 2001 From: Gera Shegalov Date: Fri, 5 Jan 2018 17:25:28 -0800 Subject: [PATCH 272/356] [SPARK-22914][DEPLOY] Register history.ui.port ## What changes were proposed in this pull request? Register spark.history.ui.port as a known spark conf to be used in substitution expressions even if it's not set explicitly. ## How was this patch tested? Added unit test to demonstrate the issue Author: Gera Shegalov Author: Gera Shegalov Closes #20098 from gerashegalov/gera/register-SHS-port-conf. 
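For illustration, a minimal Scala sketch of the substitution this change enables (the host name and application IDs are hypothetical and mirror the `ApplicationMasterSuite` test added below):

```scala
import org.apache.spark.SparkConf

// The history address template mixes a Hadoop variable with a Spark conf. With
// spark.history.ui.port registered as a known conf, the second placeholder now
// resolves to its default (18080) even when the conf is never set explicitly.
val conf = new SparkConf()
  .set("spark.yarn.historyServer.address",
    "http://${hadoopconf-yarn.resourcemanager.hostname}:${spark.history.ui.port}")
// After Hadoop and Spark variable substitution on YARN, the history URL becomes e.g.:
//   http://rm.host.com:18080/history/application_123_1/application_123_1_1
```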
--- .../spark/deploy/history/HistoryServer.scala | 3 +- .../apache/spark/deploy/history/config.scala | 5 +++ .../spark/deploy/yarn/ApplicationMaster.scala | 17 +++++--- .../deploy/yarn/ApplicationMasterSuite.scala | 43 +++++++++++++++++++ 4 files changed, 62 insertions(+), 6 deletions(-) create mode 100644 resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala diff --git a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala index 75484f5c9f30f..0ec4afad0308c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/HistoryServer.scala @@ -28,6 +28,7 @@ import org.eclipse.jetty.servlet.{ServletContextHandler, ServletHolder} import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil +import org.apache.spark.deploy.history.config.HISTORY_SERVER_UI_PORT import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ import org.apache.spark.status.api.v1.{ApiRootResource, ApplicationInfo, UIRoot} @@ -276,7 +277,7 @@ object HistoryServer extends Logging { .newInstance(conf) .asInstanceOf[ApplicationHistoryProvider] - val port = conf.getInt("spark.history.ui.port", 18080) + val port = conf.get(HISTORY_SERVER_UI_PORT) val server = new HistoryServer(conf, provider, securityManager, port) server.bind() diff --git a/core/src/main/scala/org/apache/spark/deploy/history/config.scala b/core/src/main/scala/org/apache/spark/deploy/history/config.scala index 22b6d49d8e2a4..efdbf672bb52f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/config.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/config.scala @@ -44,4 +44,9 @@ private[spark] object config { .bytesConf(ByteUnit.BYTE) .createWithDefaultString("10g") + val HISTORY_SERVER_UI_PORT = ConfigBuilder("spark.history.ui.port") + .doc("Web UI port to bind Spark History Server") + .intConf + .createWithDefault(18080) + } diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index b2576b0d72633..4d5e3bb043671 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -427,11 +427,8 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends uiAddress: Option[String]) = { val appId = client.getAttemptId().getApplicationId().toString() val attemptId = client.getAttemptId().getAttemptId().toString() - val historyAddress = - _sparkConf.get(HISTORY_SERVER_ADDRESS) - .map { text => SparkHadoopUtil.get.substituteHadoopVariables(text, yarnConf) } - .map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" } - .getOrElse("") + val historyAddress = ApplicationMaster + .getHistoryServerAddress(_sparkConf, yarnConf, appId, attemptId) val driverUrl = RpcEndpointAddress( _sparkConf.get("spark.driver.host"), @@ -834,6 +831,16 @@ object ApplicationMaster extends Logging { master.getAttemptId } + private[spark] def getHistoryServerAddress( + sparkConf: SparkConf, + yarnConf: YarnConfiguration, + appId: String, + attemptId: String): String = { + sparkConf.get(HISTORY_SERVER_ADDRESS) + .map { text => 
SparkHadoopUtil.get.substituteHadoopVariables(text, yarnConf) } + .map { address => s"${address}${HistoryServer.UI_PATH_PREFIX}/${appId}/${attemptId}" } + .getOrElse("") + } } /** diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala new file mode 100644 index 0000000000000..695a82f3583e6 --- /dev/null +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ApplicationMasterSuite.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.deploy.yarn + +import org.apache.hadoop.yarn.conf.YarnConfiguration + +import org.apache.spark.{SparkConf, SparkFunSuite} + +class ApplicationMasterSuite extends SparkFunSuite { + + test("history url with hadoop and spark substitutions") { + val host = "rm.host.com" + val port = 18080 + val sparkConf = new SparkConf() + + sparkConf.set("spark.yarn.historyServer.address", + "http://${hadoopconf-yarn.resourcemanager.hostname}:${spark.history.ui.port}") + val yarnConf = new YarnConfiguration() + yarnConf.set("yarn.resourcemanager.hostname", host) + val appId = "application_123_1" + val attemptId = appId + "_1" + + val shsAddr = ApplicationMaster + .getHistoryServerAddress(sparkConf, yarnConf, appId, attemptId) + + assert(shsAddr === s"http://${host}:${port}/history/${appId}/${attemptId}") + } +} From e8af7e8aeca15a6107248f358d9514521ffdc6d3 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Sat, 6 Jan 2018 09:26:03 +0800 Subject: [PATCH 273/356] [SPARK-22937][SQL] SQL elt output binary for binary inputs ## What changes were proposed in this pull request? This pr modified `elt` to output binary for binary inputs. `elt` in the current master always output data as a string. But, in some databases (e.g., MySQL), if all inputs are binary, `elt` also outputs binary (Also, this might be a small surprise). This pr is related to #19977. ## How was this patch tested? Added tests in `SQLQueryTestSuite` and `TypeCoercionSuite`. Author: Takeshi Yamamuro Closes #20135 from maropu/SPARK-22937. 
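For quick reference, a sketch of the behavior change in a `spark-shell` session (assumes an active `SparkSession` named `spark`; the inputs are illustrative and mirror the new `elt.sql` tests below):

```scala
// All-binary inputs: elt now yields a BinaryType column instead of StringType.
spark.sql("SELECT elt(1, encode('a', 'utf-8'), encode('b', 'utf-8')) AS col").printSchema()
// root
//  |-- col: binary (nullable = true)

// The pre-2.3 behavior can be restored through the new flag introduced here.
spark.sql("SET spark.sql.function.eltOutputAsString=true")
spark.sql("SELECT elt(1, encode('a', 'utf-8'), encode('b', 'utf-8')) AS col").printSchema()
// root
//  |-- col: string (nullable = true)
```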
--- docs/sql-programming-guide.md | 2 + .../sql/catalyst/analysis/TypeCoercion.scala | 29 +++++ .../expressions/stringExpressions.scala | 46 ++++--- .../apache/spark/sql/internal/SQLConf.scala | 8 ++ .../catalyst/analysis/TypeCoercionSuite.scala | 54 ++++++++ .../inputs/typeCoercion/native/elt.sql | 44 +++++++ .../results/typeCoercion/native/elt.sql.out | 115 ++++++++++++++++++ 7 files changed, 281 insertions(+), 17 deletions(-) create mode 100644 sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql create mode 100644 sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index dc3e384008d27..b50f9360b866c 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1783,6 +1783,8 @@ options. - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. + - Since Spark 2.3, when all inputs are binary, SQL `elt()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.eltOutputAsString` to `true`. + ## Upgrading From Spark SQL 2.1 to 2.2 - Spark 2.1.1 introduced a new configuration key: `spark.sql.hive.caseSensitiveInferenceMode`. It had a default setting of `NEVER_INFER`, which kept behavior identical to 2.1.0. However, Spark 2.2.0 changes this setting's default value to `INFER_AND_SAVE` to restore compatibility with reading Hive metastore tables whose underlying file schema have mixed-case column names. With the `INFER_AND_SAVE` configuration value, on first access Spark will perform schema inference on any Hive metastore table for which it has not already saved an inferred schema. Note that schema inference can be a very time consuming operation for tables with thousands of partitions. If compatibility with mixed-case column names is not a concern, you can safely set `spark.sql.hive.caseSensitiveInferenceMode` to `NEVER_INFER` to avoid the initial overhead of schema inference. Note that with the new default `INFER_AND_SAVE` setting, the results of the schema inference are saved as a metastore key for future use. Therefore, the initial schema inference occurs only at a table's first access. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index e9436367c7e2e..e8669c4637d06 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -54,6 +54,7 @@ object TypeCoercion { BooleanEquality :: FunctionArgumentConversion :: ConcatCoercion(conf) :: + EltCoercion(conf) :: CaseWhenCoercion :: IfCoercion :: StackCoercion :: @@ -684,6 +685,34 @@ object TypeCoercion { } } + /** + * Coerces the types of [[Elt]] children to expected ones. + * + * If `spark.sql.function.eltOutputAsString` is false and all children types are binary, + * the expected types are binary. Otherwise, the expected ones are strings. 
+ */ + case class EltCoercion(conf: SQLConf) extends TypeCoercionRule { + + override protected def coerceTypes(plan: LogicalPlan): LogicalPlan = plan transform { case p => + p transformExpressionsUp { + // Skip nodes if unresolved or not enough children + case c @ Elt(children) if !c.childrenResolved || children.size < 2 => c + case c @ Elt(children) => + val index = children.head + val newIndex = ImplicitTypeCasts.implicitCast(index, IntegerType).getOrElse(index) + val newInputs = if (conf.eltOutputAsString || + !children.tail.map(_.dataType).forall(_ == BinaryType)) { + children.tail.map { e => + ImplicitTypeCasts.implicitCast(e, StringType).getOrElse(e) + } + } else { + children.tail + } + c.copy(children = newIndex +: newInputs) + } + } + } + /** * Turns Add/Subtract of DateType/TimestampType/StringType and CalendarIntervalType * to TimeAdd/TimeSub diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 41dc762154a4c..e004bfc6af473 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -271,33 +271,45 @@ case class ConcatWs(children: Seq[Expression]) } } +/** + * An expression that returns the `n`-th input in given inputs. + * If all inputs are binary, `elt` returns an output as binary. Otherwise, it returns as string. + * If any input is null, `elt` returns null. + */ // scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(n, str1, str2, ...) - Returns the `n`-th string, e.g., returns `str2` when `n` is 2.", + usage = "_FUNC_(n, input1, input2, ...) - Returns the `n`-th input, e.g., returns `input2` when `n` is 2.", examples = """ Examples: > SELECT _FUNC_(1, 'scala', 'java'); scala """) // scalastyle:on line.size.limit -case class Elt(children: Seq[Expression]) - extends Expression with ImplicitCastInputTypes { +case class Elt(children: Seq[Expression]) extends Expression { private lazy val indexExpr = children.head - private lazy val stringExprs = children.tail.toArray + private lazy val inputExprs = children.tail.toArray /** This expression is always nullable because it returns null if index is out of range. 
*/ override def nullable: Boolean = true - override def dataType: DataType = StringType - - override def inputTypes: Seq[DataType] = IntegerType +: Seq.fill(children.size - 1)(StringType) + override def dataType: DataType = inputExprs.map(_.dataType).headOption.getOrElse(StringType) override def checkInputDataTypes(): TypeCheckResult = { if (children.size < 2) { TypeCheckResult.TypeCheckFailure("elt function requires at least two arguments") } else { - super[ImplicitCastInputTypes].checkInputDataTypes() + val (indexType, inputTypes) = (indexExpr.dataType, inputExprs.map(_.dataType)) + if (indexType != IntegerType) { + return TypeCheckResult.TypeCheckFailure(s"first input to function $prettyName should " + + s"have IntegerType, but it's $indexType") + } + if (inputTypes.exists(tpe => !Seq(StringType, BinaryType).contains(tpe))) { + return TypeCheckResult.TypeCheckFailure( + s"input to function $prettyName should have StringType or BinaryType, but it's " + + inputTypes.map(_.simpleString).mkString("[", ", ", "]")) + } + TypeUtils.checkForSameTypeInputExpr(inputTypes, s"function $prettyName") } } @@ -307,27 +319,27 @@ case class Elt(children: Seq[Expression]) null } else { val index = indexObj.asInstanceOf[Int] - if (index <= 0 || index > stringExprs.length) { + if (index <= 0 || index > inputExprs.length) { null } else { - stringExprs(index - 1).eval(input) + inputExprs(index - 1).eval(input) } } } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val index = indexExpr.genCode(ctx) - val strings = stringExprs.map(_.genCode(ctx)) + val inputs = inputExprs.map(_.genCode(ctx)) val indexVal = ctx.freshName("index") val indexMatched = ctx.freshName("eltIndexMatched") - val stringVal = ctx.addMutableState(ctx.javaType(dataType), "stringVal") + val inputVal = ctx.addMutableState(ctx.javaType(dataType), "inputVal") - val assignStringValue = strings.zipWithIndex.map { case (eval, index) => + val assignInputValue = inputs.zipWithIndex.map { case (eval, index) => s""" |if ($indexVal == ${index + 1}) { | ${eval.code} - | $stringVal = ${eval.isNull} ? null : ${eval.value}; + | $inputVal = ${eval.isNull} ? null : ${eval.value}; | $indexMatched = true; | continue; |} @@ -335,7 +347,7 @@ case class Elt(children: Seq[Expression]) } val codes = ctx.splitExpressionsWithCurrentInputs( - expressions = assignStringValue, + expressions = assignInputValue, funcName = "eltFunc", extraArguments = ("int", indexVal) :: Nil, returnType = ctx.JAVA_BOOLEAN, @@ -361,11 +373,11 @@ case class Elt(children: Seq[Expression]) |${index.code} |final int $indexVal = ${index.value}; |${ctx.JAVA_BOOLEAN} $indexMatched = false; - |$stringVal = null; + |$inputVal = null; |do { | $codes |} while (false); - |final UTF8String ${ev.value} = $stringVal; + |final ${ctx.javaType(dataType)} ${ev.value} = $inputVal; |final boolean ${ev.isNull} = ${ev.value} == null; """.stripMargin) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 5d6edf6b8abec..80b8965e084a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -1052,6 +1052,12 @@ object SQLConf { .booleanConf .createWithDefault(false) + val ELT_OUTPUT_AS_STRING = buildConf("spark.sql.function.eltOutputAsString") + .doc("When this option is set to false and all inputs are binary, `elt` returns " + + "an output as binary. 
Otherwise, it returns as a string. ") + .booleanConf + .createWithDefault(false) + val CONTINUOUS_STREAMING_EXECUTOR_QUEUE_SIZE = buildConf("spark.sql.streaming.continuous.executorQueueSize") .internal() @@ -1412,6 +1418,8 @@ class SQLConf extends Serializable with Logging { def concatBinaryAsString: Boolean = getConf(CONCAT_BINARY_AS_STRING) + def eltOutputAsString: Boolean = getConf(ELT_OUTPUT_AS_STRING) + def partitionOverwriteMode: PartitionOverwriteMode.Value = PartitionOverwriteMode.withName(getConf(PARTITION_OVERWRITE_MODE)) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala index 3661530cd622b..52a7ebdafd7c7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercionSuite.scala @@ -923,6 +923,60 @@ class TypeCoercionSuite extends AnalysisTest { } } + test("type coercion for Elt") { + val rule = TypeCoercion.EltCoercion(conf) + + ruleTest(rule, + Elt(Seq(Literal(1), Literal("ab"), Literal("cde"))), + Elt(Seq(Literal(1), Literal("ab"), Literal("cde")))) + ruleTest(rule, + Elt(Seq(Literal(1.toShort), Literal("ab"), Literal("cde"))), + Elt(Seq(Cast(Literal(1.toShort), IntegerType), Literal("ab"), Literal("cde")))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(null), Literal("abc"))), + Elt(Seq(Literal(2), Cast(Literal(null), StringType), Literal("abc")))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(1), Literal("234"))), + Elt(Seq(Literal(2), Cast(Literal(1), StringType), Literal("234")))) + ruleTest(rule, + Elt(Seq(Literal(3), Literal(1L), Literal(2.toByte), Literal(0.1))), + Elt(Seq(Literal(3), Cast(Literal(1L), StringType), Cast(Literal(2.toByte), StringType), + Cast(Literal(0.1), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(true), Literal(0.1f), Literal(3.toShort))), + Elt(Seq(Literal(2), Cast(Literal(true), StringType), Cast(Literal(0.1f), StringType), + Cast(Literal(3.toShort), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(1L), Literal(0.1))), + Elt(Seq(Literal(1), Cast(Literal(1L), StringType), Cast(Literal(0.1), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(Decimal(10)))), + Elt(Seq(Literal(1), Cast(Literal(Decimal(10)), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(BigDecimal.valueOf(10)))), + Elt(Seq(Literal(1), Cast(Literal(BigDecimal.valueOf(10)), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(1), Literal(java.math.BigDecimal.valueOf(10)))), + Elt(Seq(Literal(1), Cast(Literal(java.math.BigDecimal.valueOf(10)), StringType)))) + ruleTest(rule, + Elt(Seq(Literal(2), Literal(new java.sql.Date(0)), Literal(new Timestamp(0)))), + Elt(Seq(Literal(2), Cast(Literal(new java.sql.Date(0)), StringType), + Cast(Literal(new Timestamp(0)), StringType)))) + + withSQLConf("spark.sql.function.eltOutputAsString" -> "true") { + ruleTest(rule, + Elt(Seq(Literal(1), Literal("123".getBytes), Literal("456".getBytes))), + Elt(Seq(Literal(1), Cast(Literal("123".getBytes), StringType), + Cast(Literal("456".getBytes), StringType)))) + } + + withSQLConf("spark.sql.function.eltOutputAsString" -> "false") { + ruleTest(rule, + Elt(Seq(Literal(1), Literal("123".getBytes), Literal("456".getBytes))), + Elt(Seq(Literal(1), Literal("123".getBytes), Literal("456".getBytes)))) + } + } + test("BooleanEquality type cast") { val be = TypeCoercion.BooleanEquality // Use 
something more than a literal to avoid triggering the simplification rules. diff --git a/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql new file mode 100644 index 0000000000000..717616f91db05 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/typeCoercion/native/elt.sql @@ -0,0 +1,44 @@ +-- Mixed inputs (output type is string) +SELECT elt(2, col1, col2, col3, col4, col5) col +FROM ( + SELECT + 'prefix_' col1, + id col2, + string(id + 1) col3, + encode(string(id + 2), 'utf-8') col4, + CAST(id AS DOUBLE) col5 + FROM range(10) +); + +SELECT elt(3, col1, col2, col3, col4) col +FROM ( + SELECT + string(id) col1, + string(id + 1) col2, + encode(string(id + 2), 'utf-8') col3, + encode(string(id + 3), 'utf-8') col4 + FROM range(10) +); + +-- turn on eltOutputAsString +set spark.sql.function.eltOutputAsString=true; + +SELECT elt(1, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +); + +-- turn off eltOutputAsString +set spark.sql.function.eltOutputAsString=false; + +-- Elt binary inputs (output type is binary) +SELECT elt(2, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +); diff --git a/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out new file mode 100644 index 0000000000000..b62e1b6826045 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/typeCoercion/native/elt.sql.out @@ -0,0 +1,115 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 6 + + +-- !query 0 +SELECT elt(2, col1, col2, col3, col4, col5) col +FROM ( + SELECT + 'prefix_' col1, + id col2, + string(id + 1) col3, + encode(string(id + 2), 'utf-8') col4, + CAST(id AS DOUBLE) col5 + FROM range(10) +) +-- !query 0 schema +struct +-- !query 0 output +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query 1 +SELECT elt(3, col1, col2, col3, col4) col +FROM ( + SELECT + string(id) col1, + string(id + 1) col2, + encode(string(id + 2), 'utf-8') col3, + encode(string(id + 3), 'utf-8') col4 + FROM range(10) +) +-- !query 1 schema +struct +-- !query 1 output +10 +11 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query 2 +set spark.sql.function.eltOutputAsString=true +-- !query 2 schema +struct +-- !query 2 output +spark.sql.function.eltOutputAsString true + + +-- !query 3 +SELECT elt(1, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +) +-- !query 3 schema +struct +-- !query 3 output +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 + + +-- !query 4 +set spark.sql.function.eltOutputAsString=false +-- !query 4 schema +struct +-- !query 4 output +spark.sql.function.eltOutputAsString false + + +-- !query 5 +SELECT elt(2, col1, col2) col +FROM ( + SELECT + encode(string(id), 'utf-8') col1, + encode(string(id + 1), 'utf-8') col2 + FROM range(10) +) +-- !query 5 schema +struct +-- !query 5 output +1 +10 +2 +3 +4 +5 +6 +7 +8 +9 From bf65cd3cda46d5480bfcd13110975c46ca631972 Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Fri, 5 Jan 2018 17:29:27 -0800 Subject: [PATCH 274/356] [SPARK-22960][K8S] Revert use of ARG base_image in images ## What changes were proposed in this pull request? 
This PR reverts the `ARG base_image` before `FROM` in the images of driver, executor, and init-container, introduced in https://github.com/apache/spark/pull/20154. The reason is Docker versions before 17.06 do not support this use (`ARG` before `FROM`). ## How was this patch tested? Tested manually. vanzin foxish kimoonkim Author: Yinan Li Closes #20170 from liyinan926/master. --- .../docker/src/main/dockerfiles/driver/Dockerfile | 3 +-- .../docker/src/main/dockerfiles/executor/Dockerfile | 3 +-- .../docker/src/main/dockerfiles/init-container/Dockerfile | 3 +-- sbin/build-push-docker-images.sh | 8 ++++---- 4 files changed, 7 insertions(+), 10 deletions(-) diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile index ff5289e10c21e..45fbcd9cd0deb 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile @@ -15,8 +15,7 @@ # limitations under the License. # -ARG base_image -FROM ${base_image} +FROM spark-base # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile index 3eabb42d4d852..0f806cf7e148e 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile @@ -15,8 +15,7 @@ # limitations under the License. # -ARG base_image -FROM ${base_image} +FROM spark-base # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile index e0a249e0ac71f..047056ab2633b 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile @@ -15,8 +15,7 @@ # limitations under the License. # -ARG base_image -FROM ${base_image} +FROM spark-base # If this docker file is being used in the context of building your images from a Spark distribution, the docker build # command should be invoked from the top level directory of the Spark distribution. E.g.: diff --git a/sbin/build-push-docker-images.sh b/sbin/build-push-docker-images.sh index bb8806dd33f37..b9532597419a5 100755 --- a/sbin/build-push-docker-images.sh +++ b/sbin/build-push-docker-images.sh @@ -60,13 +60,13 @@ function image_ref { } function build { - local base_image="$(image_ref spark-base 0)" - docker build --build-arg "spark_jars=$SPARK_JARS" \ + docker build \ + --build-arg "spark_jars=$SPARK_JARS" \ --build-arg "img_path=$IMG_PATH" \ - -t "$base_image" \ + -t spark-base \ -f "$IMG_PATH/spark-base/Dockerfile" . for image in "${!path[@]}"; do - docker build --build-arg "base_image=$base_image" -t "$(image_ref $image)" -f ${path[$image]} . + docker build -t "$(image_ref $image)" -f ${path[$image]} . 
done } From f2dd8b923759e8771b0e5f59bfa7ae4ad7e6a339 Mon Sep 17 00:00:00 2001 From: Li Jin Date: Sat, 6 Jan 2018 16:11:20 +0800 Subject: [PATCH 275/356] [SPARK-22930][PYTHON][SQL] Improve the description of Vectorized UDFs for non-deterministic cases ## What changes were proposed in this pull request? Add tests for using non deterministic UDFs in aggregate. Update pandas_udf docstring w.r.t to determinism. ## How was this patch tested? test_nondeterministic_udf_in_aggregate Author: Li Jin Closes #20142 from icexelloss/SPARK-22930-pandas-udf-deterministic. --- python/pyspark/sql/functions.py | 12 +++++++- python/pyspark/sql/tests.py | 52 +++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index a4ed562ad48b4..733e32bd825b0 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2214,7 +2214,17 @@ def pandas_udf(f=None, returnType=None, functionType=None): .. seealso:: :meth:`pyspark.sql.GroupedData.apply` - .. note:: The user-defined function must be deterministic. + .. note:: The user-defined functions are considered deterministic by default. Due to + optimization, duplicate invocations may be eliminated or the function may even be invoked + more times than it is present in the query. If your function is not deterministic, call + `asNondeterministic` on the user defined function. E.g.: + + >>> @pandas_udf('double', PandasUDFType.SCALAR) # doctest: +SKIP + ... def random(v): + ... import numpy as np + ... import pandas as pd + ... return pd.Series(np.random.randn(len(v)) + >>> random = random.asNondeterministic() # doctest: +SKIP .. note:: The user-defined functions do not support conditional expressions or short curcuiting in boolean expressions and it ends up with being executed all internally. 
If the functions diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 6dc767f9ec46e..689736d8e6456 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -386,6 +386,7 @@ def test_udf3(self): self.assertEqual(row[0], 5) def test_nondeterministic_udf(self): + # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations from pyspark.sql.functions import udf import random udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() @@ -413,6 +414,18 @@ def test_nondeterministic_udf2(self): pydoc.render_doc(random_udf) pydoc.render_doc(random_udf1) + def test_nondeterministic_udf_in_aggregate(self): + from pyspark.sql.functions import udf, sum + import random + udf_random_col = udf(lambda: int(100 * random.random()), 'int').asNondeterministic() + df = self.spark.range(10) + + with QuietTest(self.sc): + with self.assertRaisesRegexp(AnalysisException, "nondeterministic"): + df.groupby('id').agg(sum(udf_random_col())).collect() + with self.assertRaisesRegexp(AnalysisException, "nondeterministic"): + df.agg(sum(udf_random_col())).collect() + def test_chained_udf(self): self.spark.catalog.registerFunction("double", lambda x: x + x, IntegerType()) [row] = self.spark.sql("SELECT double(1)").collect() @@ -3567,6 +3580,18 @@ def tearDownClass(cls): time.tzset() ReusedSQLTestCase.tearDownClass() + @property + def random_udf(self): + from pyspark.sql.functions import pandas_udf + + @pandas_udf('double') + def random_udf(v): + import pandas as pd + import numpy as np + return pd.Series(np.random.random(len(v))) + random_udf = random_udf.asNondeterministic() + return random_udf + def test_vectorized_udf_basic(self): from pyspark.sql.functions import pandas_udf, col df = self.spark.range(10).select( @@ -3950,6 +3975,33 @@ def test_vectorized_udf_timestamps_respect_session_timezone(self): finally: self.spark.conf.set("spark.sql.session.timeZone", orig_tz) + def test_nondeterministic_udf(self): + # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations + from pyspark.sql.functions import udf, pandas_udf, col + + @pandas_udf('double') + def plus_ten(v): + return v + 10 + random_udf = self.random_udf + + df = self.spark.range(10).withColumn('rand', random_udf(col('id'))) + result1 = df.withColumn('plus_ten(rand)', plus_ten(df['rand'])).toPandas() + + self.assertEqual(random_udf.deterministic, False) + self.assertTrue(result1['plus_ten(rand)'].equals(result1['rand'] + 10)) + + def test_nondeterministic_udf_in_aggregate(self): + from pyspark.sql.functions import pandas_udf, sum + + df = self.spark.range(10) + random_udf = self.random_udf + + with QuietTest(self.sc): + with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): + df.groupby(df.id).agg(sum(random_udf(df.id))).collect() + with self.assertRaisesRegexp(AnalysisException, 'nondeterministic'): + df.agg(sum(random_udf(df.id))).collect() + @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class GroupbyApplyTests(ReusedSQLTestCase): From be9a804f2ef77a5044d3da7d9374976daf59fc16 Mon Sep 17 00:00:00 2001 From: zuotingbing Date: Sat, 6 Jan 2018 18:07:45 +0800 Subject: [PATCH 276/356] [SPARK-22793][SQL] Memory leak in Spark Thrift Server # What changes were proposed in this pull request? 1. Start HiveThriftServer2. 2. Connect to thriftserver through beeline. 3. Close the beeline. 4. repeat step2 and step 3 for many times. 
we found there are many directories never be dropped under the path `hive.exec.local.scratchdir` and `hive.exec.scratchdir`, as we know the scratchdir has been added to deleteOnExit when it be created. So it means that the cache size of FileSystem `deleteOnExit` will keep increasing until JVM terminated. In addition, we use `jmap -histo:live [PID]` to printout the size of objects in HiveThriftServer2 Process, we can find the object `org.apache.spark.sql.hive.client.HiveClientImpl` and `org.apache.hadoop.hive.ql.session.SessionState` keep increasing even though we closed all the beeline connections, which may caused the leak of Memory. # How was this patch tested? manual tests This PR follw-up the https://github.com/apache/spark/pull/19989 Author: zuotingbing Closes #20029 from zuotingbing/SPARK-22793. --- .../org/apache/spark/sql/hive/HiveSessionStateBuilder.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala index 92cb4ef11c9e3..dc92ad3b0c1ac 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveSessionStateBuilder.scala @@ -42,7 +42,7 @@ class HiveSessionStateBuilder(session: SparkSession, parentState: Option[Session * Create a Hive aware resource loader. */ override protected lazy val resourceLoader: HiveSessionResourceLoader = { - val client: HiveClient = externalCatalog.client.newSession() + val client: HiveClient = externalCatalog.client new HiveSessionResourceLoader(session, client) } From 7b78041423b6ee330def2336dfd1ff9ae8469c59 Mon Sep 17 00:00:00 2001 From: fjh100456 Date: Sat, 6 Jan 2018 18:19:57 +0800 Subject: [PATCH 277/356] [SPARK-21786][SQL] When acquiring 'compressionCodecClassName' in 'ParquetOptions', `parquet.compression` needs to be considered. [SPARK-21786][SQL] When acquiring 'compressionCodecClassName' in 'ParquetOptions', `parquet.compression` needs to be considered. ## What changes were proposed in this pull request? Since Hive 1.1, Hive allows users to set parquet compression codec via table-level properties parquet.compression. See the JIRA: https://issues.apache.org/jira/browse/HIVE-7858 . We do support orc.compression for ORC. Thus, for external users, it is more straightforward to support both. See the stackflow question: https://stackoverflow.com/questions/36941122/spark-sql-ignores-parquet-compression-propertie-specified-in-tblproperties In Spark side, our table-level compression conf compression was added by #11464 since Spark 2.0. We need to support both table-level conf. Users might also use session-level conf spark.sql.parquet.compression.codec. The priority rule will be like If other compression codec configuration was found through hive or parquet, the precedence would be compression, parquet.compression, spark.sql.parquet.compression.codec. Acceptable values include: none, uncompressed, snappy, gzip, lzo. The rule for Parquet is consistent with the ORC after the change. Changes: 1.Increased acquiring 'compressionCodecClassName' from `parquet.compression`,and the precedence order is `compression`,`parquet.compression`,`spark.sql.parquet.compression.codec`, just like what we do in `OrcOptions`. 
2.Change `spark.sql.parquet.compression.codec` to support "none".Actually in `ParquetOptions`,we do support "none" as equivalent to "uncompressed", but it does not allowed to configured to "none". 3.Change `compressionCode` to `compressionCodecClassName`. ## How was this patch tested? Add test. Author: fjh100456 Closes #20076 from fjh100456/ParquetOptionIssue. --- docs/sql-programming-guide.md | 6 +- .../apache/spark/sql/internal/SQLConf.scala | 14 +- .../datasources/parquet/ParquetOptions.scala | 12 +- ...rquetCompressionCodecPrecedenceSuite.scala | 122 ++++++++++++++++++ 4 files changed, 145 insertions(+), 9 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index b50f9360b866c..3ccaaf4d5b1fa 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -953,8 +953,10 @@ Configuration of Parquet can be done using the `setConf` method on `SparkSession
    diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 80b8965e084a2..7d1217de254a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -325,11 +325,13 @@ object SQLConf { .createWithDefault(false) val PARQUET_COMPRESSION = buildConf("spark.sql.parquet.compression.codec") - .doc("Sets the compression codec use when writing Parquet files. Acceptable values include: " + - "uncompressed, snappy, gzip, lzo.") + .doc("Sets the compression codec used when writing Parquet files. If either `compression` or" + + "`parquet.compression` is specified in the table-specific options/properties, the precedence" + + "would be `compression`, `parquet.compression`, `spark.sql.parquet.compression.codec`." + + "Acceptable values include: none, uncompressed, snappy, gzip, lzo.") .stringConf .transform(_.toLowerCase(Locale.ROOT)) - .checkValues(Set("uncompressed", "snappy", "gzip", "lzo")) + .checkValues(Set("none", "uncompressed", "snappy", "gzip", "lzo")) .createWithDefault("snappy") val PARQUET_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.parquet.filterPushdown") @@ -366,8 +368,10 @@ object SQLConf { .createWithDefault(true) val ORC_COMPRESSION = buildConf("spark.sql.orc.compression.codec") - .doc("Sets the compression codec use when writing ORC files. Acceptable values include: " + - "none, uncompressed, snappy, zlib, lzo.") + .doc("Sets the compression codec used when writing ORC files. If either `compression` or" + + "`orc.compress` is specified in the table-specific options/properties, the precedence" + + "would be `compression`, `orc.compress`, `spark.sql.orc.compression.codec`." + + "Acceptable values include: none, uncompressed, snappy, zlib, lzo.") .stringConf .transform(_.toLowerCase(Locale.ROOT)) .checkValues(Set("none", "uncompressed", "snappy", "zlib", "lzo")) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala index 772d4565de548..ef67ea7d17cea 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetOptions.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.util.Locale +import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap @@ -42,8 +43,15 @@ private[parquet] class ParquetOptions( * Acceptable values are defined in [[shortParquetCompressionCodecNames]]. */ val compressionCodecClassName: String = { - val codecName = parameters.getOrElse("compression", - sqlConf.parquetCompressionCodec).toLowerCase(Locale.ROOT) + // `compression`, `parquet.compression`(i.e., ParquetOutputFormat.COMPRESSION), and + // `spark.sql.parquet.compression.codec` + // are in order of precedence from highest to lowest. 
+ val parquetCompressionConf = parameters.get(ParquetOutputFormat.COMPRESSION) + val codecName = parameters + .get("compression") + .orElse(parquetCompressionConf) + .getOrElse(sqlConf.parquetCompressionCodec) + .toLowerCase(Locale.ROOT) if (!shortParquetCompressionCodecNames.contains(codecName)) { val availableCodecs = shortParquetCompressionCodecNames.keys.map(_.toLowerCase(Locale.ROOT)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala new file mode 100644 index 0000000000000..ed8fd2b453456 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetCompressionCodecPrecedenceSuite.scala @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.io.File + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.fs.Path +import org.apache.parquet.hadoop.ParquetOutputFormat + +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSQLContext + +class ParquetCompressionCodecPrecedenceSuite extends ParquetTest with SharedSQLContext { + test("Test `spark.sql.parquet.compression.codec` config") { + Seq("NONE", "UNCOMPRESSED", "SNAPPY", "GZIP", "LZO").foreach { c => + withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> c) { + val expected = if (c == "NONE") "UNCOMPRESSED" else c + val option = new ParquetOptions(Map.empty[String, String], spark.sessionState.conf) + assert(option.compressionCodecClassName == expected) + } + } + } + + test("[SPARK-21786] Test Acquiring 'compressionCodecClassName' for parquet in right order.") { + // When "compression" is configured, it should be the first choice. + withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> "snappy") { + val props = Map("compression" -> "uncompressed", ParquetOutputFormat.COMPRESSION -> "gzip") + val option = new ParquetOptions(props, spark.sessionState.conf) + assert(option.compressionCodecClassName == "UNCOMPRESSED") + } + + // When "compression" is not configured, "parquet.compression" should be the preferred choice. + withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> "snappy") { + val props = Map(ParquetOutputFormat.COMPRESSION -> "gzip") + val option = new ParquetOptions(props, spark.sessionState.conf) + assert(option.compressionCodecClassName == "GZIP") + } + + // When both "compression" and "parquet.compression" are not configured, + // spark.sql.parquet.compression.codec should be the right choice. 
+ withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> "snappy") { + val props = Map.empty[String, String] + val option = new ParquetOptions(props, spark.sessionState.conf) + assert(option.compressionCodecClassName == "SNAPPY") + } + } + + private def getTableCompressionCodec(path: String): Seq[String] = { + val hadoopConf = spark.sessionState.newHadoopConf() + val codecs = for { + footer <- readAllFootersWithoutSummaryFiles(new Path(path), hadoopConf) + block <- footer.getParquetMetadata.getBlocks.asScala + column <- block.getColumns.asScala + } yield column.getCodec.name() + codecs.distinct + } + + private def createTableWithCompression( + tableName: String, + isPartitioned: Boolean, + compressionCodec: String, + rootDir: File): Unit = { + val options = + s""" + |OPTIONS('path'='${rootDir.toURI.toString.stripSuffix("/")}/$tableName', + |'parquet.compression'='$compressionCodec') + """.stripMargin + val partitionCreate = if (isPartitioned) "PARTITIONED BY (p)" else "" + sql( + s""" + |CREATE TABLE $tableName USING Parquet $options $partitionCreate + |AS SELECT 1 AS col1, 2 AS p + """.stripMargin) + } + + private def checkCompressionCodec(compressionCodec: String, isPartitioned: Boolean): Unit = { + withTempDir { tmpDir => + val tempTableName = "TempParquetTable" + withTable(tempTableName) { + createTableWithCompression(tempTableName, isPartitioned, compressionCodec, tmpDir) + val partitionPath = if (isPartitioned) "p=2" else "" + val path = s"${tmpDir.getPath.stripSuffix("/")}/$tempTableName/$partitionPath" + val realCompressionCodecs = getTableCompressionCodec(path) + assert(realCompressionCodecs.forall(_ == compressionCodec)) + } + } + } + + test("Create parquet table with compression") { + Seq(true, false).foreach { isPartitioned => + Seq("UNCOMPRESSED", "SNAPPY", "GZIP").foreach { compressionCodec => + checkCompressionCodec(compressionCodec, isPartitioned) + } + } + } + + test("Create table with unknown compression") { + Seq(true, false).foreach { isPartitioned => + val exception = intercept[IllegalArgumentException] { + checkCompressionCodec("aa", isPartitioned) + } + assert(exception.getMessage.contains("Codec [aa] is not available")) + } + } +} From 993f21567a1dd33e43ef9a626e0ddfbe46f83f93 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 6 Jan 2018 23:08:26 +0800 Subject: [PATCH 278/356] [SPARK-22901][PYTHON][FOLLOWUP] Adds the doc for asNondeterministic for wrapped UDF function ## What changes were proposed in this pull request? This PR wraps the `asNondeterministic` attribute in the wrapped UDF function to set the docstring properly. ```python from pyspark.sql.functions import udf help(udf(lambda x: x).asNondeterministic) ``` Before: ``` Help on function in module pyspark.sql.udf: lambda (END ``` After: ``` Help on function asNondeterministic in module pyspark.sql.udf: asNondeterministic() Updates UserDefinedFunction to nondeterministic. .. versionadded:: 2.3 (END) ``` ## How was this patch tested? Manually tested and a simple test was added. Author: hyukjinkwon Closes #20173 from HyukjinKwon/SPARK-22901-followup. 
--- python/pyspark/sql/tests.py | 1 + python/pyspark/sql/udf.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 689736d8e6456..122a65b83aef9 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -413,6 +413,7 @@ def test_nondeterministic_udf2(self): pydoc.render_doc(udf(lambda: random.randint(6, 6), IntegerType())) pydoc.render_doc(random_udf) pydoc.render_doc(random_udf1) + pydoc.render_doc(udf(lambda x: x).asNondeterministic) def test_nondeterministic_udf_in_aggregate(self): from pyspark.sql.functions import udf, sum diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 5e75eb6545333..5e80ab9165867 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -169,8 +169,8 @@ def wrapper(*args): wrapper.returnType = self.returnType wrapper.evalType = self.evalType wrapper.deterministic = self.deterministic - wrapper.asNondeterministic = lambda: self.asNondeterministic()._wrapped() - + wrapper.asNondeterministic = functools.wraps( + self.asNondeterministic)(lambda: self.asNondeterministic()._wrapped()) return wrapper def asNondeterministic(self): From 9a7048b2889bd0fd66e68a0ce3e07e466315a051 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sun, 7 Jan 2018 00:19:21 +0800 Subject: [PATCH 279/356] [HOTFIX] Fix style checking failure ## What changes were proposed in this pull request? This PR is to fix the style checking failure. ## How was this patch tested? N/A Author: gatorsmile Closes #20175 from gatorsmile/stylefix. --- .../org/apache/spark/sql/internal/SQLConf.scala | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 7d1217de254a2..5c61f10bb71ad 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -325,10 +325,11 @@ object SQLConf { .createWithDefault(false) val PARQUET_COMPRESSION = buildConf("spark.sql.parquet.compression.codec") - .doc("Sets the compression codec used when writing Parquet files. If either `compression` or" + - "`parquet.compression` is specified in the table-specific options/properties, the precedence" + - "would be `compression`, `parquet.compression`, `spark.sql.parquet.compression.codec`." + - "Acceptable values include: none, uncompressed, snappy, gzip, lzo.") + .doc("Sets the compression codec used when writing Parquet files. If either `compression` or " + + "`parquet.compression` is specified in the table-specific options/properties, the " + + "precedence would be `compression`, `parquet.compression`, " + + "`spark.sql.parquet.compression.codec`. Acceptable values include: none, uncompressed, " + + "snappy, gzip, lzo.") .stringConf .transform(_.toLowerCase(Locale.ROOT)) .checkValues(Set("none", "uncompressed", "snappy", "gzip", "lzo")) @@ -368,8 +369,8 @@ object SQLConf { .createWithDefault(true) val ORC_COMPRESSION = buildConf("spark.sql.orc.compression.codec") - .doc("Sets the compression codec used when writing ORC files. If either `compression` or" + - "`orc.compress` is specified in the table-specific options/properties, the precedence" + + .doc("Sets the compression codec used when writing ORC files. 
If either `compression` or " + + "`orc.compress` is specified in the table-specific options/properties, the precedence " + "would be `compression`, `orc.compress`, `spark.sql.orc.compression.codec`." + "Acceptable values include: none, uncompressed, snappy, zlib, lzo.") .stringConf From 18e94149992618a2b4e6f0fd3b3f4594e1745224 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Sun, 7 Jan 2018 13:42:01 +0800 Subject: [PATCH 280/356] [SPARK-22973][SQL] Fix incorrect results of Casting Map to String ## What changes were proposed in this pull request? This pr fixed the issue when casting maps into strings; ``` scala> Seq(Map(1 -> "a", 2 -> "b")).toDF("a").write.saveAsTable("t") scala> sql("SELECT cast(a as String) FROM t").show(false) +----------------------------------------------------------------+ |a | +----------------------------------------------------------------+ |org.apache.spark.sql.catalyst.expressions.UnsafeMapData38bdd75d| +----------------------------------------------------------------+ ``` This pr modified the result into; ``` +----------------+ |a | +----------------+ |[1 -> a, 2 -> b]| +----------------+ ``` ## How was this patch tested? Added tests in `CastSuite`. Author: Takeshi Yamamuro Closes #20166 from maropu/SPARK-22973. --- .../spark/sql/catalyst/expressions/Cast.scala | 89 +++++++++++++++++++ .../sql/catalyst/expressions/CastSuite.scala | 28 ++++++ 2 files changed, 117 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index d4fc5e0f168a7..f2de4c8e30bec 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -228,6 +228,37 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String builder.append("]") builder.build() }) + case MapType(kt, vt, _) => + buildCast[MapData](_, map => { + val builder = new UTF8StringBuilder + builder.append("[") + if (map.numElements > 0) { + val keyArray = map.keyArray() + val valueArray = map.valueArray() + val keyToUTF8String = castToString(kt) + val valueToUTF8String = castToString(vt) + builder.append(keyToUTF8String(keyArray.get(0, kt)).asInstanceOf[UTF8String]) + builder.append(" ->") + if (!valueArray.isNullAt(0)) { + builder.append(" ") + builder.append(valueToUTF8String(valueArray.get(0, vt)).asInstanceOf[UTF8String]) + } + var i = 1 + while (i < map.numElements) { + builder.append(", ") + builder.append(keyToUTF8String(keyArray.get(i, kt)).asInstanceOf[UTF8String]) + builder.append(" ->") + if (!valueArray.isNullAt(i)) { + builder.append(" ") + builder.append(valueToUTF8String(valueArray.get(i, vt)) + .asInstanceOf[UTF8String]) + } + i += 1 + } + } + builder.append("]") + builder.build() + }) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } @@ -654,6 +685,53 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String """.stripMargin } + private def writeMapToStringBuilder( + kt: DataType, + vt: DataType, + map: String, + buffer: String, + ctx: CodegenContext): String = { + + def dataToStringFunc(func: String, dataType: DataType) = { + val funcName = ctx.freshName(func) + val dataToStringCode = castToStringCode(dataType, ctx) + ctx.addNewFunction(funcName, + s""" + |private UTF8String $funcName(${ctx.javaType(dataType)} data) { + | UTF8String dataStr = null; + | ${dataToStringCode("data", 
"dataStr", null /* resultIsNull won't be used */)} + | return dataStr; + |} + """.stripMargin) + } + + val keyToStringFunc = dataToStringFunc("keyToString", kt) + val valueToStringFunc = dataToStringFunc("valueToString", vt) + val loopIndex = ctx.freshName("loopIndex") + s""" + |$buffer.append("["); + |if ($map.numElements() > 0) { + | $buffer.append($keyToStringFunc(${ctx.getValue(s"$map.keyArray()", kt, "0")})); + | $buffer.append(" ->"); + | if (!$map.valueArray().isNullAt(0)) { + | $buffer.append(" "); + | $buffer.append($valueToStringFunc(${ctx.getValue(s"$map.valueArray()", vt, "0")})); + | } + | for (int $loopIndex = 1; $loopIndex < $map.numElements(); $loopIndex++) { + | $buffer.append(", "); + | $buffer.append($keyToStringFunc(${ctx.getValue(s"$map.keyArray()", kt, loopIndex)})); + | $buffer.append(" ->"); + | if (!$map.valueArray().isNullAt($loopIndex)) { + | $buffer.append(" "); + | $buffer.append($valueToStringFunc( + | ${ctx.getValue(s"$map.valueArray()", vt, loopIndex)})); + | } + | } + |} + |$buffer.append("]"); + """.stripMargin + } + private[this] def castToStringCode(from: DataType, ctx: CodegenContext): CastFunction = { from match { case BinaryType => @@ -676,6 +754,17 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String |$evPrim = $buffer.build(); """.stripMargin } + case MapType(kt, vt, _) => + (c, evPrim, evNull) => { + val buffer = ctx.freshName("buffer") + val bufferClass = classOf[UTF8StringBuilder].getName + val writeMapElemCode = writeMapToStringBuilder(kt, vt, c, buffer, ctx) + s""" + |$bufferClass $buffer = new $bufferClass(); + |$writeMapElemCode; + |$evPrim = $buffer.build(); + """.stripMargin + } case _ => (c, evPrim, evNull) => s"$evPrim = UTF8String.fromString(String.valueOf($c));" } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index e3ed7171defd8..1445bb8a97d40 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -878,4 +878,32 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { StringType) checkEvaluation(ret8, "[[[a], [b, c]], [[d]]]") } + + test("SPARK-22973 Cast map to string") { + val ret1 = cast(Literal.create(Map(1 -> "a", 2 -> "b", 3 -> "c")), StringType) + checkEvaluation(ret1, "[1 -> a, 2 -> b, 3 -> c]") + val ret2 = cast( + Literal.create(Map("1" -> "a".getBytes, "2" -> null, "3" -> "c".getBytes)), + StringType) + checkEvaluation(ret2, "[1 -> a, 2 ->, 3 -> c]") + val ret3 = cast( + Literal.create(Map( + 1 -> Date.valueOf("2014-12-03"), + 2 -> Date.valueOf("2014-12-04"), + 3 -> Date.valueOf("2014-12-05"))), + StringType) + checkEvaluation(ret3, "[1 -> 2014-12-03, 2 -> 2014-12-04, 3 -> 2014-12-05]") + val ret4 = cast( + Literal.create(Map( + 1 -> Timestamp.valueOf("2014-12-03 13:01:00"), + 2 -> Timestamp.valueOf("2014-12-04 15:05:00"))), + StringType) + checkEvaluation(ret4, "[1 -> 2014-12-03 13:01:00, 2 -> 2014-12-04 15:05:00]") + val ret5 = cast( + Literal.create(Map( + 1 -> Array(1, 2, 3), + 2 -> Array(4, 5, 6))), + StringType) + checkEvaluation(ret5, "[1 -> [1, 2, 3], 2 -> [4, 5, 6]]") + } } From 71d65a32158a55285be197bec4e41fedc9225b94 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 8 Jan 2018 11:39:45 +0800 Subject: [PATCH 281/356] [SPARK-22985] Fix argument escaping bug in from_utc_timestamp / to_utc_timestamp 
codegen ## What changes were proposed in this pull request? This patch adds additional escaping in `from_utc_timestamp` / `to_utc_timestamp` expression codegen in order to fix a bug where invalid timezones which contain special characters could cause generated code to fail to compile. ## How was this patch tested? New regression tests in `DateExpressionsSuite`. Author: Josh Rosen Closes #20182 from JoshRosen/SPARK-22985-fix-utc-timezone-function-escaping-bugs. --- .../catalyst/expressions/datetimeExpressions.scala | 12 ++++++++---- .../catalyst/expressions/DateExpressionsSuite.scala | 6 ++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 7a674ea7f4d76..424871f2047e9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -23,6 +23,8 @@ import java.util.{Calendar, TimeZone} import scala.util.control.NonFatal +import org.apache.commons.lang3.StringEscapeUtils + import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodegenFallback, ExprCode} import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -1008,7 +1010,7 @@ case class FromUTCTimestamp(left: Expression, right: Expression) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") if (right.foldable) { - val tz = right.eval() + val tz = right.eval().asInstanceOf[UTF8String] if (tz == null) { ev.copy(code = s""" |boolean ${ev.isNull} = true; @@ -1017,8 +1019,9 @@ case class FromUTCTimestamp(left: Expression, right: Expression) } else { val tzClass = classOf[TimeZone].getName val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") + val escapedTz = StringEscapeUtils.escapeJava(tz.toString) val tzTerm = ctx.addMutableState(tzClass, "tz", - v => s"""$v = $dtu.getTimeZone("$tz");""") + v => s"""$v = $dtu.getTimeZone("$escapedTz");""") val utcTerm = "tzUTC" ctx.addImmutableStateIfNotExists(tzClass, utcTerm, v => s"""$v = $dtu.getTimeZone("UTC");""") @@ -1185,7 +1188,7 @@ case class ToUTCTimestamp(left: Expression, right: Expression) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") if (right.foldable) { - val tz = right.eval() + val tz = right.eval().asInstanceOf[UTF8String] if (tz == null) { ev.copy(code = s""" |boolean ${ev.isNull} = true; @@ -1194,8 +1197,9 @@ case class ToUTCTimestamp(left: Expression, right: Expression) } else { val tzClass = classOf[TimeZone].getName val dtu = DateTimeUtils.getClass.getName.stripSuffix("$") + val escapedTz = StringEscapeUtils.escapeJava(tz.toString) val tzTerm = ctx.addMutableState(tzClass, "tz", - v => s"""$v = $dtu.getTimeZone("$tz");""") + v => s"""$v = $dtu.getTimeZone("$escapedTz");""") val utcTerm = "tzUTC" ctx.addImmutableStateIfNotExists(tzClass, utcTerm, v => s"""$v = $dtu.getTimeZone("UTC");""") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala index 63f6ceeb21b96..786266a2c13c0 100644 ---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala @@ -22,6 +22,7 @@ import java.text.SimpleDateFormat import java.util.{Calendar, Locale, TimeZone} import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.catalyst.util.DateTimeUtils.TimeZoneGMT @@ -791,6 +792,9 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test(null, "UTC", null) test("2015-07-24 00:00:00", null, null) test(null, null, null) + // Test escaping of timezone + GenerateUnsafeProjection.generate( + ToUTCTimestamp(Literal(Timestamp.valueOf("2015-07-24 00:00:00")), Literal("\"quote")) :: Nil) } test("from_utc_timestamp") { @@ -811,5 +815,7 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test(null, "UTC", null) test("2015-07-24 00:00:00", null, null) test(null, null, null) + // Test escaping of timezone + GenerateUnsafeProjection.generate(FromUTCTimestamp(Literal(0), Literal("\"quote")) :: Nil) } } From 3e40eb3f1ffac3d2f49459a801e3ce171ed34091 Mon Sep 17 00:00:00 2001 From: Guilherme Berger Date: Mon, 8 Jan 2018 14:32:05 +0900 Subject: [PATCH 282/356] [SPARK-22566][PYTHON] Better error message for `_merge_type` in Pandas to Spark DF conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? It provides a better error message when doing `spark_session.createDataFrame(pandas_df)` with no schema and an error occurs in the schema inference due to incompatible types. The Pandas column names are propagated down and the error message mentions which column had the merging error. https://issues.apache.org/jira/browse/SPARK-22566 ## How was this patch tested? Manually in the `./bin/pyspark` console, and with new tests: `./python/run-tests` screen shot 2017-11-21 at 13 29 49 I state that the contribution is my original work and that I license the work to the Apache Spark project under the project’s open source license. Author: Guilherme Berger Closes #19792 from gberger/master. --- python/pyspark/sql/session.py | 17 +++--- python/pyspark/sql/tests.py | 100 ++++++++++++++++++++++++++++++++++ python/pyspark/sql/types.py | 28 +++++++--- 3 files changed, 129 insertions(+), 16 deletions(-) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 6e5eec48e8aca..6052fa9e84096 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -325,11 +325,12 @@ def range(self, start, end=None, step=1, numPartitions=None): return DataFrame(jdf, self._wrapped) - def _inferSchemaFromList(self, data): + def _inferSchemaFromList(self, data, names=None): """ Infer schema from list of Row or tuple. 
:param data: list of Row or tuple + :param names: list of column names :return: :class:`pyspark.sql.types.StructType` """ if not data: @@ -338,12 +339,12 @@ def _inferSchemaFromList(self, data): if type(first) is dict: warnings.warn("inferring schema from dict is deprecated," "please use pyspark.sql.Row instead") - schema = reduce(_merge_type, map(_infer_schema, data)) + schema = reduce(_merge_type, (_infer_schema(row, names) for row in data)) if _has_nulltype(schema): raise ValueError("Some of types cannot be determined after inferring") return schema - def _inferSchema(self, rdd, samplingRatio=None): + def _inferSchema(self, rdd, samplingRatio=None, names=None): """ Infer schema from an RDD of Row or tuple. @@ -360,10 +361,10 @@ def _inferSchema(self, rdd, samplingRatio=None): "Use pyspark.sql.Row instead") if samplingRatio is None: - schema = _infer_schema(first) + schema = _infer_schema(first, names=names) if _has_nulltype(schema): for row in rdd.take(100)[1:]: - schema = _merge_type(schema, _infer_schema(row)) + schema = _merge_type(schema, _infer_schema(row, names=names)) if not _has_nulltype(schema): break else: @@ -372,7 +373,7 @@ def _inferSchema(self, rdd, samplingRatio=None): else: if samplingRatio < 0.99: rdd = rdd.sample(False, float(samplingRatio)) - schema = rdd.map(_infer_schema).reduce(_merge_type) + schema = rdd.map(lambda row: _infer_schema(row, names)).reduce(_merge_type) return schema def _createFromRDD(self, rdd, schema, samplingRatio): @@ -380,7 +381,7 @@ def _createFromRDD(self, rdd, schema, samplingRatio): Create an RDD for DataFrame from an existing RDD, returns the RDD and schema. """ if schema is None or isinstance(schema, (list, tuple)): - struct = self._inferSchema(rdd, samplingRatio) + struct = self._inferSchema(rdd, samplingRatio, names=schema) converter = _create_converter(struct) rdd = rdd.map(converter) if isinstance(schema, (list, tuple)): @@ -406,7 +407,7 @@ def _createFromLocal(self, data, schema): data = list(data) if schema is None or isinstance(schema, (list, tuple)): - struct = self._inferSchemaFromList(data) + struct = self._inferSchemaFromList(data, names=schema) converter = _create_converter(struct) data = map(converter, data) if isinstance(schema, (list, tuple)): diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 122a65b83aef9..13576ff57001b 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -68,6 +68,7 @@ from pyspark.sql.types import UserDefinedType, _infer_type, _make_type_verifier from pyspark.sql.types import _array_signed_int_typecode_ctype_mappings, _array_type_mappings from pyspark.sql.types import _array_unsigned_int_typecode_ctype_mappings +from pyspark.sql.types import _merge_type from pyspark.tests import QuietTest, ReusedPySparkTestCase, SparkSubmitTests from pyspark.sql.functions import UserDefinedFunction, sha2, lit from pyspark.sql.window import Window @@ -898,6 +899,15 @@ def test_infer_schema(self): result = self.spark.sql("SELECT l[0].a from test2 where d['key'].d = '2'") self.assertEqual(1, result.head()[0]) + def test_infer_schema_not_enough_names(self): + df = self.spark.createDataFrame([["a", "b"]], ["col1"]) + self.assertEqual(df.columns, ['col1', '_2']) + + def test_infer_schema_fails(self): + with self.assertRaisesRegexp(TypeError, 'field a'): + self.spark.createDataFrame(self.spark.sparkContext.parallelize([[1, 1], ["x", 1]]), + schema=["a", "b"], samplingRatio=0.99) + def test_infer_nested_schema(self): NestedRow = Row("f1", "f2") nestedRdd1 = 
self.sc.parallelize([NestedRow([1, 2], {"row1": 1.0}), @@ -918,6 +928,10 @@ def test_infer_nested_schema(self): df = self.spark.createDataFrame(rdd) self.assertEqual(Row(field1=1, field2=u'row1'), df.first()) + def test_create_dataframe_from_dict_respects_schema(self): + df = self.spark.createDataFrame([{'a': 1}], ["b"]) + self.assertEqual(df.columns, ['b']) + def test_create_dataframe_from_objects(self): data = [MyObject(1, "1"), MyObject(2, "2")] df = self.spark.createDataFrame(data) @@ -1772,6 +1786,92 @@ def test_infer_long_type(self): self.assertEqual(_infer_type(2**61), LongType()) self.assertEqual(_infer_type(2**71), LongType()) + def test_merge_type(self): + self.assertEqual(_merge_type(LongType(), NullType()), LongType()) + self.assertEqual(_merge_type(NullType(), LongType()), LongType()) + + self.assertEqual(_merge_type(LongType(), LongType()), LongType()) + + self.assertEqual(_merge_type( + ArrayType(LongType()), + ArrayType(LongType()) + ), ArrayType(LongType())) + with self.assertRaisesRegexp(TypeError, 'element in array'): + _merge_type(ArrayType(LongType()), ArrayType(DoubleType())) + + self.assertEqual(_merge_type( + MapType(StringType(), LongType()), + MapType(StringType(), LongType()) + ), MapType(StringType(), LongType())) + with self.assertRaisesRegexp(TypeError, 'key of map'): + _merge_type( + MapType(StringType(), LongType()), + MapType(DoubleType(), LongType())) + with self.assertRaisesRegexp(TypeError, 'value of map'): + _merge_type( + MapType(StringType(), LongType()), + MapType(StringType(), DoubleType())) + + self.assertEqual(_merge_type( + StructType([StructField("f1", LongType()), StructField("f2", StringType())]), + StructType([StructField("f1", LongType()), StructField("f2", StringType())]) + ), StructType([StructField("f1", LongType()), StructField("f2", StringType())])) + with self.assertRaisesRegexp(TypeError, 'field f1'): + _merge_type( + StructType([StructField("f1", LongType()), StructField("f2", StringType())]), + StructType([StructField("f1", DoubleType()), StructField("f2", StringType())])) + + self.assertEqual(_merge_type( + StructType([StructField("f1", StructType([StructField("f2", LongType())]))]), + StructType([StructField("f1", StructType([StructField("f2", LongType())]))]) + ), StructType([StructField("f1", StructType([StructField("f2", LongType())]))])) + with self.assertRaisesRegexp(TypeError, 'field f2 in field f1'): + _merge_type( + StructType([StructField("f1", StructType([StructField("f2", LongType())]))]), + StructType([StructField("f1", StructType([StructField("f2", StringType())]))])) + + self.assertEqual(_merge_type( + StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]), + StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())]) + ), StructType([StructField("f1", ArrayType(LongType())), StructField("f2", StringType())])) + with self.assertRaisesRegexp(TypeError, 'element in array field f1'): + _merge_type( + StructType([ + StructField("f1", ArrayType(LongType())), + StructField("f2", StringType())]), + StructType([ + StructField("f1", ArrayType(DoubleType())), + StructField("f2", StringType())])) + + self.assertEqual(_merge_type( + StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())]), + StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())]) + ), StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())])) + with 
self.assertRaisesRegexp(TypeError, 'value of map field f1'): + _merge_type( + StructType([ + StructField("f1", MapType(StringType(), LongType())), + StructField("f2", StringType())]), + StructType([ + StructField("f1", MapType(StringType(), DoubleType())), + StructField("f2", StringType())])) + + self.assertEqual(_merge_type( + StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]), + StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]) + ), StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))])) + with self.assertRaisesRegexp(TypeError, 'key of map element in array field f1'): + _merge_type( + StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]), + StructType([StructField("f1", ArrayType(MapType(DoubleType(), LongType())))]) + ) + def test_filter_with_datetime(self): time = datetime.datetime(2015, 4, 17, 23, 1, 2, 3000) date = time.date() diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 146e673ae9756..0dc5823f72a3c 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1073,7 +1073,7 @@ def _infer_type(obj): raise TypeError("not supported type: %s" % type(obj)) -def _infer_schema(row): +def _infer_schema(row, names=None): """Infer the schema from dict/namedtuple/object""" if isinstance(row, dict): items = sorted(row.items()) @@ -1084,7 +1084,10 @@ def _infer_schema(row): elif hasattr(row, "_fields"): # namedtuple items = zip(row._fields, tuple(row)) else: - names = ['_%d' % i for i in range(1, len(row) + 1)] + if names is None: + names = ['_%d' % i for i in range(1, len(row) + 1)] + elif len(names) < len(row): + names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1)) items = zip(names, row) elif hasattr(row, "__dict__"): # object @@ -1109,19 +1112,27 @@ def _has_nulltype(dt): return isinstance(dt, NullType) -def _merge_type(a, b): +def _merge_type(a, b, name=None): + if name is None: + new_msg = lambda msg: msg + new_name = lambda n: "field %s" % n + else: + new_msg = lambda msg: "%s: %s" % (name, msg) + new_name = lambda n: "field %s in %s" % (n, name) + if isinstance(a, NullType): return b elif isinstance(b, NullType): return a elif type(a) is not type(b): # TODO: type cast (such as int -> long) - raise TypeError("Can not merge type %s and %s" % (type(a), type(b))) + raise TypeError(new_msg("Can not merge type %s and %s" % (type(a), type(b)))) # same type if isinstance(a, StructType): nfs = dict((f.name, f.dataType) for f in b.fields) - fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()))) + fields = [StructField(f.name, _merge_type(f.dataType, nfs.get(f.name, NullType()), + name=new_name(f.name))) for f in a.fields] names = set([f.name for f in fields]) for n in nfs: @@ -1130,11 +1141,12 @@ def _merge_type(a, b): return StructType(fields) elif isinstance(a, ArrayType): - return ArrayType(_merge_type(a.elementType, b.elementType), True) + return ArrayType(_merge_type(a.elementType, b.elementType, + name='element in array %s' % name), True) elif isinstance(a, MapType): - return MapType(_merge_type(a.keyType, b.keyType), - _merge_type(a.valueType, b.valueType), + return MapType(_merge_type(a.keyType, b.keyType, name='key of map %s' % name), + _merge_type(a.valueType, b.valueType, name='value of map %s' % name), True) else: return a From 8fdeb4b9946bd9be045abb919da2e531708b3bd4 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Mon, 8 Jan 2018 13:59:08 +0800 Subject: [PATCH 283/356] 
[SPARK-22979][PYTHON][SQL] Avoid per-record type dispatch in Python data conversion (EvaluatePython.fromJava) ## What changes were proposed in this pull request? Seems we can avoid type dispatch for each value when Java objection (from Pyrolite) -> Spark's internal data format because we know the schema ahead. I manually performed the benchmark as below: ```scala test("EvaluatePython.fromJava / EvaluatePython.makeFromJava") { val numRows = 1000 * 1000 val numFields = 30 val random = new Random(System.nanoTime()) val types = Array( BooleanType, ByteType, FloatType, DoubleType, IntegerType, LongType, ShortType, DecimalType.ShortDecimal, DecimalType.IntDecimal, DecimalType.ByteDecimal, DecimalType.FloatDecimal, DecimalType.LongDecimal, new DecimalType(5, 2), new DecimalType(12, 2), new DecimalType(30, 10), CalendarIntervalType) val schema = RandomDataGenerator.randomSchema(random, numFields, types) val rows = mutable.ArrayBuffer.empty[Array[Any]] var i = 0 while (i < numRows) { val row = RandomDataGenerator.randomRow(random, schema) rows += row.toSeq.toArray i += 1 } val benchmark = new Benchmark("EvaluatePython.fromJava / EvaluatePython.makeFromJava", numRows) benchmark.addCase("Before - EvaluatePython.fromJava", 3) { _ => var i = 0 while (i < numRows) { EvaluatePython.fromJava(rows(i), schema) i += 1 } } benchmark.addCase("After - EvaluatePython.makeFromJava", 3) { _ => val fromJava = EvaluatePython.makeFromJava(schema) var i = 0 while (i < numRows) { fromJava(rows(i)) i += 1 } } benchmark.run() } ``` ``` EvaluatePython.fromJava / EvaluatePython.makeFromJava: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ Before - EvaluatePython.fromJava 1265 / 1346 0.8 1264.8 1.0X After - EvaluatePython.makeFromJava 571 / 649 1.8 570.8 2.2X ``` If the structure is nested, I think the advantage should be larger than this. ## How was this patch tested? Existing tests should cover this. Also, I manually checked if the values from before / after are actually same via `assert` when performing the benchmarks. Author: hyukjinkwon Closes #20172 from HyukjinKwon/type-dispatch-python-eval. 
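To make the idea concrete: instead of re-matching on `(value, dataType)` for every single value, a converter closure is resolved from the schema once and then applied to each row. The following is a small self-contained Scala sketch of that converter-factory pattern; all names here (`SimpleType`, `IntType`, `StrType`, `StructT`, `makeConverter`) are illustrative stand-ins, not Spark's actual `DataType` hierarchy or `EvaluatePython` API:

```scala
// Minimal sketch of "build the converter once per schema, apply it per row".
// The type dispatch happens once when the converter is constructed, not per value.
sealed trait SimpleType
case object IntType extends SimpleType
case object StrType extends SimpleType
final case class StructT(fields: Seq[SimpleType]) extends SimpleType

object ConverterFactory {
  // Resolve the schema to a converter closure ahead of time.
  def makeConverter(dt: SimpleType): Any => Any = dt match {
    case IntType => {
      case null => null
      case i: Int => i
      case l: Long => l.toInt
      case _ => null // unexpected input becomes null instead of failing
    }
    case StrType => {
      case null => null
      case other => other.toString
    }
    case StructT(fields) =>
      val fieldConverters = fields.map(makeConverter) // recursive, still one-time work
      (value: Any) => value match {
        case arr: Array[_] if arr.length == fields.length =>
          arr.toSeq.zip(fieldConverters).map { case (v, c) => c(v) }
        case _ => null
      }
  }

  def main(args: Array[String]): Unit = {
    val fromJava = makeConverter(StructT(Seq(IntType, StrType))) // built once
    val rows = Seq(Array[Any](1L, 42), Array[Any](7, "x"), Array[Any](null, null))
    rows.foreach(row => println(fromJava(row)))                  // applied per row
  }
}
```

The per-schema cost (the pattern match on the type) is paid once, while the per-row cost is reduced to a closure call, which is where the speedup in the benchmark above comes from.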
--- .../org/apache/spark/sql/SparkSession.scala | 5 +- .../python/BatchEvalPythonExec.scala | 7 +- .../sql/execution/python/EvaluatePython.scala | 166 ++++++++++++------ 3 files changed, 118 insertions(+), 60 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala index 272eb844226d4..734573ba31f71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -742,7 +742,10 @@ class SparkSession private( private[sql] def applySchemaToPythonRDD( rdd: RDD[Array[Any]], schema: StructType): DataFrame = { - val rowRdd = rdd.map(r => python.EvaluatePython.fromJava(r, schema).asInstanceOf[InternalRow]) + val rowRdd = rdd.mapPartitions { iter => + val fromJava = python.EvaluatePython.makeFromJava(schema) + iter.map(r => fromJava(r).asInstanceOf[InternalRow]) + } internalCreateDataFrame(rowRdd, schema) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala index 26ee25f633ea4..f4d83e8dc7c2b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/BatchEvalPythonExec.scala @@ -79,16 +79,19 @@ case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], chi } else { StructType(udfs.map(u => StructField("", u.dataType, u.nullable))) } + + val fromJava = EvaluatePython.makeFromJava(resultType) + outputIterator.flatMap { pickedResult => val unpickledBatch = unpickle.loads(pickedResult) unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala }.map { result => if (udfs.length == 1) { // fast path for single UDF - mutableRow(0) = EvaluatePython.fromJava(result, resultType) + mutableRow(0) = fromJava(result) mutableRow } else { - EvaluatePython.fromJava(result, resultType).asInstanceOf[InternalRow] + fromJava(result).asInstanceOf[InternalRow] } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala index 9bbfa6018ba77..520afad287648 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala @@ -83,82 +83,134 @@ object EvaluatePython { } /** - * Converts `obj` to the type specified by the data type, or returns null if the type of obj is - * unexpected. Because Python doesn't enforce the type. + * Make a converter that converts `obj` to the type specified by the data type, or returns + * null if the type of obj is unexpected. Because Python doesn't enforce the type. 
*/ - def fromJava(obj: Any, dataType: DataType): Any = (obj, dataType) match { - case (null, _) => null - - case (c: Boolean, BooleanType) => c + def makeFromJava(dataType: DataType): Any => Any = dataType match { + case BooleanType => (obj: Any) => nullSafeConvert(obj) { + case b: Boolean => b + } - case (c: Byte, ByteType) => c - case (c: Short, ByteType) => c.toByte - case (c: Int, ByteType) => c.toByte - case (c: Long, ByteType) => c.toByte + case ByteType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c + case c: Short => c.toByte + case c: Int => c.toByte + case c: Long => c.toByte + } - case (c: Byte, ShortType) => c.toShort - case (c: Short, ShortType) => c - case (c: Int, ShortType) => c.toShort - case (c: Long, ShortType) => c.toShort + case ShortType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c.toShort + case c: Short => c + case c: Int => c.toShort + case c: Long => c.toShort + } - case (c: Byte, IntegerType) => c.toInt - case (c: Short, IntegerType) => c.toInt - case (c: Int, IntegerType) => c - case (c: Long, IntegerType) => c.toInt + case IntegerType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c.toInt + case c: Short => c.toInt + case c: Int => c + case c: Long => c.toInt + } - case (c: Byte, LongType) => c.toLong - case (c: Short, LongType) => c.toLong - case (c: Int, LongType) => c.toLong - case (c: Long, LongType) => c + case LongType => (obj: Any) => nullSafeConvert(obj) { + case c: Byte => c.toLong + case c: Short => c.toLong + case c: Int => c.toLong + case c: Long => c + } - case (c: Float, FloatType) => c - case (c: Double, FloatType) => c.toFloat + case FloatType => (obj: Any) => nullSafeConvert(obj) { + case c: Float => c + case c: Double => c.toFloat + } - case (c: Float, DoubleType) => c.toDouble - case (c: Double, DoubleType) => c + case DoubleType => (obj: Any) => nullSafeConvert(obj) { + case c: Float => c.toDouble + case c: Double => c + } - case (c: java.math.BigDecimal, dt: DecimalType) => Decimal(c, dt.precision, dt.scale) + case dt: DecimalType => (obj: Any) => nullSafeConvert(obj) { + case c: java.math.BigDecimal => Decimal(c, dt.precision, dt.scale) + } - case (c: Int, DateType) => c + case DateType => (obj: Any) => nullSafeConvert(obj) { + case c: Int => c + } - case (c: Long, TimestampType) => c - // Py4J serializes values between MIN_INT and MAX_INT as Ints, not Longs - case (c: Int, TimestampType) => c.toLong + case TimestampType => (obj: Any) => nullSafeConvert(obj) { + case c: Long => c + // Py4J serializes values between MIN_INT and MAX_INT as Ints, not Longs + case c: Int => c.toLong + } - case (c, StringType) => UTF8String.fromString(c.toString) + case StringType => (obj: Any) => nullSafeConvert(obj) { + case _ => UTF8String.fromString(obj.toString) + } - case (c: String, BinaryType) => c.getBytes(StandardCharsets.UTF_8) - case (c, BinaryType) if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c + case BinaryType => (obj: Any) => nullSafeConvert(obj) { + case c: String => c.getBytes(StandardCharsets.UTF_8) + case c if c.getClass.isArray && c.getClass.getComponentType.getName == "byte" => c + } - case (c: java.util.List[_], ArrayType(elementType, _)) => - new GenericArrayData(c.asScala.map { e => fromJava(e, elementType)}.toArray) + case ArrayType(elementType, _) => + val elementFromJava = makeFromJava(elementType) - case (c, ArrayType(elementType, _)) if c.getClass.isArray => - new GenericArrayData(c.asInstanceOf[Array[_]].map(e => fromJava(e, elementType))) + (obj: Any) => 
nullSafeConvert(obj) { + case c: java.util.List[_] => + new GenericArrayData(c.asScala.map { e => elementFromJava(e) }.toArray) + case c if c.getClass.isArray => + new GenericArrayData(c.asInstanceOf[Array[_]].map(e => elementFromJava(e))) + } - case (javaMap: java.util.Map[_, _], MapType(keyType, valueType, _)) => - ArrayBasedMapData( - javaMap, - (key: Any) => fromJava(key, keyType), - (value: Any) => fromJava(value, valueType)) + case MapType(keyType, valueType, _) => + val keyFromJava = makeFromJava(keyType) + val valueFromJava = makeFromJava(valueType) + + (obj: Any) => nullSafeConvert(obj) { + case javaMap: java.util.Map[_, _] => + ArrayBasedMapData( + javaMap, + (key: Any) => keyFromJava(key), + (value: Any) => valueFromJava(value)) + } - case (c, StructType(fields)) if c.getClass.isArray => - val array = c.asInstanceOf[Array[_]] - if (array.length != fields.length) { - throw new IllegalStateException( - s"Input row doesn't have expected number of values required by the schema. " + - s"${fields.length} fields are required while ${array.length} values are provided." - ) + case StructType(fields) => + val fieldsFromJava = fields.map(f => makeFromJava(f.dataType)).toArray + + (obj: Any) => nullSafeConvert(obj) { + case c if c.getClass.isArray => + val array = c.asInstanceOf[Array[_]] + if (array.length != fields.length) { + throw new IllegalStateException( + s"Input row doesn't have expected number of values required by the schema. " + + s"${fields.length} fields are required while ${array.length} values are provided." + ) + } + + val row = new GenericInternalRow(fields.length) + var i = 0 + while (i < fields.length) { + row(i) = fieldsFromJava(i)(array(i)) + i += 1 + } + row } - new GenericInternalRow(array.zip(fields).map { - case (e, f) => fromJava(e, f.dataType) - }) - case (_, udt: UserDefinedType[_]) => fromJava(obj, udt.sqlType) + case udt: UserDefinedType[_] => makeFromJava(udt.sqlType) + + case other => (obj: Any) => nullSafeConvert(other)(PartialFunction.empty) + } - // all other unexpected type should be null, or we will have runtime exception - // TODO(davies): we could improve this by try to cast the object to expected type - case (c, _) => null + private def nullSafeConvert(input: Any)(f: PartialFunction[Any, Any]): Any = { + if (input == null) { + null + } else { + f.applyOrElse(input, { + // all other unexpected type should be null, or we will have runtime exception + // TODO(davies): we could improve this by try to cast the object to expected type + _: Any => null + }) + } } private val module = "pyspark.sql.types" From 2c73d2a948bdde798aaf0f87c18846281deb05fd Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Mon, 8 Jan 2018 16:04:03 +0800 Subject: [PATCH 284/356] [SPARK-22983] Don't push filters beneath aggregates with empty grouping expressions ## What changes were proposed in this pull request? The following SQL query should return zero rows, but in Spark it actually returns one row: ``` SELECT 1 from ( SELECT 1 AS z, MIN(a.x) FROM (select 1 as x) a WHERE false ) b where b.z != b.z ``` The problem stems from the `PushDownPredicate` rule: when this rule encounters a filter on top of an Aggregate operator, e.g. `Filter(Agg(...))`, it removes the original filter and adds a new filter onto Aggregate's child, e.g. `Agg(Filter(...))`. 
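As a rough intuition for why that rewrite can be unsafe when the aggregate has no grouping expressions, here is a tiny standalone Scala sketch (a toy under assumed names, not Catalyst code; `globalCount` merely stands in for an aggregate with an empty `GROUP BY`):

```scala
// Toy illustration: with no grouping keys the aggregate always emits exactly one
// row, so evaluating a predicate on top of the aggregate is not equivalent to
// pushing the same predicate beneath it.
object GlobalAggPushdownToy {
  // Stand-in for Aggregate(groupingExpressions = Nil, count(1) :: Nil).
  def globalCount(rows: Seq[Int]): Seq[Long] = Seq(rows.size.toLong)

  def main(args: Array[String]): Unit = {
    val rows = Seq.empty[Int]                                // "WHERE false" input
    val filterOnTop  = globalCount(rows).filter(_ => false)  // Filter(Agg(...)) -> Seq()
    val filterPushed = globalCount(rows.filter(_ => false))  // Agg(Filter(...)) -> Seq(0)
    println(s"on top: $filterOnTop, pushed down: $filterPushed")
  }
}
```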
This is sometimes okay, but the case above is a counterexample: because there is no explicit `GROUP BY`, we are implicitly computing a global aggregate over the entire table so the original filter was not acting like a `HAVING` clause filtering the number of groups: if we push this filter then it fails to actually reduce the cardinality of the Aggregate output, leading to the wrong answer. In 2016 I fixed a similar problem involving invalid pushdowns of data-independent filters (filters which reference no columns of the filtered relation). There was additional discussion after my fix was merged which pointed out that my patch was an incomplete fix (see #15289), but it looks I must have either misunderstood the comment or forgot to follow up on the additional points raised there. This patch fixes the problem by choosing to never push down filters in cases where there are no grouping expressions. Since there are no grouping keys, the only columns are aggregate columns and we can't push filters defined over aggregate results, so this change won't cause us to miss out on any legitimate pushdown opportunities. ## How was this patch tested? New regression tests in `SQLQueryTestSuite` and `FilterPushdownSuite`. Author: Josh Rosen Closes #20180 from JoshRosen/SPARK-22983-dont-push-filters-beneath-aggs-with-empty-grouping-expressions. --- .../spark/sql/catalyst/optimizer/Optimizer.scala | 3 ++- .../catalyst/optimizer/FilterPushdownSuite.scala | 13 +++++++++++++ .../test/resources/sql-tests/inputs/group-by.sql | 9 +++++++++ .../resources/sql-tests/results/group-by.sql.out | 16 +++++++++++++++- 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 0d4b02c6e7d8a..df0af8264a329 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -795,7 +795,8 @@ object PushDownPredicate extends Rule[LogicalPlan] with PredicateHelper { project.copy(child = Filter(replaceAlias(condition, aliasMap), grandChild)) case filter @ Filter(condition, aggregate: Aggregate) - if aggregate.aggregateExpressions.forall(_.deterministic) => + if aggregate.aggregateExpressions.forall(_.deterministic) + && aggregate.groupingExpressions.nonEmpty => // Find all the aliased expressions in the aggregate list that don't include any actual // AggregateExpression, and create a map from the alias to the expression val aliasMap = AttributeMap(aggregate.aggregateExpressions.collect { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala index 85a5e979f6021..82a10254d846d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/FilterPushdownSuite.scala @@ -809,6 +809,19 @@ class FilterPushdownSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("aggregate: don't push filters if the aggregate has no grouping expressions") { + val originalQuery = LocalRelation.apply(testRelation.output, Seq.empty) + .select('a, 'b) + .groupBy()(count(1)) + .where(false) + + val optimized = Optimize.execute(originalQuery.analyze) + + val correctAnswer = 
originalQuery.analyze + + comparePlans(optimized, correctAnswer) + } + test("broadcast hint") { val originalQuery = ResolvedHint(testRelation) .where('a === 2L && 'b + Rand(10).as("rnd") === 3) diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql index 1e1384549a410..c5070b734d521 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by.sql @@ -60,3 +60,12 @@ SELECT a, COUNT(1) FROM testData WHERE false GROUP BY a; -- Aggregate with empty input and empty GroupBy expressions. SELECT COUNT(1) FROM testData WHERE false; SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t; + +-- Aggregate with empty GroupBy expressions and filter on top +SELECT 1 from ( + SELECT 1 AS z, + MIN(a.x) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z diff --git a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out index 986bb01c13fe4..c1abc6dff754b 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 25 +-- Number of queries: 26 -- !query 0 @@ -227,3 +227,17 @@ SELECT 1 FROM (SELECT COUNT(1) FROM testData WHERE false) t struct<1:int> -- !query 24 output 1 + + +-- !query 25 +SELECT 1 from ( + SELECT 1 AS z, + MIN(a.x) + FROM (select 1 as x) a + WHERE false +) b +where b.z != b.z +-- !query 25 schema +struct<1:int> +-- !query 25 output + From eb45b52e826ea9cea48629760db35ef87f91fea0 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 8 Jan 2018 19:41:41 +0800 Subject: [PATCH 285/356] [SPARK-21865][SQL] simplify the distribution semantic of Spark SQL ## What changes were proposed in this pull request? **The current shuffle planning logic** 1. Each operator specifies the distribution requirements for its children, via the `Distribution` interface. 2. Each operator specifies its output partitioning, via the `Partitioning` interface. 3. `Partitioning.satisfy` determines whether a `Partitioning` can satisfy a `Distribution`. 4. For each operator, check each child of it, add a shuffle node above the child if the child partitioning can not satisfy the required distribution. 5. For each operator, check if its children's output partitionings are compatible with each other, via the `Partitioning.compatibleWith`. 6. If the check in 5 failed, add a shuffle above each child. 7. try to eliminate the shuffles added in 6, via `Partitioning.guarantees`. This design has a major problem with the definition of "compatible". `Partitioning.compatibleWith` is not well defined, ideally a `Partitioning` can't know if it's compatible with other `Partitioning`, without more information from the operator. For example, `t1 join t2 on t1.a = t2.b`, `HashPartitioning(a, 10)` should be compatible with `HashPartitioning(b, 10)` under this case, but the partitioning itself doesn't know it. As a result, currently `Partitioning.compatibleWith` always return false except for literals, which make it almost useless. This also means, if an operator has distribution requirements for multiple children, Spark always add shuffle nodes to all the children(although some of them can be eliminated). 
However, there is no guarantee that the children's output partitionings are compatible with each other after adding these shuffles, we just assume that the operator will only specify `ClusteredDistribution` for multiple children. I think it's very hard to guarantee children co-partition for all kinds of operators, and we can not even give a clear definition about co-partition between distributions like `ClusteredDistribution(a,b)` and `ClusteredDistribution(c)`. I think we should drop the "compatible" concept in the distribution model, and let the operator achieve the co-partition requirement by special distribution requirements. **Proposed shuffle planning logic after this PR** (The first 4 are same as before) 1. Each operator specifies the distribution requirements for its children, via the `Distribution` interface. 2. Each operator specifies its output partitioning, via the `Partitioning` interface. 3. `Partitioning.satisfy` determines whether a `Partitioning` can satisfy a `Distribution`. 4. For each operator, check each child of it, add a shuffle node above the child if the child partitioning can not satisfy the required distribution. 5. For each operator, check if its children's output partitionings have the same number of partitions. 6. If the check in 5 failed, pick the max number of partitions from children's output partitionings, and add shuffle to child whose number of partitions doesn't equal to the max one. The new distribution model is very simple, we only have one kind of relationship, which is `Partitioning.satisfy`. For multiple children, Spark only guarantees they have the same number of partitions, and it's the operator's responsibility to leverage this guarantee to achieve more complicated requirements. For example, non-broadcast joins can use the newly added `HashPartitionedDistribution` to achieve co-partition. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #19080 from cloud-fan/exchange. --- .../plans/physical/partitioning.scala | 286 +++++++----------- .../sql/catalyst/PartitioningSuite.scala | 55 ---- .../spark/sql/execution/SparkPlan.scala | 16 +- .../exchange/EnsureRequirements.scala | 120 +++----- .../joins/ShuffledHashJoinExec.scala | 2 +- .../execution/joins/SortMergeJoinExec.scala | 2 +- .../apache/spark/sql/execution/objects.scala | 2 +- .../spark/sql/execution/PlannerSuite.scala | 81 ++--- 8 files changed, 194 insertions(+), 370 deletions(-) delete mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala index e57c842ce2a36..0189bd73c56bf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala @@ -30,18 +30,43 @@ import org.apache.spark.sql.types.{DataType, IntegerType} * - Intra-partition ordering of data: In this case the distribution describes guarantees made * about how tuples are distributed within a single partition. */ -sealed trait Distribution +sealed trait Distribution { + /** + * The required number of partitions for this distribution. If it's None, then any number of + * partitions is allowed for this distribution. 
+ */ + def requiredNumPartitions: Option[Int] + + /** + * Creates a default partitioning for this distribution, which can satisfy this distribution while + * matching the given number of partitions. + */ + def createPartitioning(numPartitions: Int): Partitioning +} /** * Represents a distribution where no promises are made about co-location of data. */ -case object UnspecifiedDistribution extends Distribution +case object UnspecifiedDistribution extends Distribution { + override def requiredNumPartitions: Option[Int] = None + + override def createPartitioning(numPartitions: Int): Partitioning = { + throw new IllegalStateException("UnspecifiedDistribution does not have default partitioning.") + } +} /** * Represents a distribution that only has a single partition and all tuples of the dataset * are co-located. */ -case object AllTuples extends Distribution +case object AllTuples extends Distribution { + override def requiredNumPartitions: Option[Int] = Some(1) + + override def createPartitioning(numPartitions: Int): Partitioning = { + assert(numPartitions == 1, "The default partitioning of AllTuples can only have 1 partition.") + SinglePartition + } +} /** * Represents data where tuples that share the same values for the `clustering` @@ -51,12 +76,41 @@ case object AllTuples extends Distribution */ case class ClusteredDistribution( clustering: Seq[Expression], - numPartitions: Option[Int] = None) extends Distribution { + requiredNumPartitions: Option[Int] = None) extends Distribution { require( clustering != Nil, "The clustering expressions of a ClusteredDistribution should not be Nil. " + "An AllTuples should be used to represent a distribution that only has " + "a single partition.") + + override def createPartitioning(numPartitions: Int): Partitioning = { + assert(requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions, + s"This ClusteredDistribution requires ${requiredNumPartitions.get} partitions, but " + + s"the actual number of partitions is $numPartitions.") + HashPartitioning(clustering, numPartitions) + } +} + +/** + * Represents data where tuples have been clustered according to the hash of the given + * `expressions`. The hash function is defined as `HashPartitioning.partitionIdExpression`, so only + * [[HashPartitioning]] can satisfy this distribution. + * + * This is a strictly stronger guarantee than [[ClusteredDistribution]]. Given a tuple and the + * number of partitions, this distribution strictly requires which partition the tuple should be in. + */ +case class HashClusteredDistribution(expressions: Seq[Expression]) extends Distribution { + require( + expressions != Nil, + "The expressions for hash of a HashPartitionedDistribution should not be Nil. " + + "An AllTuples should be used to represent a distribution that only has " + + "a single partition.") + + override def requiredNumPartitions: Option[Int] = None + + override def createPartitioning(numPartitions: Int): Partitioning = { + HashPartitioning(expressions, numPartitions) + } } /** @@ -73,46 +127,31 @@ case class OrderedDistribution(ordering: Seq[SortOrder]) extends Distribution { "An AllTuples should be used to represent a distribution that only has " + "a single partition.") - // TODO: This is not really valid... 
- def clustering: Set[Expression] = ordering.map(_.child).toSet + override def requiredNumPartitions: Option[Int] = None + + override def createPartitioning(numPartitions: Int): Partitioning = { + RangePartitioning(ordering, numPartitions) + } } /** * Represents data where tuples are broadcasted to every node. It is quite common that the * entire set of tuples is transformed into different data structure. */ -case class BroadcastDistribution(mode: BroadcastMode) extends Distribution +case class BroadcastDistribution(mode: BroadcastMode) extends Distribution { + override def requiredNumPartitions: Option[Int] = Some(1) + + override def createPartitioning(numPartitions: Int): Partitioning = { + assert(numPartitions == 1, + "The default partitioning of BroadcastDistribution can only have 1 partition.") + BroadcastPartitioning(mode) + } +} /** - * Describes how an operator's output is split across partitions. The `compatibleWith`, - * `guarantees`, and `satisfies` methods describe relationships between child partitionings, - * target partitionings, and [[Distribution]]s. These relations are described more precisely in - * their individual method docs, but at a high level: - * - * - `satisfies` is a relationship between partitionings and distributions. - * - `compatibleWith` is relationships between an operator's child output partitionings. - * - `guarantees` is a relationship between a child's existing output partitioning and a target - * output partitioning. - * - * Diagrammatically: - * - * +--------------+ - * | Distribution | - * +--------------+ - * ^ - * | - * satisfies - * | - * +--------------+ +--------------+ - * | Child | | Target | - * +----| Partitioning |----guarantees--->| Partitioning | - * | +--------------+ +--------------+ - * | ^ - * | | - * | compatibleWith - * | | - * +------------+ - * + * Describes how an operator's output is split across partitions. It has 2 major properties: + * 1. number of partitions. + * 2. if it can satisfy a given distribution. */ sealed trait Partitioning { /** Returns the number of partitions that the data is split across */ @@ -123,113 +162,35 @@ sealed trait Partitioning { * to satisfy the partitioning scheme mandated by the `required` [[Distribution]], * i.e. the current dataset does not need to be re-partitioned for the `required` * Distribution (it is possible that tuples within a partition need to be reorganized). - */ - def satisfies(required: Distribution): Boolean - - /** - * Returns true iff we can say that the partitioning scheme of this [[Partitioning]] - * guarantees the same partitioning scheme described by `other`. - * - * Compatibility of partitionings is only checked for operators that have multiple children - * and that require a specific child output [[Distribution]], such as joins. - * - * Intuitively, partitionings are compatible if they route the same partitioning key to the same - * partition. For instance, two hash partitionings are only compatible if they produce the same - * number of output partitionings and hash records according to the same hash function and - * same partitioning key schema. - * - * Put another way, two partitionings are compatible with each other if they satisfy all of the - * same distribution guarantees. - */ - def compatibleWith(other: Partitioning): Boolean - - /** - * Returns true iff we can say that the partitioning scheme of this [[Partitioning]] guarantees - * the same partitioning scheme described by `other`. 
If a `A.guarantees(B)`, then repartitioning - * the child's output according to `B` will be unnecessary. `guarantees` is used as a performance - * optimization to allow the exchange planner to avoid redundant repartitionings. By default, - * a partitioning only guarantees partitionings that are equal to itself (i.e. the same number - * of partitions, same strategy (range or hash), etc). - * - * In order to enable more aggressive optimization, this strict equality check can be relaxed. - * For example, say that the planner needs to repartition all of an operator's children so that - * they satisfy the [[AllTuples]] distribution. One way to do this is to repartition all children - * to have the [[SinglePartition]] partitioning. If one of the operator's children already happens - * to be hash-partitioned with a single partition then we do not need to re-shuffle this child; - * this repartitioning can be avoided if a single-partition [[HashPartitioning]] `guarantees` - * [[SinglePartition]]. - * - * The SinglePartition example given above is not particularly interesting; guarantees' real - * value occurs for more advanced partitioning strategies. SPARK-7871 will introduce a notion - * of null-safe partitionings, under which partitionings can specify whether rows whose - * partitioning keys contain null values will be grouped into the same partition or whether they - * will have an unknown / random distribution. If a partitioning does not require nulls to be - * clustered then a partitioning which _does_ cluster nulls will guarantee the null clustered - * partitioning. The converse is not true, however: a partitioning which clusters nulls cannot - * be guaranteed by one which does not cluster them. Thus, in general `guarantees` is not a - * symmetric relation. * - * Another way to think about `guarantees`: if `A.guarantees(B)`, then any partitioning of rows - * produced by `A` could have also been produced by `B`. + * By default a [[Partitioning]] can satisfy [[UnspecifiedDistribution]], and [[AllTuples]] if + * the [[Partitioning]] only have one partition. Implementations can overwrite this method with + * special logic. */ - def guarantees(other: Partitioning): Boolean = this == other -} - -object Partitioning { - def allCompatible(partitionings: Seq[Partitioning]): Boolean = { - // Note: this assumes transitivity - partitionings.sliding(2).map { - case Seq(a) => true - case Seq(a, b) => - if (a.numPartitions != b.numPartitions) { - assert(!a.compatibleWith(b) && !b.compatibleWith(a)) - false - } else { - a.compatibleWith(b) && b.compatibleWith(a) - } - }.forall(_ == true) - } -} - -case class UnknownPartitioning(numPartitions: Int) extends Partitioning { - override def satisfies(required: Distribution): Boolean = required match { + def satisfies(required: Distribution): Boolean = required match { case UnspecifiedDistribution => true + case AllTuples => numPartitions == 1 case _ => false } - - override def compatibleWith(other: Partitioning): Boolean = false - - override def guarantees(other: Partitioning): Boolean = false } +case class UnknownPartitioning(numPartitions: Int) extends Partitioning + /** * Represents a partitioning where rows are distributed evenly across output partitions * by starting from a random target partition number and distributing rows in a round-robin * fashion. This partitioning is used when implementing the DataFrame.repartition() operator. 
*/ -case class RoundRobinPartitioning(numPartitions: Int) extends Partitioning { - override def satisfies(required: Distribution): Boolean = required match { - case UnspecifiedDistribution => true - case _ => false - } - - override def compatibleWith(other: Partitioning): Boolean = false - - override def guarantees(other: Partitioning): Boolean = false -} +case class RoundRobinPartitioning(numPartitions: Int) extends Partitioning case object SinglePartition extends Partitioning { val numPartitions = 1 override def satisfies(required: Distribution): Boolean = required match { case _: BroadcastDistribution => false - case ClusteredDistribution(_, desiredPartitions) => desiredPartitions.forall(_ == 1) + case ClusteredDistribution(_, Some(requiredNumPartitions)) => requiredNumPartitions == 1 case _ => true } - - override def compatibleWith(other: Partitioning): Boolean = other.numPartitions == 1 - - override def guarantees(other: Partitioning): Boolean = other.numPartitions == 1 } /** @@ -244,22 +205,19 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int) override def nullable: Boolean = false override def dataType: DataType = IntegerType - override def satisfies(required: Distribution): Boolean = required match { - case UnspecifiedDistribution => true - case ClusteredDistribution(requiredClustering, desiredPartitions) => - expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) && - desiredPartitions.forall(_ == numPartitions) // if desiredPartitions = None, returns true - case _ => false - } - - override def compatibleWith(other: Partitioning): Boolean = other match { - case o: HashPartitioning => this.semanticEquals(o) - case _ => false - } - - override def guarantees(other: Partitioning): Boolean = other match { - case o: HashPartitioning => this.semanticEquals(o) - case _ => false + override def satisfies(required: Distribution): Boolean = { + super.satisfies(required) || { + required match { + case h: HashClusteredDistribution => + expressions.length == h.expressions.length && expressions.zip(h.expressions).forall { + case (l, r) => l.semanticEquals(r) + } + case ClusteredDistribution(requiredClustering, requiredNumPartitions) => + expressions.forall(x => requiredClustering.exists(_.semanticEquals(x))) && + (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions) + case _ => false + } + } } /** @@ -288,25 +246,18 @@ case class RangePartitioning(ordering: Seq[SortOrder], numPartitions: Int) override def nullable: Boolean = false override def dataType: DataType = IntegerType - override def satisfies(required: Distribution): Boolean = required match { - case UnspecifiedDistribution => true - case OrderedDistribution(requiredOrdering) => - val minSize = Seq(requiredOrdering.size, ordering.size).min - requiredOrdering.take(minSize) == ordering.take(minSize) - case ClusteredDistribution(requiredClustering, desiredPartitions) => - ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) && - desiredPartitions.forall(_ == numPartitions) // if desiredPartitions = None, returns true - case _ => false - } - - override def compatibleWith(other: Partitioning): Boolean = other match { - case o: RangePartitioning => this.semanticEquals(o) - case _ => false - } - - override def guarantees(other: Partitioning): Boolean = other match { - case o: RangePartitioning => this.semanticEquals(o) - case _ => false + override def satisfies(required: Distribution): Boolean = { + super.satisfies(required) || { + required match 
{ + case OrderedDistribution(requiredOrdering) => + val minSize = Seq(requiredOrdering.size, ordering.size).min + requiredOrdering.take(minSize) == ordering.take(minSize) + case ClusteredDistribution(requiredClustering, requiredNumPartitions) => + ordering.map(_.child).forall(x => requiredClustering.exists(_.semanticEquals(x))) && + (requiredNumPartitions.isEmpty || requiredNumPartitions.get == numPartitions) + case _ => false + } + } } } @@ -347,20 +298,6 @@ case class PartitioningCollection(partitionings: Seq[Partitioning]) override def satisfies(required: Distribution): Boolean = partitionings.exists(_.satisfies(required)) - /** - * Returns true if any `partitioning` of this collection is compatible with - * the given [[Partitioning]]. - */ - override def compatibleWith(other: Partitioning): Boolean = - partitionings.exists(_.compatibleWith(other)) - - /** - * Returns true if any `partitioning` of this collection guarantees - * the given [[Partitioning]]. - */ - override def guarantees(other: Partitioning): Boolean = - partitionings.exists(_.guarantees(other)) - override def toString: String = { partitionings.map(_.toString).mkString("(", " or ", ")") } @@ -377,9 +314,4 @@ case class BroadcastPartitioning(mode: BroadcastMode) extends Partitioning { case BroadcastDistribution(m) if m == mode => true case _ => false } - - override def compatibleWith(other: Partitioning): Boolean = other match { - case BroadcastPartitioning(m) if m == mode => true - case _ => false - } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala deleted file mode 100644 index 5b802ccc637dd..0000000000000 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/PartitioningSuite.scala +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.catalyst - -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.expressions.{InterpretedMutableProjection, Literal} -import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashPartitioning} - -class PartitioningSuite extends SparkFunSuite { - test("HashPartitioning compatibility should be sensitive to expression ordering (SPARK-9785)") { - val expressions = Seq(Literal(2), Literal(3)) - // Consider two HashPartitionings that have the same _set_ of hash expressions but which are - // created with different orderings of those expressions: - val partitioningA = HashPartitioning(expressions, 100) - val partitioningB = HashPartitioning(expressions.reverse, 100) - // These partitionings are not considered equal: - assert(partitioningA != partitioningB) - // However, they both satisfy the same clustered distribution: - val distribution = ClusteredDistribution(expressions) - assert(partitioningA.satisfies(distribution)) - assert(partitioningB.satisfies(distribution)) - // These partitionings compute different hashcodes for the same input row: - def computeHashCode(partitioning: HashPartitioning): Int = { - val hashExprProj = new InterpretedMutableProjection(partitioning.expressions, Seq.empty) - hashExprProj.apply(InternalRow.empty).hashCode() - } - assert(computeHashCode(partitioningA) != computeHashCode(partitioningB)) - // Thus, these partitionings are incompatible: - assert(!partitioningA.compatibleWith(partitioningB)) - assert(!partitioningB.compatibleWith(partitioningA)) - assert(!partitioningA.guarantees(partitioningB)) - assert(!partitioningB.guarantees(partitioningA)) - - // Just to be sure that we haven't cheated by having these methods always return false, - // check that identical partitionings are still compatible with and guarantee each other: - assert(partitioningA === partitioningA) - assert(partitioningA.guarantees(partitioningA)) - assert(partitioningA.compatibleWith(partitioningA)) - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 787c1cfbfb3d8..82300efc01632 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -94,7 +94,21 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ /** Specifies how data is partitioned across different nodes in the cluster. */ def outputPartitioning: Partitioning = UnknownPartitioning(0) // TODO: WRONG WIDTH! - /** Specifies any partition requirements on the input data for this operator. */ + /** + * Specifies the data distribution requirements of all the children for this operator. By default + * it's [[UnspecifiedDistribution]] for each child, which means each child can have any + * distribution. + * + * If an operator overwrites this method, and specifies distribution requirements(excluding + * [[UnspecifiedDistribution]] and [[BroadcastDistribution]]) for more than one child, Spark + * guarantees that the outputs of these children will have same number of partitions, so that the + * operator can safely zip partitions of these children's result RDDs. 
Some operators can leverage + * this guarantee to satisfy some interesting requirement, e.g., non-broadcast joins can specify + * HashClusteredDistribution(a,b) for its left child, and specify HashClusteredDistribution(c,d) + * for its right child, then it's guaranteed that left and right child are co-partitioned by + * a,b/c,d, which means tuples of same value are in the partitions of same index, e.g., + * (a=1,b=2) and (c=1,d=2) are both in the second partition of left and right child. + */ def requiredChildDistribution: Seq[Distribution] = Seq.fill(children.size)(UnspecifiedDistribution) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala index c8e236be28b42..e3d28388c5470 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/exchange/EnsureRequirements.scala @@ -46,23 +46,6 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { if (minNumPostShufflePartitions > 0) Some(minNumPostShufflePartitions) else None } - /** - * Given a required distribution, returns a partitioning that satisfies that distribution. - * @param requiredDistribution The distribution that is required by the operator - * @param numPartitions Used when the distribution doesn't require a specific number of partitions - */ - private def createPartitioning( - requiredDistribution: Distribution, - numPartitions: Int): Partitioning = { - requiredDistribution match { - case AllTuples => SinglePartition - case ClusteredDistribution(clustering, desiredPartitions) => - HashPartitioning(clustering, desiredPartitions.getOrElse(numPartitions)) - case OrderedDistribution(ordering) => RangePartitioning(ordering, numPartitions) - case dist => sys.error(s"Do not know how to satisfy distribution $dist") - } - } - /** * Adds [[ExchangeCoordinator]] to [[ShuffleExchangeExec]]s if adaptive query execution is enabled * and partitioning schemes of these [[ShuffleExchangeExec]]s support [[ExchangeCoordinator]]. @@ -88,8 +71,9 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { // shuffle data when we have more than one children because data generated by // these children may not be partitioned in the same way. // Please see the comment in withCoordinator for more details. - val supportsDistribution = - requiredChildDistributions.forall(_.isInstanceOf[ClusteredDistribution]) + val supportsDistribution = requiredChildDistributions.forall { dist => + dist.isInstanceOf[ClusteredDistribution] || dist.isInstanceOf[HashClusteredDistribution] + } children.length > 1 && supportsDistribution } @@ -142,8 +126,7 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { // // It will be great to introduce a new Partitioning to represent the post-shuffle // partitions when one post-shuffle partition includes multiple pre-shuffle partitions. 
- val targetPartitioning = - createPartitioning(distribution, defaultNumPreShufflePartitions) + val targetPartitioning = distribution.createPartitioning(defaultNumPreShufflePartitions) assert(targetPartitioning.isInstanceOf[HashPartitioning]) ShuffleExchangeExec(targetPartitioning, child, Some(coordinator)) } @@ -162,71 +145,56 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { assert(requiredChildDistributions.length == children.length) assert(requiredChildOrderings.length == children.length) - // Ensure that the operator's children satisfy their output distribution requirements: + // Ensure that the operator's children satisfy their output distribution requirements. children = children.zip(requiredChildDistributions).map { case (child, distribution) if child.outputPartitioning.satisfies(distribution) => child case (child, BroadcastDistribution(mode)) => BroadcastExchangeExec(mode, child) case (child, distribution) => - ShuffleExchangeExec(createPartitioning(distribution, defaultNumPreShufflePartitions), child) + val numPartitions = distribution.requiredNumPartitions + .getOrElse(defaultNumPreShufflePartitions) + ShuffleExchangeExec(distribution.createPartitioning(numPartitions), child) } - // If the operator has multiple children and specifies child output distributions (e.g. join), - // then the children's output partitionings must be compatible: - def requireCompatiblePartitioning(distribution: Distribution): Boolean = distribution match { - case UnspecifiedDistribution => false - case BroadcastDistribution(_) => false + // Get the indexes of children which have specified distribution requirements and need to have + // same number of partitions. + val childrenIndexes = requiredChildDistributions.zipWithIndex.filter { + case (UnspecifiedDistribution, _) => false + case (_: BroadcastDistribution, _) => false case _ => true - } - if (children.length > 1 - && requiredChildDistributions.exists(requireCompatiblePartitioning) - && !Partitioning.allCompatible(children.map(_.outputPartitioning))) { + }.map(_._2) - // First check if the existing partitions of the children all match. This means they are - // partitioned by the same partitioning into the same number of partitions. In that case, - // don't try to make them match `defaultPartitions`, just use the existing partitioning. - val maxChildrenNumPartitions = children.map(_.outputPartitioning.numPartitions).max - val useExistingPartitioning = children.zip(requiredChildDistributions).forall { - case (child, distribution) => - child.outputPartitioning.guarantees( - createPartitioning(distribution, maxChildrenNumPartitions)) + val childrenNumPartitions = + childrenIndexes.map(children(_).outputPartitioning.numPartitions).toSet + + if (childrenNumPartitions.size > 1) { + // Get the number of partitions which is explicitly required by the distributions. + val requiredNumPartitions = { + val numPartitionsSet = childrenIndexes.flatMap { + index => requiredChildDistributions(index).requiredNumPartitions + }.toSet + assert(numPartitionsSet.size <= 1, + s"$operator have incompatible requirements of the number of partitions for its children") + numPartitionsSet.headOption } - children = if (useExistingPartitioning) { - // We do not need to shuffle any child's output. - children - } else { - // We need to shuffle at least one child's output. - // Now, we will determine the number of partitions that will be used by created - // partitioning schemes. 
- val numPartitions = { - // Let's see if we need to shuffle all child's outputs when we use - // maxChildrenNumPartitions. - val shufflesAllChildren = children.zip(requiredChildDistributions).forall { - case (child, distribution) => - !child.outputPartitioning.guarantees( - createPartitioning(distribution, maxChildrenNumPartitions)) - } - // If we need to shuffle all children, we use defaultNumPreShufflePartitions as the - // number of partitions. Otherwise, we use maxChildrenNumPartitions. - if (shufflesAllChildren) defaultNumPreShufflePartitions else maxChildrenNumPartitions - } + val targetNumPartitions = requiredNumPartitions.getOrElse(childrenNumPartitions.max) - children.zip(requiredChildDistributions).map { - case (child, distribution) => - val targetPartitioning = createPartitioning(distribution, numPartitions) - if (child.outputPartitioning.guarantees(targetPartitioning)) { - child - } else { - child match { - // If child is an exchange, we replace it with - // a new one having targetPartitioning. - case ShuffleExchangeExec(_, c, _) => ShuffleExchangeExec(targetPartitioning, c) - case _ => ShuffleExchangeExec(targetPartitioning, child) - } + children = children.zip(requiredChildDistributions).zipWithIndex.map { + case ((child, distribution), index) if childrenIndexes.contains(index) => + if (child.outputPartitioning.numPartitions == targetNumPartitions) { + child + } else { + val defaultPartitioning = distribution.createPartitioning(targetNumPartitions) + child match { + // If child is an exchange, we replace it with a new one having defaultPartitioning. + case ShuffleExchangeExec(_, c, _) => ShuffleExchangeExec(defaultPartitioning, c) + case _ => ShuffleExchangeExec(defaultPartitioning, child) + } } - } + + case ((child, _), _) => child } } @@ -324,10 +292,10 @@ case class EnsureRequirements(conf: SQLConf) extends Rule[SparkPlan] { } def apply(plan: SparkPlan): SparkPlan = plan.transformUp { - case operator @ ShuffleExchangeExec(partitioning, child, _) => - child.children match { - case ShuffleExchangeExec(childPartitioning, baseChild, _)::Nil => - if (childPartitioning.guarantees(partitioning)) child else operator + // TODO: remove this after we create a physical operator for `RepartitionByExpression`. 
+ case operator @ ShuffleExchangeExec(upper: HashPartitioning, child, _) => + child.outputPartitioning match { + case lower: HashPartitioning if upper.semanticEquals(lower) => child case _ => operator } case operator: SparkPlan => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala index 66e8031bb5191..897a4dae39f32 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoinExec.scala @@ -46,7 +46,7 @@ case class ShuffledHashJoinExec( "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil + HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala index 94405410cce90..2de2f30eb05d3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala @@ -78,7 +78,7 @@ case class SortMergeJoinExec( } override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil + HashClusteredDistribution(leftKeys) :: HashClusteredDistribution(rightKeys) :: Nil override def outputOrdering: Seq[SortOrder] = joinType match { // For inner join, orders of both sides keys should be kept. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala index d1bd8a7076863..03d1bbf2ab882 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/objects.scala @@ -456,7 +456,7 @@ case class CoGroupExec( right: SparkPlan) extends BinaryExecNode with ObjectProducerExec { override def requiredChildDistribution: Seq[Distribution] = - ClusteredDistribution(leftGroup) :: ClusteredDistribution(rightGroup) :: Nil + HashClusteredDistribution(leftGroup) :: HashClusteredDistribution(rightGroup) :: Nil override def requiredChildOrdering: Seq[Seq[SortOrder]] = leftGroup.map(SortOrder(_, Ascending)) :: rightGroup.map(SortOrder(_, Ascending)) :: Nil diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index b50642d275ba8..f8b26f5b28cc7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -260,11 +260,16 @@ class PlannerSuite extends SharedSQLContext { // do they satisfy the distribution requirements? As a result, we need at least four test cases. 
private def assertDistributionRequirementsAreSatisfied(outputPlan: SparkPlan): Unit = { - if (outputPlan.children.length > 1 - && outputPlan.requiredChildDistribution.toSet != Set(UnspecifiedDistribution)) { - val childPartitionings = outputPlan.children.map(_.outputPartitioning) - if (!Partitioning.allCompatible(childPartitionings)) { - fail(s"Partitionings are not compatible: $childPartitionings") + if (outputPlan.children.length > 1) { + val childPartitionings = outputPlan.children.zip(outputPlan.requiredChildDistribution) + .filter { + case (_, UnspecifiedDistribution) => false + case (_, _: BroadcastDistribution) => false + case _ => true + }.map(_._1.outputPartitioning) + + if (childPartitionings.map(_.numPartitions).toSet.size > 1) { + fail(s"Partitionings doesn't have same number of partitions: $childPartitionings") } } outputPlan.children.zip(outputPlan.requiredChildDistribution).foreach { @@ -274,40 +279,7 @@ class PlannerSuite extends SharedSQLContext { } } - test("EnsureRequirements with incompatible child partitionings which satisfy distribution") { - // Consider an operator that requires inputs that are clustered by two expressions (e.g. - // sort merge join where there are multiple columns in the equi-join condition) - val clusteringA = Literal(1) :: Nil - val clusteringB = Literal(2) :: Nil - val distribution = ClusteredDistribution(clusteringA ++ clusteringB) - // Say that the left and right inputs are each partitioned by _one_ of the two join columns: - val leftPartitioning = HashPartitioning(clusteringA, 1) - val rightPartitioning = HashPartitioning(clusteringB, 1) - // Individually, each input's partitioning satisfies the clustering distribution: - assert(leftPartitioning.satisfies(distribution)) - assert(rightPartitioning.satisfies(distribution)) - // However, these partitionings are not compatible with each other, so we still need to - // repartition both inputs prior to performing the join: - assert(!leftPartitioning.compatibleWith(rightPartitioning)) - assert(!rightPartitioning.compatibleWith(leftPartitioning)) - val inputPlan = DummySparkPlan( - children = Seq( - DummySparkPlan(outputPartitioning = leftPartitioning), - DummySparkPlan(outputPartitioning = rightPartitioning) - ), - requiredChildDistribution = Seq(distribution, distribution), - requiredChildOrdering = Seq(Seq.empty, Seq.empty) - ) - val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) - assertDistributionRequirementsAreSatisfied(outputPlan) - if (outputPlan.collect { case e: ShuffleExchangeExec => true }.isEmpty) { - fail(s"Exchange should have been added:\n$outputPlan") - } - } - test("EnsureRequirements with child partitionings with different numbers of output partitions") { - // This is similar to the previous test, except it checks that partitionings are not compatible - // unless they produce the same number of partitions. 
val clustering = Literal(1) :: Nil val distribution = ClusteredDistribution(clustering) val inputPlan = DummySparkPlan( @@ -386,18 +358,15 @@ class PlannerSuite extends SharedSQLContext { } } - test("EnsureRequirements eliminates Exchange if child has Exchange with same partitioning") { + test("EnsureRequirements eliminates Exchange if child has same partitioning") { val distribution = ClusteredDistribution(Literal(1) :: Nil) - val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 5) - val childPartitioning = HashPartitioning(Literal(2) :: Nil, 5) - assert(!childPartitioning.satisfies(distribution)) - val inputPlan = ShuffleExchangeExec(finalPartitioning, - DummySparkPlan( - children = DummySparkPlan(outputPartitioning = childPartitioning) :: Nil, - requiredChildDistribution = Seq(distribution), - requiredChildOrdering = Seq(Seq.empty)), - None) + val partitioning = HashPartitioning(Literal(1) :: Nil, 5) + assert(partitioning.satisfies(distribution)) + val inputPlan = ShuffleExchangeExec( + partitioning, + DummySparkPlan(outputPartitioning = partitioning), + None) val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) if (outputPlan.collect { case e: ShuffleExchangeExec => true }.size == 2) { @@ -407,17 +376,13 @@ class PlannerSuite extends SharedSQLContext { test("EnsureRequirements does not eliminate Exchange with different partitioning") { val distribution = ClusteredDistribution(Literal(1) :: Nil) - // Number of partitions differ - val finalPartitioning = HashPartitioning(Literal(1) :: Nil, 8) - val childPartitioning = HashPartitioning(Literal(2) :: Nil, 5) - assert(!childPartitioning.satisfies(distribution)) - val inputPlan = ShuffleExchangeExec(finalPartitioning, - DummySparkPlan( - children = DummySparkPlan(outputPartitioning = childPartitioning) :: Nil, - requiredChildDistribution = Seq(distribution), - requiredChildOrdering = Seq(Seq.empty)), - None) + val partitioning = HashPartitioning(Literal(2) :: Nil, 5) + assert(!partitioning.satisfies(distribution)) + val inputPlan = ShuffleExchangeExec( + partitioning, + DummySparkPlan(outputPartitioning = partitioning), + None) val outputPlan = EnsureRequirements(spark.sessionState.conf).apply(inputPlan) assertDistributionRequirementsAreSatisfied(outputPlan) if (outputPlan.collect { case e: ShuffleExchangeExec => true }.size == 1) { From 40b983c3b44b6771f07302ce87987fa4716b5ebf Mon Sep 17 00:00:00 2001 From: Xianjin YE Date: Mon, 8 Jan 2018 23:49:07 +0800 Subject: [PATCH 286/356] [SPARK-22952][CORE] Deprecate stageAttemptId in favour of stageAttemptNumber ## What changes were proposed in this pull request? 1. Deprecate attemptId in StageInfo and add `def attemptNumber() = attemptId` 2. Replace usage of stageAttemptId with stageAttemptNumber ## How was this patch tested? I manually checked the compiler warning info Author: Xianjin YE Closes #20178 from advancedxy/SPARK-22952. 
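
For reference, the deprecation pattern this change applies to `StageInfo` boils down to the following sketch (condensed for illustration; the real class has many more constructor parameters, and the diff below is authoritative):

```scala
class StageInfo(
    val stageId: Int,
    // Old name kept readable for compatibility, but flagged so callers migrate.
    @deprecated("Use attemptNumber instead", "2.3.0") val attemptId: Int) {

  // New accessor; scheduler, listener and UI call sites below switch to this name.
  def attemptNumber(): Int = attemptId
}
```

Call sites then read `stage.latestInfo.attemptNumber` instead of `stage.latestInfo.attemptId`.
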
--- .../apache/spark/scheduler/DAGScheduler.scala | 15 +++--- .../apache/spark/scheduler/StageInfo.scala | 4 +- .../spark/scheduler/StatsReportListener.scala | 2 +- .../spark/status/AppStatusListener.scala | 7 +-- .../org/apache/spark/status/LiveEntity.scala | 4 +- .../spark/ui/scope/RDDOperationGraph.scala | 2 +- .../org/apache/spark/util/JsonProtocol.scala | 2 +- .../spark/status/AppStatusListenerSuite.scala | 54 ++++++++++--------- .../execution/ui/SQLAppStatusListener.scala | 2 +- 9 files changed, 51 insertions(+), 41 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index c2498d4808e91..199937b8c27af 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -815,7 +815,8 @@ class DAGScheduler( private[scheduler] def handleBeginEvent(task: Task[_], taskInfo: TaskInfo) { // Note that there is a chance that this task is launched after the stage is cancelled. // In that case, we wouldn't have the stage anymore in stageIdToStage. - val stageAttemptId = stageIdToStage.get(task.stageId).map(_.latestInfo.attemptId).getOrElse(-1) + val stageAttemptId = + stageIdToStage.get(task.stageId).map(_.latestInfo.attemptNumber).getOrElse(-1) listenerBus.post(SparkListenerTaskStart(task.stageId, stageAttemptId, taskInfo)) } @@ -1050,7 +1051,7 @@ class DAGScheduler( val locs = taskIdToLocations(id) val part = stage.rdd.partitions(id) stage.pendingPartitions += id - new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, + new ShuffleMapTask(stage.id, stage.latestInfo.attemptNumber, taskBinary, part, locs, properties, serializedTaskMetrics, Option(jobId), Option(sc.applicationId), sc.applicationAttemptId) } @@ -1060,7 +1061,7 @@ class DAGScheduler( val p: Int = stage.partitions(id) val part = stage.rdd.partitions(p) val locs = taskIdToLocations(id) - new ResultTask(stage.id, stage.latestInfo.attemptId, + new ResultTask(stage.id, stage.latestInfo.attemptNumber, taskBinary, part, locs, id, properties, serializedTaskMetrics, Option(jobId), Option(sc.applicationId), sc.applicationAttemptId) } @@ -1076,7 +1077,7 @@ class DAGScheduler( logInfo(s"Submitting ${tasks.size} missing tasks from $stage (${stage.rdd}) (first 15 " + s"tasks are for partitions ${tasks.take(15).map(_.partitionId)})") taskScheduler.submitTasks(new TaskSet( - tasks.toArray, stage.id, stage.latestInfo.attemptId, jobId, properties)) + tasks.toArray, stage.id, stage.latestInfo.attemptNumber, jobId, properties)) } else { // Because we posted SparkListenerStageSubmitted earlier, we should mark // the stage as completed here in case there are no tasks to run @@ -1245,7 +1246,7 @@ class DAGScheduler( val status = event.result.asInstanceOf[MapStatus] val execId = status.location.executorId logDebug("ShuffleMapTask finished on " + execId) - if (stageIdToStage(task.stageId).latestInfo.attemptId == task.stageAttemptId) { + if (stageIdToStage(task.stageId).latestInfo.attemptNumber == task.stageAttemptId) { // This task was for the currently running attempt of the stage. 
Since the task // completed successfully from the perspective of the TaskSetManager, mark it as // no longer pending (the TaskSetManager may consider the task complete even @@ -1324,10 +1325,10 @@ class DAGScheduler( val failedStage = stageIdToStage(task.stageId) val mapStage = shuffleIdToMapStage(shuffleId) - if (failedStage.latestInfo.attemptId != task.stageAttemptId) { + if (failedStage.latestInfo.attemptNumber != task.stageAttemptId) { logInfo(s"Ignoring fetch failure from $task as it's from $failedStage attempt" + s" ${task.stageAttemptId} and there is a more recent attempt for that stage " + - s"(attempt ID ${failedStage.latestInfo.attemptId}) running") + s"(attempt ${failedStage.latestInfo.attemptNumber}) running") } else { // It is likely that we receive multiple FetchFailed for a single stage (because we have // multiple tasks running concurrently on different executors). In that case, it is diff --git a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala index c513ed36d1680..903e25b7986f2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StageInfo.scala @@ -30,7 +30,7 @@ import org.apache.spark.storage.RDDInfo @DeveloperApi class StageInfo( val stageId: Int, - val attemptId: Int, + @deprecated("Use attemptNumber instead", "2.3.0") val attemptId: Int, val name: String, val numTasks: Int, val rddInfos: Seq[RDDInfo], @@ -56,6 +56,8 @@ class StageInfo( completionTime = Some(System.currentTimeMillis) } + def attemptNumber(): Int = attemptId + private[spark] def getStatusString: String = { if (completionTime.isDefined) { if (failureReason.isDefined) { diff --git a/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala b/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala index 3c8cab7504c17..3c7af4f6146fa 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/StatsReportListener.scala @@ -79,7 +79,7 @@ class StatsReportListener extends SparkListener with Logging { x => info.completionTime.getOrElse(System.currentTimeMillis()) - x ).getOrElse("-") - s"Stage(${info.stageId}, ${info.attemptId}); Name: '${info.name}'; " + + s"Stage(${info.stageId}, ${info.attemptNumber}); Name: '${info.name}'; " + s"Status: ${info.getStatusString}$failureReason; numTasks: ${info.numTasks}; " + s"Took: $timeTaken msec" } diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 487a782e865e8..88b75ddd5993a 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -529,7 +529,8 @@ private[spark] class AppStatusListener( } override def onStageCompleted(event: SparkListenerStageCompleted): Unit = { - val maybeStage = Option(liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptId))) + val maybeStage = + Option(liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptNumber))) maybeStage.foreach { stage => val now = System.nanoTime() stage.info = event.stageInfo @@ -785,7 +786,7 @@ private[spark] class AppStatusListener( } private def getOrCreateStage(info: StageInfo): LiveStage = { - val stage = liveStages.computeIfAbsent((info.stageId, info.attemptId), + val stage = liveStages.computeIfAbsent((info.stageId, 
info.attemptNumber), new Function[(Int, Int), LiveStage]() { override def apply(key: (Int, Int)): LiveStage = new LiveStage() }) @@ -912,7 +913,7 @@ private[spark] class AppStatusListener( private def cleanupTasks(stage: LiveStage): Unit = { val countToDelete = calculateNumberToRemove(stage.savedTasks.get(), maxTasksPerStage).toInt if (countToDelete > 0) { - val stageKey = Array(stage.info.stageId, stage.info.attemptId) + val stageKey = Array(stage.info.stageId, stage.info.attemptNumber) val view = kvstore.view(classOf[TaskDataWrapper]).index("stage").first(stageKey) .last(stageKey) diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala index 52e83f250d34e..305c2fafa6aac 100644 --- a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -412,14 +412,14 @@ private class LiveStage extends LiveEntity { def executorSummary(executorId: String): LiveExecutorStageSummary = { executorSummaries.getOrElseUpdate(executorId, - new LiveExecutorStageSummary(info.stageId, info.attemptId, executorId)) + new LiveExecutorStageSummary(info.stageId, info.attemptNumber, executorId)) } def toApi(): v1.StageData = { new v1.StageData( status, info.stageId, - info.attemptId, + info.attemptNumber, info.numTasks, activeTasks, diff --git a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala index 827a8637b9bd2..948858224d724 100644 --- a/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala +++ b/core/src/main/scala/org/apache/spark/ui/scope/RDDOperationGraph.scala @@ -116,7 +116,7 @@ private[spark] object RDDOperationGraph extends Logging { // Use a special prefix here to differentiate this cluster from other operation clusters val stageClusterId = STAGE_CLUSTER_PREFIX + stage.stageId val stageClusterName = s"Stage ${stage.stageId}" + - { if (stage.attemptId == 0) "" else s" (attempt ${stage.attemptId})" } + { if (stage.attemptNumber == 0) "" else s" (attempt ${stage.attemptNumber})" } val rootCluster = new RDDOperationCluster(stageClusterId, stageClusterName) var rootNodeCount = 0 diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 5e60218c5740b..ff83301d631c4 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -263,7 +263,7 @@ private[spark] object JsonProtocol { val completionTime = stageInfo.completionTime.map(JInt(_)).getOrElse(JNothing) val failureReason = stageInfo.failureReason.map(JString(_)).getOrElse(JNothing) ("Stage ID" -> stageInfo.stageId) ~ - ("Stage Attempt ID" -> stageInfo.attemptId) ~ + ("Stage Attempt ID" -> stageInfo.attemptNumber) ~ ("Stage Name" -> stageInfo.name) ~ ("Number of Tasks" -> stageInfo.numTasks) ~ ("RDD Info" -> rddInfo) ~ diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index 997c7de8dd02b..b8c84e24c2c3f 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -195,7 +195,9 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val s1Tasks = createTasks(4, execIds) s1Tasks.foreach { task => - 
listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, + stages.head.attemptNumber, + task)) } assert(store.count(classOf[TaskDataWrapper]) === s1Tasks.size) @@ -213,10 +215,11 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { check[TaskDataWrapper](task.taskId) { wrapper => assert(wrapper.info.taskId === task.taskId) assert(wrapper.stageId === stages.head.stageId) - assert(wrapper.stageAttemptId === stages.head.attemptId) - assert(Arrays.equals(wrapper.stage, Array(stages.head.stageId, stages.head.attemptId))) + assert(wrapper.stageAttemptId === stages.head.attemptNumber) + assert(Arrays.equals(wrapper.stage, Array(stages.head.stageId, stages.head.attemptNumber))) - val runtime = Array[AnyRef](stages.head.stageId: JInteger, stages.head.attemptId: JInteger, + val runtime = Array[AnyRef](stages.head.stageId: JInteger, + stages.head.attemptNumber: JInteger, -1L: JLong) assert(Arrays.equals(wrapper.runtime, runtime)) @@ -237,7 +240,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { Some(1L), None, true, false, None) listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate( task.executorId, - Seq((task.taskId, stages.head.stageId, stages.head.attemptId, Seq(accum))))) + Seq((task.taskId, stages.head.stageId, stages.head.attemptNumber, Seq(accum))))) } check[StageDataWrapper](key(stages.head)) { stage => @@ -254,12 +257,12 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Fail one of the tasks, re-start it. time += 1 s1Tasks.head.markFinished(TaskState.FAILED, time) - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", TaskResultLost, s1Tasks.head, null)) time += 1 val reattempt = newAttempt(s1Tasks.head, nextTaskId()) - listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptNumber, reattempt)) assert(store.count(classOf[TaskDataWrapper]) === s1Tasks.size + 1) @@ -289,7 +292,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val killed = s1Tasks.drop(1).head killed.finishTime = time killed.failed = true - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", TaskKilled("killed"), killed, null)) check[JobDataWrapper](1) { job => @@ -311,13 +314,13 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 val denied = newAttempt(killed, nextTaskId()) val denyReason = TaskCommitDenied(1, 1, 1) - listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptNumber, denied)) time += 1 denied.finishTime = time denied.failed = true - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", denyReason, denied, null)) check[JobDataWrapper](1) { job => @@ -337,7 +340,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Start a new attempt. 
val reattempt2 = newAttempt(denied, nextTaskId()) - listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptId, + listener.onTaskStart(SparkListenerTaskStart(stages.head.stageId, stages.head.attemptNumber, reattempt2)) // Succeed all tasks in stage 1. @@ -350,7 +353,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 pending.foreach { task => task.markFinished(TaskState.FINISHED, time) - listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.head.stageId, stages.head.attemptNumber, "taskType", Success, task, s1Metrics)) } @@ -414,13 +417,15 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 val s2Tasks = createTasks(4, execIds) s2Tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(stages.last.stageId, stages.last.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(stages.last.stageId, + stages.last.attemptNumber, + task)) } time += 1 s2Tasks.foreach { task => task.markFinished(TaskState.FAILED, time) - listener.onTaskEnd(SparkListenerTaskEnd(stages.last.stageId, stages.last.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(stages.last.stageId, stages.last.attemptNumber, "taskType", TaskResultLost, task, null)) } @@ -455,7 +460,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // - Re-submit stage 2, all tasks, and succeed them and the stage. val oldS2 = stages.last - val newS2 = new StageInfo(oldS2.stageId, oldS2.attemptId + 1, oldS2.name, oldS2.numTasks, + val newS2 = new StageInfo(oldS2.stageId, oldS2.attemptNumber + 1, oldS2.name, oldS2.numTasks, oldS2.rddInfos, oldS2.parentIds, oldS2.details, oldS2.taskMetrics) time += 1 @@ -466,14 +471,14 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val newS2Tasks = createTasks(4, execIds) newS2Tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(newS2.stageId, newS2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(newS2.stageId, newS2.attemptNumber, task)) } time += 1 newS2Tasks.foreach { task => task.markFinished(TaskState.FINISHED, time) - listener.onTaskEnd(SparkListenerTaskEnd(newS2.stageId, newS2.attemptId, "taskType", Success, - task, null)) + listener.onTaskEnd(SparkListenerTaskEnd(newS2.stageId, newS2.attemptNumber, "taskType", + Success, task, null)) } time += 1 @@ -522,14 +527,15 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { val j2s2Tasks = createTasks(4, execIds) j2s2Tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(j2Stages.last.stageId, j2Stages.last.attemptId, + listener.onTaskStart(SparkListenerTaskStart(j2Stages.last.stageId, + j2Stages.last.attemptNumber, task)) } time += 1 j2s2Tasks.foreach { task => task.markFinished(TaskState.FINISHED, time) - listener.onTaskEnd(SparkListenerTaskEnd(j2Stages.last.stageId, j2Stages.last.attemptId, + listener.onTaskEnd(SparkListenerTaskEnd(j2Stages.last.stageId, j2Stages.last.attemptNumber, "taskType", Success, task, null)) } @@ -919,13 +925,13 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { time += 1 val tasks = createTasks(2, Array("1")) tasks.foreach { task => - listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptNumber, task)) } assert(store.count(classOf[TaskDataWrapper]) === 2) // Start a 3rd task. 
The finished tasks should be deleted. createTasks(1, Array("1")).foreach { task => - listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptNumber, task)) } assert(store.count(classOf[TaskDataWrapper]) === 2) intercept[NoSuchElementException] { @@ -934,7 +940,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { // Start a 4th task. The first task should be deleted, even if it's still running. createTasks(1, Array("1")).foreach { task => - listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptId, task)) + listener.onTaskStart(SparkListenerTaskStart(attempt2.stageId, attempt2.attemptNumber, task)) } assert(store.count(classOf[TaskDataWrapper]) === 2) intercept[NoSuchElementException] { @@ -960,7 +966,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } } - private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptId) + private def key(stage: StageInfo): Array[Int] = Array(stage.stageId, stage.attemptNumber) private def check[T: ClassTag](key: Any)(fn: T => Unit): Unit = { val value = store.read(classTag[T].runtimeClass, key).asInstanceOf[T] diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala index d8adbe7bee13e..73a105266e1c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusListener.scala @@ -99,7 +99,7 @@ class SQLAppStatusListener( // Reset the metrics tracking object for the new attempt. Option(stageMetrics.get(event.stageInfo.stageId)).foreach { metrics => metrics.taskMetrics.clear() - metrics.attemptId = event.stageInfo.attemptId + metrics.attemptId = event.stageInfo.attemptNumber } } From eed82a0b211352215316ec70dc48aefc013ad0b2 Mon Sep 17 00:00:00 2001 From: foxish Date: Mon, 8 Jan 2018 13:01:45 -0800 Subject: [PATCH 287/356] [SPARK-22992][K8S] Remove assumption of the DNS domain ## What changes were proposed in this pull request? Remove the use of FQDN to access the driver because it assumes that it's set up in a DNS zone - `cluster.local` which is common but not ubiquitous Note that we already access the in-cluster API server through `kubernetes.default.svc`, so, by extension, this should work as well. The alternative is to introduce DNS zones for both of those addresses. ## How was this patch tested? Unit tests cc vanzin liyinan926 mridulm mccheah Author: foxish Closes #20187 from foxish/cluster.local. 
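
The change itself is small; a sketch of the before/after hostname construction (the service name and namespace below are made-up placeholders, not values from the patch):

```scala
val serviceName = "spark-10000-driver-svc"  // hypothetical resolved service name
val namespace   = "my-namespace"

// Before: assumes the cluster's DNS zone is `cluster.local`.
val fqdnHostname   = s"$serviceName.$namespace.svc.cluster.local"
// After: zone-agnostic, resolved through the cluster's default DNS search path.
val driverHostname = s"$serviceName.$namespace.svc"
```
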
--- .../deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala | 2 +- .../k8s/submit/steps/DriverServiceBootstrapStepSuite.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala index eb594e4f16ec0..34af7cde6c1a9 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala @@ -83,7 +83,7 @@ private[spark] class DriverServiceBootstrapStep( .build() val namespace = sparkConf.get(KUBERNETES_NAMESPACE) - val driverHostname = s"${driverService.getMetadata.getName}.$namespace.svc.cluster.local" + val driverHostname = s"${driverService.getMetadata.getName}.$namespace.svc" val resolvedSparkConf = driverSpec.driverSparkConf.clone() .set(DRIVER_HOST_KEY, driverHostname) .set("spark.driver.port", driverPort.toString) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala index 006ce2668f8a0..78c8c3ba1afbd 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala @@ -85,7 +85,7 @@ class DriverServiceBootstrapStepSuite extends SparkFunSuite with BeforeAndAfter val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) val expectedServiceName = SHORT_RESOURCE_NAME_PREFIX + DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX - val expectedHostName = s"$expectedServiceName.my-namespace.svc.cluster.local" + val expectedHostName = s"$expectedServiceName.my-namespace.svc" verifySparkConfHostNames(resolvedDriverSpec.driverSparkConf, expectedHostName) } @@ -120,7 +120,7 @@ class DriverServiceBootstrapStepSuite extends SparkFunSuite with BeforeAndAfter val driverService = resolvedDriverSpec.otherKubernetesResources.head.asInstanceOf[Service] val expectedServiceName = s"spark-10000${DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX}" assert(driverService.getMetadata.getName === expectedServiceName) - val expectedHostName = s"$expectedServiceName.my-namespace.svc.cluster.local" + val expectedHostName = s"$expectedServiceName.my-namespace.svc" verifySparkConfHostNames(resolvedDriverSpec.driverSparkConf, expectedHostName) } From 4f7e75883436069c2d9028c4cd5daa78e8d59560 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Mon, 8 Jan 2018 13:24:08 -0800 Subject: [PATCH 288/356] [SPARK-22912] v2 data source support in MicroBatchExecution ## What changes were proposed in this pull request? Support for v2 data sources in microbatch streaming. ## How was this patch tested? A very basic new unit test on the toy v2 implementation of rate source. Once we have a v1 source fully migrated to v2, we'll need to do more detailed compatibility testing. Author: Jose Torres Closes #20097 from jose-torres/v2-impl. 
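
At a high level, for v2 sources the new microbatch path polls a `MicroBatchReader` for its latest end offset rather than calling `Source.getOffset`. A rough, hypothetical helper that mirrors the calls added in `MicroBatchExecution` below (`OffsetPollingSketch` and `latestEndOffset` are names invented for this sketch, not part of the patch):

```scala
import java.util.Optional

import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset}

object OffsetPollingSketch {
  // Leave the end of the range unset so the reader infers the latest available
  // offset, then read it back; the exact range is pinned again when the batch
  // actually executes.
  def latestEndOffset(reader: MicroBatchReader, start: Option[Offset]): Offset = {
    reader.setOffsetRange(Optional.ofNullable(start.orNull), Optional.empty())
    reader.getEndOffset
  }
}
```
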
--- ...pache.spark.sql.sources.DataSourceRegister | 1 + .../datasources/v2/DataSourceV2Relation.scala | 10 ++ .../streaming/MicroBatchExecution.scala | 112 ++++++++++++++---- .../streaming/ProgressReporter.scala | 6 +- .../streaming/RateSourceProvider.scala | 10 +- .../execution/streaming/StreamExecution.scala | 4 +- .../streaming/StreamingRelation.scala | 4 +- .../continuous/ContinuousExecution.scala | 4 +- .../ContinuousRateStreamSource.scala | 17 +-- .../sources/RateStreamSourceV2.scala | 31 ++++- .../sql/streaming/DataStreamReader.scala | 25 +++- .../sql/streaming/StreamingQueryManager.scala | 24 ++-- .../streaming/RateSourceV2Suite.scala | 68 +++++++++-- .../spark/sql/streaming/StreamTest.scala | 2 +- .../continuous/ContinuousSuite.scala | 2 +- 15 files changed, 241 insertions(+), 79 deletions(-) diff --git a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister index 6cdfe2fae5642..0259c774bbf4a 100644 --- a/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister +++ b/sql/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister @@ -7,3 +7,4 @@ org.apache.spark.sql.execution.datasources.text.TextFileFormat org.apache.spark.sql.execution.streaming.ConsoleSinkProvider org.apache.spark.sql.execution.streaming.TextSocketSourceProvider org.apache.spark.sql.execution.streaming.RateSourceProvider +org.apache.spark.sql.execution.streaming.sources.RateSourceProviderV2 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala index 7eb99a645001a..cba20dd902007 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala @@ -35,6 +35,16 @@ case class DataSourceV2Relation( } } +/** + * A specialization of DataSourceV2Relation with the streaming bit set to true. Otherwise identical + * to the non-streaming relation. 
+ */ +class StreamingDataSourceV2Relation( + fullOutput: Seq[AttributeReference], + reader: DataSourceV2Reader) extends DataSourceV2Relation(fullOutput, reader) { + override def isStreaming: Boolean = true +} + object DataSourceV2Relation { def apply(reader: DataSourceV2Reader): DataSourceV2Relation = { new DataSourceV2Relation(reader.readSchema().toAttributes, reader) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala index 9a7a13fcc5806..42240eeb58d4b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala @@ -17,6 +17,9 @@ package org.apache.spark.sql.execution.streaming +import java.util.Optional + +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} import org.apache.spark.sql.{Dataset, SparkSession} @@ -24,7 +27,10 @@ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.sources.v2.streaming.MicroBatchReadSupport +import org.apache.spark.sql.execution.datasources.v2.{StreamingDataSourceV2Relation, WriteToDataSourceV2} +import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.v2.streaming.{MicroBatchReadSupport, MicroBatchWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset => OffsetV2} import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime, Trigger} import org.apache.spark.util.{Clock, Utils} @@ -33,10 +39,11 @@ class MicroBatchExecution( name: String, checkpointRoot: String, analyzedPlan: LogicalPlan, - sink: Sink, + sink: BaseStreamingSink, trigger: Trigger, triggerClock: Clock, outputMode: OutputMode, + extraOptions: Map[String, String], deleteCheckpointOnStop: Boolean) extends StreamExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, @@ -57,6 +64,13 @@ class MicroBatchExecution( var nextSourceId = 0L val toExecutionRelationMap = MutableMap[StreamingRelation, StreamingExecutionRelation]() val v2ToExecutionRelationMap = MutableMap[StreamingRelationV2, StreamingExecutionRelation]() + // We transform each distinct streaming relation into a StreamingExecutionRelation, keeping a + // map as we go to ensure each identical relation gets the same StreamingExecutionRelation + // object. For each microbatch, the StreamingExecutionRelation will be replaced with a logical + // plan for the data within that batch. + // Note that we have to use the previous `output` as attributes in StreamingExecutionRelation, + // since the existing logical plan has already used those attributes. The per-microbatch + // transformation is responsible for replacing attributes with their final values. 
val _logicalPlan = analyzedPlan.transform { case streamingRelation@StreamingRelation(dataSource, _, output) => toExecutionRelationMap.getOrElseUpdate(streamingRelation, { @@ -64,19 +78,26 @@ class MicroBatchExecution( val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" val source = dataSource.createSource(metadataPath) nextSourceId += 1 - // We still need to use the previous `output` instead of `source.schema` as attributes in - // "df.logicalPlan" has already used attributes of the previous `output`. StreamingExecutionRelation(source, output)(sparkSession) }) - case s @ StreamingRelationV2(v2DataSource, _, _, output, v1DataSource) - if !v2DataSource.isInstanceOf[MicroBatchReadSupport] => + case s @ StreamingRelationV2(source: MicroBatchReadSupport, _, options, output, _) => + v2ToExecutionRelationMap.getOrElseUpdate(s, { + // Materialize source to avoid creating it in every batch + val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" + val reader = source.createMicroBatchReader( + Optional.empty(), // user specified schema + metadataPath, + new DataSourceV2Options(options.asJava)) + nextSourceId += 1 + StreamingExecutionRelation(reader, output)(sparkSession) + }) + case s @ StreamingRelationV2(_, _, _, output, v1Relation) => v2ToExecutionRelationMap.getOrElseUpdate(s, { // Materialize source to avoid creating it in every batch val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId" - val source = v1DataSource.createSource(metadataPath) + assert(v1Relation.isDefined, "v2 execution didn't match but v1 was unavailable") + val source = v1Relation.get.dataSource.createSource(metadataPath) nextSourceId += 1 - // We still need to use the previous `output` instead of `source.schema` as attributes in - // "df.logicalPlan" has already used attributes of the previous `output`. StreamingExecutionRelation(source, output)(sparkSession) }) } @@ -192,7 +213,8 @@ class MicroBatchExecution( source.getBatch(start, end) } case nonV1Tuple => - throw new IllegalStateException(s"Unexpected V2 source in $nonV1Tuple") + // The V2 API does not have the same edge case requiring getBatch to be called + // here, so we do nothing here. } currentBatchId = latestCommittedBatchId + 1 committedOffsets ++= availableOffsets @@ -236,14 +258,27 @@ class MicroBatchExecution( val hasNewData = { awaitProgressLock.lock() try { - val latestOffsets: Map[Source, Option[Offset]] = uniqueSources.map { + // Generate a map from each unique source to the next available offset. + val latestOffsets: Map[BaseStreamingSource, Option[Offset]] = uniqueSources.map { case s: Source => updateStatusMessage(s"Getting offsets from $s") reportTimeTaken("getOffset") { (s, s.getOffset) } + case s: MicroBatchReader => + updateStatusMessage(s"Getting offsets from $s") + reportTimeTaken("getOffset") { + // Once v1 streaming source execution is gone, we can refactor this away. + // For now, we set the range here to get the source to infer the available end offset, + // get that offset, and then set the range again when we later execute. 
+ s.setOffsetRange( + toJava(availableOffsets.get(s).map(off => s.deserializeOffset(off.json))), + Optional.empty()) + + (s, Some(s.getEndOffset)) + } }.toMap - availableOffsets ++= latestOffsets.filter { case (s, o) => o.nonEmpty }.mapValues(_.get) + availableOffsets ++= latestOffsets.filter { case (_, o) => o.nonEmpty }.mapValues(_.get) if (dataAvailable) { true @@ -317,6 +352,8 @@ class MicroBatchExecution( if (prevBatchOff.isDefined) { prevBatchOff.get.toStreamProgress(sources).foreach { case (src: Source, off) => src.commit(off) + case (reader: MicroBatchReader, off) => + reader.commit(reader.deserializeOffset(off.json)) } } else { throw new IllegalStateException(s"batch $currentBatchId doesn't exist") @@ -357,7 +394,16 @@ class MicroBatchExecution( s"DataFrame returned by getBatch from $source did not have isStreaming=true\n" + s"${batch.queryExecution.logical}") logDebug(s"Retrieving data from $source: $current -> $available") - Some(source -> batch) + Some(source -> batch.logicalPlan) + case (reader: MicroBatchReader, available) + if committedOffsets.get(reader).map(_ != available).getOrElse(true) => + val current = committedOffsets.get(reader).map(off => reader.deserializeOffset(off.json)) + reader.setOffsetRange( + toJava(current), + Optional.of(available.asInstanceOf[OffsetV2])) + logDebug(s"Retrieving data from $reader: $current -> $available") + Some(reader -> + new StreamingDataSourceV2Relation(reader.readSchema().toAttributes, reader)) case _ => None } } @@ -365,15 +411,14 @@ class MicroBatchExecution( // A list of attributes that will need to be updated. val replacements = new ArrayBuffer[(Attribute, Attribute)] // Replace sources in the logical plan with data that has arrived since the last batch. - val withNewSources = logicalPlan transform { + val newBatchesPlan = logicalPlan transform { case StreamingExecutionRelation(source, output) => - newData.get(source).map { data => - val newPlan = data.logicalPlan - assert(output.size == newPlan.output.size, + newData.get(source).map { dataPlan => + assert(output.size == dataPlan.output.size, s"Invalid batch: ${Utils.truncatedString(output, ",")} != " + - s"${Utils.truncatedString(newPlan.output, ",")}") - replacements ++= output.zip(newPlan.output) - newPlan + s"${Utils.truncatedString(dataPlan.output, ",")}") + replacements ++= output.zip(dataPlan.output) + dataPlan }.getOrElse { LocalRelation(output, isStreaming = true) } @@ -381,7 +426,7 @@ class MicroBatchExecution( // Rewire the plan to use the new attributes that were returned by the source. 
val replacementMap = AttributeMap(replacements) - val triggerLogicalPlan = withNewSources transformAllExpressions { + val newAttributePlan = newBatchesPlan transformAllExpressions { case a: Attribute if replacementMap.contains(a) => replacementMap(a).withMetadata(a.metadata) case ct: CurrentTimestamp => @@ -392,6 +437,20 @@ class MicroBatchExecution( cd.dataType, cd.timeZoneId) } + val triggerLogicalPlan = sink match { + case _: Sink => newAttributePlan + case s: MicroBatchWriteSupport => + val writer = s.createMicroBatchWriter( + s"$runId", + currentBatchId, + newAttributePlan.schema, + outputMode, + new DataSourceV2Options(extraOptions.asJava)) + assert(writer.isPresent, "microbatch writer must always be present") + WriteToDataSourceV2(writer.get, newAttributePlan) + case _ => throw new IllegalArgumentException(s"unknown sink type for $sink") + } + reportTimeTaken("queryPlanning") { lastExecution = new IncrementalExecution( sparkSessionToRunBatch, @@ -409,7 +468,12 @@ class MicroBatchExecution( reportTimeTaken("addBatch") { SQLExecution.withNewExecutionId(sparkSessionToRunBatch, lastExecution) { - sink.addBatch(currentBatchId, nextBatch) + sink match { + case s: Sink => s.addBatch(currentBatchId, nextBatch) + case s: MicroBatchWriteSupport => + // This doesn't accumulate any data - it just forces execution of the microbatch writer. + nextBatch.collect() + } } } @@ -421,4 +485,8 @@ class MicroBatchExecution( awaitProgressLock.unlock() } } + + private def toJava(scalaOption: Option[OffsetV2]): Optional[OffsetV2] = { + Optional.ofNullable(scalaOption.orNull) + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala index 1c9043613cb69..d1e5be9c12762 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ProgressReporter.scala @@ -53,7 +53,7 @@ trait ProgressReporter extends Logging { protected def triggerClock: Clock protected def logicalPlan: LogicalPlan protected def lastExecution: QueryExecution - protected def newData: Map[BaseStreamingSource, DataFrame] + protected def newData: Map[BaseStreamingSource, LogicalPlan] protected def availableOffsets: StreamProgress protected def committedOffsets: StreamProgress protected def sources: Seq[BaseStreamingSource] @@ -225,8 +225,8 @@ trait ProgressReporter extends Logging { // // 3. For each source, we sum the metrics of the associated execution plan leaves. 
// - val logicalPlanLeafToSource = newData.flatMap { case (source, df) => - df.logicalPlan.collectLeaves().map { leaf => leaf -> source } + val logicalPlanLeafToSource = newData.flatMap { case (source, logicalPlan) => + logicalPlan.collectLeaves().map { leaf => leaf -> source } } val allLogicalPlanLeaves = lastExecution.logical.collectLeaves() // includes non-streaming val allExecPlanLeaves = lastExecution.executedPlan.collectLeaves() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala index d02cf882b61ac..66eb0169ac1ec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/RateSourceProvider.scala @@ -29,12 +29,12 @@ import org.apache.spark.network.util.JavaUtils import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils} -import org.apache.spark.sql.execution.streaming.continuous.ContinuousRateStreamReader -import org.apache.spark.sql.execution.streaming.sources.RateStreamV2Reader +import org.apache.spark.sql.execution.streaming.continuous.RateStreamContinuousReader +import org.apache.spark.sql.execution.streaming.sources.RateStreamMicroBatchReader import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} import org.apache.spark.sql.sources.v2._ -import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport -import org.apache.spark.sql.sources.v2.streaming.reader.ContinuousReader +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousReader, MicroBatchReader} import org.apache.spark.sql.types._ import org.apache.spark.util.{ManualClock, SystemClock} @@ -112,7 +112,7 @@ class RateSourceProvider extends StreamSourceProvider with DataSourceRegister schema: Optional[StructType], checkpointLocation: String, options: DataSourceV2Options): ContinuousReader = { - new ContinuousRateStreamReader(options) + new RateStreamContinuousReader(options) } override def shortName(): String = "rate" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 3e76bf7b7ca8f..24a8b000df0c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -163,7 +163,7 @@ abstract class StreamExecution( var lastExecution: IncrementalExecution = _ /** Holds the most recent input data for each source. */ - protected var newData: Map[BaseStreamingSource, DataFrame] = _ + protected var newData: Map[BaseStreamingSource, LogicalPlan] = _ @volatile protected var streamDeathCause: StreamingQueryException = null @@ -418,7 +418,7 @@ abstract class StreamExecution( * Blocks the current thread until processing for data from the given `source` has reached at * least the given `Offset`. This method is intended for use primarily when writing tests. 
*/ - private[sql] def awaitOffset(source: Source, newOffset: Offset): Unit = { + private[sql] def awaitOffset(source: BaseStreamingSource, newOffset: Offset): Unit = { assertAwaitThread() def notDone = { val localCommittedOffsets = committedOffsets diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala index a9d50e3a112e7..a0ee683a895d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingRelation.scala @@ -61,7 +61,7 @@ case class StreamingRelation(dataSource: DataSource, sourceName: String, output: * [[org.apache.spark.sql.catalyst.plans.logical.LogicalPlan]]. */ case class StreamingExecutionRelation( - source: Source, + source: BaseStreamingSource, output: Seq[Attribute])(session: SparkSession) extends LeafNode { @@ -92,7 +92,7 @@ case class StreamingRelationV2( sourceName: String, extraOptions: Map[String, String], output: Seq[Attribute], - v1DataSource: DataSource)(session: SparkSession) + v1Relation: Option[StreamingRelation])(session: SparkSession) extends LeafNode { override def isStreaming: Boolean = true override def toString: String = sourceName diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 2843ab13bde2b..9657b5e26d770 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, CurrentBatchTimestamp, CurrentDate, CurrentTimestamp} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SQLExecution -import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2} +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, StreamingDataSourceV2Relation, WriteToDataSourceV2} import org.apache.spark.sql.execution.streaming.{ContinuousExecutionRelation, StreamingRelationV2, _} import org.apache.spark.sql.sources.v2.DataSourceV2Options import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} @@ -174,7 +174,7 @@ class ContinuousExecution( val loggedOffset = offsets.offsets(0) val realOffset = loggedOffset.map(off => reader.deserializeOffset(off.json)) reader.setOffset(java.util.Optional.ofNullable(realOffset.orNull)) - DataSourceV2Relation(newOutput, reader) + new StreamingDataSourceV2Relation(newOutput, reader) } // Rewire the plan to use the new attributes that were returned by the source. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala index c9aa78a5a2e28..b4b21e7d2052f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousRateStreamSource.scala @@ -32,10 +32,10 @@ import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} -case class ContinuousRateStreamPartitionOffset( +case class RateStreamPartitionOffset( partition: Int, currentValue: Long, currentTimeMs: Long) extends PartitionOffset -class ContinuousRateStreamReader(options: DataSourceV2Options) +class RateStreamContinuousReader(options: DataSourceV2Options) extends ContinuousReader { implicit val defaultFormats: DefaultFormats = DefaultFormats @@ -48,7 +48,7 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { assert(offsets.length == numPartitions) val tuples = offsets.map { - case ContinuousRateStreamPartitionOffset(i, currVal, nextRead) => + case RateStreamPartitionOffset(i, currVal, nextRead) => (i, ValueRunTimeMsPair(currVal, nextRead)) } RateStreamOffset(Map(tuples: _*)) @@ -86,7 +86,7 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) val start = partitionStartMap(i) // Have each partition advance by numPartitions each row, with starting points staggered // by their partition index. 
- RateStreamReadTask( + RateStreamContinuousReadTask( start.value, start.runTimeMs, i, @@ -101,7 +101,7 @@ class ContinuousRateStreamReader(options: DataSourceV2Options) } -case class RateStreamReadTask( +case class RateStreamContinuousReadTask( startValue: Long, startTimeMs: Long, partitionIndex: Int, @@ -109,10 +109,11 @@ case class RateStreamReadTask( rowsPerSecond: Double) extends ReadTask[Row] { override def createDataReader(): DataReader[Row] = - new RateStreamDataReader(startValue, startTimeMs, partitionIndex, increment, rowsPerSecond) + new RateStreamContinuousDataReader( + startValue, startTimeMs, partitionIndex, increment, rowsPerSecond) } -class RateStreamDataReader( +class RateStreamContinuousDataReader( startValue: Long, startTimeMs: Long, partitionIndex: Int, @@ -151,5 +152,5 @@ class RateStreamDataReader( override def close(): Unit = {} override def getOffset(): PartitionOffset = - ContinuousRateStreamPartitionOffset(partitionIndex, currentValue, nextReadTime) + RateStreamPartitionOffset(partitionIndex, currentValue, nextReadTime) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala index 97bada08bcd2b..c0ed12cec25ef 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/RateStreamSourceV2.scala @@ -28,17 +28,38 @@ import org.json4s.jackson.Serialization import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.streaming.{RateStreamOffset, ValueRunTimeMsPair} -import org.apache.spark.sql.sources.v2.DataSourceV2Options +import org.apache.spark.sql.sources.DataSourceRegister +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.MicroBatchReadSupport import org.apache.spark.sql.sources.v2.streaming.reader.{MicroBatchReader, Offset} import org.apache.spark.sql.types.{LongType, StructField, StructType, TimestampType} -import org.apache.spark.util.SystemClock +import org.apache.spark.util.{ManualClock, SystemClock} -class RateStreamV2Reader(options: DataSourceV2Options) +/** + * This is a temporary register as we build out v2 migration. Microbatch read support should + * be implemented in the same register as v1. + */ +class RateSourceProviderV2 extends DataSourceV2 with MicroBatchReadSupport with DataSourceRegister { + override def createMicroBatchReader( + schema: Optional[StructType], + checkpointLocation: String, + options: DataSourceV2Options): MicroBatchReader = { + new RateStreamMicroBatchReader(options) + } + + override def shortName(): String = "ratev2" +} + +class RateStreamMicroBatchReader(options: DataSourceV2Options) extends MicroBatchReader { implicit val defaultFormats: DefaultFormats = DefaultFormats - val clock = new SystemClock + val clock = { + // The option to use a manual clock is provided only for unit testing purposes. 
+ if (options.get("useManualClock").orElse("false").toBoolean) new ManualClock + else new SystemClock + } private val numPartitions = options.get(RateStreamSourceV2.NUM_PARTITIONS).orElse("5").toInt @@ -111,7 +132,7 @@ class RateStreamV2Reader(options: DataSourceV2Options) val packedRows = mutable.ListBuffer[(Long, Long)]() var outVal = startVal + numPartitions - var outTimeMs = startTimeMs + msPerPartitionBetweenRows + var outTimeMs = startTimeMs while (outVal <= endVal) { packedRows.append((outTimeMs, outVal)) outVal += numPartitions diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 2e92beecf2c17..52f2e2639cd86 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.streaming -import java.util.Locale +import java.util.{Locale, Optional} import scala.collection.JavaConverters._ @@ -27,8 +27,9 @@ import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SparkSession import org.apache.spark.sql.execution.command.DDLUtils import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.{StreamingRelation, StreamingRelationV2} +import org.apache.spark.sql.sources.StreamSourceProvider import org.apache.spark.sql.sources.v2.DataSourceV2Options -import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -166,19 +167,31 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo userSpecifiedSchema = userSpecifiedSchema, className = source, options = extraOptions.toMap) + val v1Relation = ds match { + case _: StreamSourceProvider => Some(StreamingRelation(v1DataSource)) + case _ => None + } ds match { + case s: MicroBatchReadSupport => + val tempReader = s.createMicroBatchReader( + Optional.ofNullable(userSpecifiedSchema.orNull), + Utils.createTempDir(namePrefix = s"temporaryReader").getCanonicalPath, + options) + Dataset.ofRows( + sparkSession, + StreamingRelationV2( + s, source, extraOptions.toMap, + tempReader.readSchema().toAttributes, v1Relation)(sparkSession)) case s: ContinuousReadSupport => val tempReader = s.createContinuousReader( - java.util.Optional.ofNullable(userSpecifiedSchema.orNull), + Optional.ofNullable(userSpecifiedSchema.orNull), Utils.createTempDir(namePrefix = s"temporaryReader").getCanonicalPath, options) - // Generate the V1 node to catch errors thrown within generation. - StreamingRelation(v1DataSource) Dataset.ofRows( sparkSession, StreamingRelationV2( s, source, extraOptions.toMap, - tempReader.readSchema().toAttributes, v1DataSource)(sparkSession)) + tempReader.readSchema().toAttributes, v1Relation)(sparkSession)) case _ => // Code path for data source v1. 
Dataset.ofRows(sparkSession, StreamingRelation(v1DataSource)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index b508f4406138f..4b27e0d4ef47b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -29,10 +29,10 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, DataFrame, SparkSession} import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger} import org.apache.spark.sql.execution.streaming.state.StateStoreCoordinatorRef import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousWriteSupport, MicroBatchWriteSupport} import org.apache.spark.util.{Clock, SystemClock, Utils} /** @@ -240,31 +240,35 @@ class StreamingQueryManager private[sql] (sparkSession: SparkSession) extends Lo "is not supported in streaming DataFrames/Datasets and will be disabled.") } - sink match { - case v1Sink: Sink => - new StreamingQueryWrapper(new MicroBatchExecution( + (sink, trigger) match { + case (v2Sink: ContinuousWriteSupport, trigger: ContinuousTrigger) => + UnsupportedOperationChecker.checkForContinuous(analyzedPlan, outputMode) + new StreamingQueryWrapper(new ContinuousExecution( sparkSession, userSpecifiedName.orNull, checkpointLocation, analyzedPlan, - v1Sink, + v2Sink, trigger, triggerClock, outputMode, + extraOptions, deleteCheckpointOnStop)) - case v2Sink: ContinuousWriteSupport => - UnsupportedOperationChecker.checkForContinuous(analyzedPlan, outputMode) - new StreamingQueryWrapper(new ContinuousExecution( + case (_: MicroBatchWriteSupport, _) | (_: Sink, _) => + new StreamingQueryWrapper(new MicroBatchExecution( sparkSession, userSpecifiedName.orNull, checkpointLocation, analyzedPlan, - v2Sink, + sink, trigger, triggerClock, outputMode, extraOptions, deleteCheckpointOnStop)) + case (_: ContinuousWriteSupport, t) if !t.isInstanceOf[ContinuousTrigger] => + throw new AnalysisException( + "Sink only supports continuous writes, but a continuous trigger was not specified.") } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala index e11705a227f48..85085d43061bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/streaming/RateSourceV2Suite.scala @@ -18,20 +18,64 @@ package org.apache.spark.sql.execution.streaming import java.util.Optional +import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.continuous._ -import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamSourceV2, RateStreamV2Reader} +import org.apache.spark.sql.execution.streaming.sources.{RateStreamBatchTask, RateStreamMicroBatchReader, 
RateStreamSourceV2} import org.apache.spark.sql.sources.v2.DataSourceV2Options -import org.apache.spark.sql.sources.v2.streaming.ContinuousReadSupport +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, MicroBatchReadSupport} import org.apache.spark.sql.streaming.StreamTest +import org.apache.spark.util.ManualClock class RateSourceV2Suite extends StreamTest { + import testImplicits._ + + case class AdvanceRateManualClock(seconds: Long) extends AddData { + override def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) = { + assert(query.nonEmpty) + val rateSource = query.get.logicalPlan.collect { + case StreamingExecutionRelation(source: RateStreamMicroBatchReader, _) => source + }.head + rateSource.clock.asInstanceOf[ManualClock].advance(TimeUnit.SECONDS.toMillis(seconds)) + rateSource.setOffsetRange(Optional.empty(), Optional.empty()) + (rateSource, rateSource.getEndOffset()) + } + } + + test("microbatch in registry") { + DataSource.lookupDataSource("ratev2", spark.sqlContext.conf).newInstance() match { + case ds: MicroBatchReadSupport => + val reader = ds.createMicroBatchReader(Optional.empty(), "", DataSourceV2Options.empty()) + assert(reader.isInstanceOf[RateStreamMicroBatchReader]) + case _ => + throw new IllegalStateException("Could not find v2 read support for rate") + } + } + + test("basic microbatch execution") { + val input = spark.readStream + .format("rateV2") + .option("numPartitions", "1") + .option("rowsPerSecond", "10") + .option("useManualClock", "true") + .load() + testStream(input, useV2Sink = true)( + AdvanceRateManualClock(seconds = 1), + CheckLastBatch((0 until 10).map(v => new java.sql.Timestamp(v * 100L) -> v): _*), + StopStream, + StartStream(), + // Advance 2 seconds because creating a new RateSource will also create a new ManualClock + AdvanceRateManualClock(seconds = 2), + CheckLastBatch((10 until 20).map(v => new java.sql.Timestamp(v * 100L) -> v): _*) + ) + } + test("microbatch - numPartitions propagated") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) reader.setOffsetRange(Optional.empty(), Optional.empty()) val tasks = reader.createReadTasks() @@ -39,7 +83,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - set offset") { - val reader = new RateStreamV2Reader(DataSourceV2Options.empty()) + val reader = new RateStreamMicroBatchReader(DataSourceV2Options.empty()) val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) val endOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 2000)))) reader.setOffsetRange(Optional.of(startOffset), Optional.of(endOffset)) @@ -48,7 +92,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - infer offsets") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "100").asJava)) reader.clock.waitTillTime(reader.clock.getTimeMillis() + 100) reader.setOffsetRange(Optional.empty(), Optional.empty()) @@ -69,7 +113,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - predetermined batch size") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "1", "rowsPerSecond" -> "20").asJava)) val startOffset = RateStreamOffset(Map((0, ValueRunTimeMsPair(0, 1000)))) val endOffset = RateStreamOffset(Map((0, 
ValueRunTimeMsPair(20, 2000)))) @@ -80,7 +124,7 @@ class RateSourceV2Suite extends StreamTest { } test("microbatch - data read") { - val reader = new RateStreamV2Reader( + val reader = new RateStreamMicroBatchReader( new DataSourceV2Options(Map("numPartitions" -> "11", "rowsPerSecond" -> "33").asJava)) val startOffset = RateStreamSourceV2.createInitialOffset(11, reader.creationTimeMs) val endOffset = RateStreamOffset(startOffset.partitionToValueAndRunTimeMs.toSeq.map { @@ -107,14 +151,14 @@ class RateSourceV2Suite extends StreamTest { DataSource.lookupDataSource("rate", spark.sqlContext.conf).newInstance() match { case ds: ContinuousReadSupport => val reader = ds.createContinuousReader(Optional.empty(), "", DataSourceV2Options.empty()) - assert(reader.isInstanceOf[ContinuousRateStreamReader]) + assert(reader.isInstanceOf[RateStreamContinuousReader]) case _ => throw new IllegalStateException("Could not find v2 read support for rate") } } test("continuous data") { - val reader = new ContinuousRateStreamReader( + val reader = new RateStreamContinuousReader( new DataSourceV2Options(Map("numPartitions" -> "2", "rowsPerSecond" -> "20").asJava)) reader.setOffset(Optional.empty()) val tasks = reader.createReadTasks() @@ -122,17 +166,17 @@ class RateSourceV2Suite extends StreamTest { val data = scala.collection.mutable.ListBuffer[Row]() tasks.asScala.foreach { - case t: RateStreamReadTask => + case t: RateStreamContinuousReadTask => val startTimeMs = reader.getStartOffset() .asInstanceOf[RateStreamOffset] .partitionToValueAndRunTimeMs(t.partitionIndex) .runTimeMs - val r = t.createDataReader().asInstanceOf[RateStreamDataReader] + val r = t.createDataReader().asInstanceOf[RateStreamContinuousDataReader] for (rowIndex <- 0 to 9) { r.next() data.append(r.get()) assert(r.getOffset() == - ContinuousRateStreamPartitionOffset( + RateStreamPartitionOffset( t.partitionIndex, t.partitionIndex + rowIndex * 2, startTimeMs + (rowIndex + 1) * 100)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 4b7f0fbe97d4e..d46461fa9bf6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -105,7 +105,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be * the active query, and then return the source object the data was added, as well as the * offset of added data. */ - def addData(query: Option[StreamExecution]): (Source, Offset) + def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) } /** A trait that can be extended when testing a source. 
*/ diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala index eda0d8ad48313..9562c10feafe9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/continuous/ContinuousSuite.scala @@ -61,7 +61,7 @@ class ContinuousSuiteBase extends StreamTest { case s: ContinuousExecution => assert(numTriggers >= 2, "must wait for at least 2 triggers to ensure query is initialized") val reader = s.lastExecution.executedPlan.collectFirst { - case DataSourceV2ScanExec(_, r: ContinuousRateStreamReader) => r + case DataSourceV2ScanExec(_, r: RateStreamContinuousReader) => r }.get val deltaMs = numTriggers * 1000 + 300 From 68ce792b5857f0291154f524ac651036db868bb9 Mon Sep 17 00:00:00 2001 From: xubo245 <601450868@qq.com> Date: Tue, 9 Jan 2018 10:15:01 +0800 Subject: [PATCH 289/356] [SPARK-22972] Couldn't find corresponding Hive SerDe for data source provider org.apache.spark.sql.hive.orc ## What changes were proposed in this pull request? Fix the warning: Couldn't find corresponding Hive SerDe for data source provider org.apache.spark.sql.hive.orc. ## How was this patch tested? test("SPARK-22972: hive orc source") assert(HiveSerDe.sourceToSerDe("org.apache.spark.sql.hive.orc") .equals(HiveSerDe.sourceToSerDe("orc"))) Author: xubo245 <601450868@qq.com> Closes #20165 from xubo245/HiveSerDe. --- .../apache/spark/sql/internal/HiveSerDe.scala | 1 + .../sql/hive/orc/HiveOrcSourceSuite.scala | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala index b9515ec7bca2a..dac463641cfab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/HiveSerDe.scala @@ -73,6 +73,7 @@ object HiveSerDe { val key = source.toLowerCase(Locale.ROOT) match { case s if s.startsWith("org.apache.spark.sql.parquet") => "parquet" case s if s.startsWith("org.apache.spark.sql.orc") => "orc" + case s if s.startsWith("org.apache.spark.sql.hive.orc") => "orc" case s if s.equals("orcfile") => "orc" case s if s.equals("parquetfile") => "parquet" case s if s.equals("avrofile") => "avro" diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala index 17b7d8cfe127e..d556a030e2186 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/HiveOrcSourceSuite.scala @@ -20,8 +20,10 @@ package org.apache.spark.sql.hive.orc import java.io.File import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.orc.OrcSuite import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.internal.HiveSerDe import org.apache.spark.util.Utils class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { @@ -62,6 +64,33 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton { """.stripMargin) } + test("SPARK-22972: hive orc source") { + val tableName = "normal_orc_as_source_hive" + withTable(tableName) { + sql( + s""" + |CREATE TABLE $tableName + |USING 
org.apache.spark.sql.hive.orc + |OPTIONS ( + | PATH '${new File(orcTableAsDir.getAbsolutePath).toURI}' + |) + """.stripMargin) + + val tableMetadata = spark.sessionState.catalog.getTableMetadata( + TableIdentifier(tableName)) + assert(tableMetadata.storage.inputFormat == + Option("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")) + assert(tableMetadata.storage.outputFormat == + Option("org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat")) + assert(tableMetadata.storage.serde == + Option("org.apache.hadoop.hive.ql.io.orc.OrcSerde")) + assert(HiveSerDe.sourceToSerDe("org.apache.spark.sql.hive.orc") + .equals(HiveSerDe.sourceToSerDe("orc"))) + assert(HiveSerDe.sourceToSerDe("org.apache.spark.sql.orc") + .equals(HiveSerDe.sourceToSerDe("orc"))) + } + } + test("SPARK-19459/SPARK-18220: read char/varchar column written by Hive") { val location = Utils.createTempDir() val uri = location.toURI From 849043ce1d28a976659278d29368da0799329db8 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Tue, 9 Jan 2018 10:44:21 +0800 Subject: [PATCH 290/356] [SPARK-22990][CORE] Fix method isFairScheduler in JobsTab and StagesTab ## What changes were proposed in this pull request? In current implementation, the function `isFairScheduler` is always false, since it is comparing String with `SchedulingMode` Author: Wang Gengliang Closes #20186 from gengliangwang/isFairScheduler. --- .../src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala | 8 ++++---- .../main/scala/org/apache/spark/ui/jobs/StagesTab.scala | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala index 99eab1b2a27d8..ff1b75e5c5065 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobsTab.scala @@ -34,10 +34,10 @@ private[ui] class JobsTab(parent: SparkUI, store: AppStatusStore) val killEnabled = parent.killEnabled def isFairScheduler: Boolean = { - store.environmentInfo().sparkProperties.toMap - .get("spark.scheduler.mode") - .map { mode => mode == SchedulingMode.FAIR } - .getOrElse(false) + store + .environmentInfo() + .sparkProperties + .contains(("spark.scheduler.mode", SchedulingMode.FAIR.toString)) } def getSparkUser: String = parent.getSparkUser diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala index be05a963f0e68..10b032084ce4f 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagesTab.scala @@ -37,10 +37,10 @@ private[ui] class StagesTab(val parent: SparkUI, val store: AppStatusStore) attachPage(new PoolPage(this)) def isFairScheduler: Boolean = { - store.environmentInfo().sparkProperties.toMap - .get("spark.scheduler.mode") - .map { mode => mode == SchedulingMode.FAIR } - .getOrElse(false) + store + .environmentInfo() + .sparkProperties + .contains(("spark.scheduler.mode", SchedulingMode.FAIR.toString)) } def handleKillRequest(request: HttpServletRequest): Unit = { From f20131dd35939734fe16b0005a086aa72400893b Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 9 Jan 2018 11:49:10 +0800 Subject: [PATCH 291/356] [SPARK-22984] Fix incorrect bitmap copying and offset adjustment in GenerateUnsafeRowJoiner ## What changes were proposed in this pull request? This PR fixes a longstanding correctness bug in `GenerateUnsafeRowJoiner`. 
This class was introduced in https://github.com/apache/spark/pull/7821 (July 2015 / Spark 1.5.0+) and is used to combine pairs of UnsafeRows in TungstenAggregationIterator, CartesianProductExec, and AppendColumns. ### Bugs fixed by this patch 1. **Incorrect combining of null-tracking bitmaps**: when concatenating two UnsafeRows, the implementation is supposed to "Concatenate the two bitsets together into a single one, taking padding into account". If one row has no columns then it has a bitset size of 0, but the code was incorrectly assuming that if the left row had a non-zero number of fields then the right row would also have at least one field, so it was copying invalid bytes and treating them as part of the bitset. I'm not sure whether this bug was also present in the original implementation or whether it was introduced in https://github.com/apache/spark/pull/7892 (which fixed another bug in this code). 2. **Incorrect updating of data offsets for null variable-length fields**: after updating the bitsets and copying fixed-length and variable-length data, we need to perform adjustments to the offsets pointing to the start of variable-length fields' data. The existing code was _conditionally_ adding a fixed offset to correct for the new length of the combined row, but it is unsafe to do this if the variable-length field has a null value: we always represent nulls by storing `0` in the fixed-length slot, but this code was incorrectly incrementing those values. This bug has been present since the original version of `GenerateUnsafeRowJoiner`. ### Why this bug remained latent for so long The PR which introduced `GenerateUnsafeRowJoiner` features several randomized tests, including tests of the cases where one side of the join has no fields and where string-valued fields are null. However, the existing assertions were too weak to uncover this bug: - If a null field has a non-zero value in its fixed-length data slot then this will not cause problems for field accesses because the null-tracking bitmap should still be correct and we will not try to use the incorrect offset for anything. - If the null tracking bitmap is corrupted by joining against a row with no fields then the corruption occurs in field numbers past the actual field numbers contained in the row. Thus valid `isNullAt()` calls will not read the incorrectly-set bits. The existing `GenerateUnsafeRowJoinerSuite` tests only exercised `.get()` and `isNullAt()`, but didn't actually check the UnsafeRows for bit-for-bit equality, preventing these bugs from failing assertions. It turns out that there was even a [GenerateUnsafeRowJoinerBitsetSuite](https://github.com/apache/spark/blob/03377d2522776267a07b7d6ae9bddf79a4e0f516/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerBitsetSuite.scala) but it looks like it also didn't catch this problem because it only tested the bitsets in an end-to-end fashion by accessing them through the `UnsafeRow` interface instead of actually comparing the bitsets' bytes. ### Impact of these bugs - This bug will cause `equals()` and `hashCode()` to be incorrect for these rows, which will be problematic in case `GenerateUnsafeRowJoiner`'s results are used as join or grouping keys. - Chained / repeated invocations of `GenerateUnsafeRowJoiner` may result in reads from invalid null bitmap positions causing fields to incorrectly become NULL (see the end-to-end example below).
- It looks like this generally only happens in `CartesianProductExec`, which our query optimizer often avoids executing (usually we try to plan a `BroadcastNestedLoopJoin` instead). ### End-to-end test case demonstrating the problem The following query demonstrates how this bug may result in incorrect query results: ```sql set spark.sql.autoBroadcastJoinThreshold=-1; -- Needed to trigger CartesianProductExec create table a as select * from values 1; create table b as select * from values 2; SELECT t3.col1, t1.col1 FROM a t1 CROSS JOIN b t2 CROSS JOIN b t3 ``` This should return `(2, 1)` but instead was returning `(null, 1)`. Column pruning ends up trimming off all columns from `t2`, so when `t2` joins with another table this triggers the bitmap-copying bug. This incorrect bitmap is subsequently copied again when performing the final join, causing the final output to have an incorrectly-set null bit for the first field. ## How was this patch tested? Strengthened the assertions in existing tests in GenerateUnsafeRowJoinerSuite. Also verified that the end-to-end test case which uncovered this now passes. Author: Josh Rosen Closes #20181 from JoshRosen/SPARK-22984-fix-generate-unsaferow-joiner-bitmap-bugs. --- .../codegen/GenerateUnsafeRowJoiner.scala | 52 +++++++++- .../GenerateUnsafeRowJoinerSuite.scala | 95 ++++++++++++++++++- 2 files changed, 138 insertions(+), 9 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala index be5f5a73b5d47..febf7b0c96c2a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoiner.scala @@ -70,7 +70,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U // --------------------- copy bitset from row 1 and row 2 --------------------------- // val copyBitset = Seq.tabulate(outputBitsetWords) { i => - val bits = if (bitset1Remainder > 0) { + val bits = if (bitset1Remainder > 0 && bitset2Words != 0) { if (i < bitset1Words - 1) { s"$getLong(obj1, offset1 + ${i * 8})" } else if (i == bitset1Words - 1) { @@ -152,7 +152,9 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U } else { // Number of bytes to increase for the offset. Note that since in UnsafeRow we store the // offset in the upper 32 bit of the words, we can just shift the offset to the left by - // 32 and increment that amount in place. + // 32 and increment that amount in place. However, we need to handle the important special + // case of a null field, in which case the offset should be zero and should not have a + // shift added to it. val shift = if (i < schema1.size) { s"${(outputBitsetWords - bitset1Words + schema2.size) * 8}L" @@ -160,14 +162,55 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U s"(${(outputBitsetWords - bitset2Words + schema1.size) * 8}L + numBytesVariableRow1)" } val cursor = offset + outputBitsetWords * 8 + i * 8 - s"$putLong(buf, $cursor, $getLong(buf, $cursor) + ($shift << 32));\n" + // UnsafeRow is a little underspecified, so in what follows we'll treat UnsafeRowWriter's + // output as a de-facto specification for the internal layout of data. 
+ // + // Null-valued fields will always have a data offset of 0 because + // UnsafeRowWriter.setNullAt(ordinal) sets the null bit and stores 0 to in field's + // position in the fixed-length section of the row. As a result, we must NOT add + // `shift` to the offset for null fields. + // + // We could perform a null-check here by inspecting the null-tracking bitmap, but doing + // so could be expensive and will add significant bloat to the generated code. Instead, + // we'll rely on the invariant "stored offset == 0 for variable-length data type implies + // that the field's value is null." + // + // To establish that this invariant holds, we'll prove that a non-null field can never + // have a stored offset of 0. There are two cases to consider: + // + // 1. The non-null field's data is of non-zero length: reading this field's value + // must read data from the variable-length section of the row, so the stored offset + // will actually be used in address calculation and must be correct. The offsets + // count bytes from the start of the UnsafeRow so these offsets will always be + // non-zero because the storage of the offsets themselves takes up space at the + // start of the row. + // 2. The non-null field's data is of zero length (i.e. its data is empty). In this + // case, we have to worry about the possibility that an arbitrary offset value was + // stored because we never actually read any bytes using this offset and therefore + // would not crash if it was incorrect. The variable-sized data writing paths in + // UnsafeRowWriter unconditionally calls setOffsetAndSize(ordinal, numBytes) with + // no special handling for the case where `numBytes == 0`. Internally, + // setOffsetAndSize computes the offset without taking the size into account. Thus + // the stored offset is the same non-zero offset that would be used if the field's + // dataSize was non-zero (and in (1) above we've shown that case behaves as we + // expect). + // + // Thus it is safe to perform `existingOffset != 0` checks here in the place of + // more expensive null-bit checks. 
+ s""" + |existingOffset = $getLong(buf, $cursor); + |if (existingOffset != 0) { + | $putLong(buf, $cursor, existingOffset + ($shift << 32)); + |} + """.stripMargin } } val updateOffsets = ctx.splitExpressions( expressions = updateOffset, funcName = "copyBitsetFunc", - arguments = ("long", "numBytesVariableRow1") :: Nil) + arguments = ("long", "numBytesVariableRow1") :: Nil, + makeSplitFunction = (s: String) => "long existingOffset;\n" + s) // ------------------------ Finally, put everything together --------------------------- // val codeBody = s""" @@ -200,6 +243,7 @@ object GenerateUnsafeRowJoiner extends CodeGenerator[(StructType, StructType), U | $copyFixedLengthRow2 | $copyVariableLengthRow1 | $copyVariableLengthRow2 + | long existingOffset; | $updateOffsets | | out.pointTo(buf, sizeInBytes); diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala index f203f25ad10d4..75c6beeb32150 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeRowJoinerSuite.scala @@ -22,8 +22,10 @@ import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.catalyst.expressions.UnsafeProjection +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.expressions.{JoinedRow, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String /** * Test suite for [[GenerateUnsafeRowJoiner]]. 
@@ -45,6 +47,32 @@ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { testConcat(64, 64, fixed) } + test("rows with all empty strings") { + val schema = StructType(Seq( + StructField("f1", StringType), StructField("f2", StringType))) + val row: UnsafeRow = UnsafeProjection.create(schema).apply( + InternalRow(UTF8String.EMPTY_UTF8, UTF8String.EMPTY_UTF8)) + testConcat(schema, row, schema, row) + } + + test("rows with all empty int arrays") { + val schema = StructType(Seq( + StructField("f1", ArrayType(IntegerType)), StructField("f2", ArrayType(IntegerType)))) + val emptyIntArray = + ExpressionEncoder[Array[Int]]().resolveAndBind().toRow(Array.emptyIntArray).getArray(0) + val row: UnsafeRow = UnsafeProjection.create(schema).apply( + InternalRow(emptyIntArray, emptyIntArray)) + testConcat(schema, row, schema, row) + } + + test("alternating empty and non-empty strings") { + val schema = StructType(Seq( + StructField("f1", StringType), StructField("f2", StringType))) + val row: UnsafeRow = UnsafeProjection.create(schema).apply( + InternalRow(UTF8String.EMPTY_UTF8, UTF8String.fromString("foo"))) + testConcat(schema, row, schema, row) + } + test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) @@ -94,27 +122,84 @@ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) + testConcat(schema1, row1, schema2, row2) + } + + private def testConcat( + schema1: StructType, + row1: UnsafeRow, + schema2: StructType, + row2: UnsafeRow) { // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) - val output = concater.join(row1, row2) + val output: UnsafeRow = concater.join(row1, row2) + + // We'll also compare to an UnsafeRow produced with JoinedRow + UnsafeProjection. This ensures + // that unused space in the row (e.g. leftover bits in the null-tracking bitmap) is written + // correctly. + val expectedOutput: UnsafeRow = { + val joinedRowProjection = UnsafeProjection.create(mergedSchema) + val joined = new JoinedRow() + joinedRowProjection.apply(joined.apply(row1, row2)) + } // Test everything equals ... 
for (i <- mergedSchema.indices) { + val dataType = mergedSchema(i).dataType if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { - assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) + assert(output.get(i, dataType) === row1.get(i, dataType)) + assert(output.get(i, dataType) === expectedOutput.get(i, dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { - assert(output.get(i, mergedSchema(i).dataType) === - row2.get(i - schema1.size, mergedSchema(i).dataType)) + assert(output.get(i, dataType) === row2.get(i - schema1.size, dataType)) + assert(output.get(i, dataType) === expectedOutput.get(i, dataType)) } } } + + + assert( + expectedOutput.getSizeInBytes == output.getSizeInBytes, + "output isn't same size in bytes as slow path") + + // Compare the UnsafeRows byte-by-byte so that we can print more useful debug information in + // case this assertion fails: + val actualBytes = output.getBaseObject.asInstanceOf[Array[Byte]] + .take(output.getSizeInBytes) + val expectedBytes = expectedOutput.getBaseObject.asInstanceOf[Array[Byte]] + .take(expectedOutput.getSizeInBytes) + + val bitsetWidth = UnsafeRow.calculateBitSetWidthInBytes(expectedOutput.numFields()) + val actualBitset = actualBytes.take(bitsetWidth) + val expectedBitset = expectedBytes.take(bitsetWidth) + assert(actualBitset === expectedBitset, "bitsets were not equal") + + val fixedLengthSize = expectedOutput.numFields() * 8 + val actualFixedLength = actualBytes.slice(bitsetWidth, bitsetWidth + fixedLengthSize) + val expectedFixedLength = expectedBytes.slice(bitsetWidth, bitsetWidth + fixedLengthSize) + if (actualFixedLength !== expectedFixedLength) { + actualFixedLength.grouped(8) + .zip(expectedFixedLength.grouped(8)) + .zip(mergedSchema.fields.toIterator) + .foreach { + case ((actual, expected), field) => + assert(actual === expected, s"Fixed length sections are not equal for field $field") + } + fail("Fixed length sections were not equal") + } + + val variableLengthStart = bitsetWidth + fixedLengthSize + val actualVariableLength = actualBytes.drop(variableLengthStart) + val expectedVariableLength = expectedBytes.drop(variableLengthStart) + assert(actualVariableLength === expectedVariableLength, "fixed length sections were not equal") + + assert(output.hashCode() == expectedOutput.hashCode(), "hash codes were not equal") } } From 8486ad419d8f1779e277ec71c39e1516673a83ab Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 8 Jan 2018 21:58:26 -0800 Subject: [PATCH 292/356] [SPARK-21292][DOCS] refreshtable example ## What changes were proposed in this pull request? doc update Author: Felix Cheung Closes #20198 from felixcheung/rrefreshdoc. --- docs/sql-programming-guide.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 3ccaaf4d5b1fa..72f79d6909ecc 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -915,6 +915,14 @@ spark.catalog.refreshTable("my_table") +
    + +{% highlight r %} +refreshTable("my_table") +{% endhighlight %} + +
    +
    {% highlight sql %} @@ -1498,10 +1506,10 @@ that these options will be deprecated in future release as more optimizations ar ## Broadcast Hint for SQL Queries The `BROADCAST` hint guides Spark to broadcast each specified table when joining them with another table or view. -When Spark deciding the join methods, the broadcast hash join (i.e., BHJ) is preferred, +When Spark deciding the join methods, the broadcast hash join (i.e., BHJ) is preferred, even if the statistics is above the configuration `spark.sql.autoBroadcastJoinThreshold`. When both sides of a join are specified, Spark broadcasts the one having the lower statistics. -Note Spark does not guarantee BHJ is always chosen, since not all cases (e.g. full outer join) +Note Spark does not guarantee BHJ is always chosen, since not all cases (e.g. full outer join) support BHJ. When the broadcast nested loop join is selected, we still respect the hint.
    @@ -1780,7 +1788,7 @@ options. Note that, for DecimalType(38,0)*, the table above intentionally does not cover all other combinations of scales and precisions because currently we only infer decimal type like `BigInteger`/`BigInt`. For example, 1.1 is inferred as double type. - In PySpark, now we need Pandas 0.19.2 or upper if you want to use Pandas related functionalities, such as `toPandas`, `createDataFrame` from Pandas DataFrame, etc. - In PySpark, the behavior of timestamp values for Pandas related functionalities was changed to respect session timezone. If you want to use the old behavior, you need to set a configuration `spark.sql.execution.pandas.respectSessionTimeZone` to `False`. See [SPARK-22395](https://issues.apache.org/jira/browse/SPARK-22395) for details. - + - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer to broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Broadcast Hint](#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. @@ -2167,7 +2175,7 @@ Not all the APIs of the Hive UDF/UDTF/UDAF are supported by Spark SQL. Below are Spark SQL currently does not support the reuse of aggregation. * `getWindowingEvaluator` (`GenericUDAFEvaluator`) is a function to optimize aggregation by evaluating an aggregate over a fixed window. - + ### Incompatible Hive UDF Below are the scenarios in which Hive and Spark generate different results: From 02214b094390e913f52e71d55c9bb8a81c9e7ef9 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Mon, 8 Jan 2018 22:08:19 -0800 Subject: [PATCH 293/356] [SPARK-21293][SPARKR][DOCS] structured streaming doc update ## What changes were proposed in this pull request? doc update Author: Felix Cheung Closes #20197 from felixcheung/rwadoc. --- R/pkg/vignettes/sparkr-vignettes.Rmd | 2 +- docs/sparkr.md | 2 +- .../structured-streaming-programming-guide.md | 32 +++++++++++++++++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index 2e662424b25f2..feca617c2554c 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -1042,7 +1042,7 @@ unlink(modelPath) ## Structured Streaming -SparkR supports the Structured Streaming API (experimental). +SparkR supports the Structured Streaming API. You can check the Structured Streaming Programming Guide for [an introduction](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#programming-model) to its programming model and basic concepts. diff --git a/docs/sparkr.md b/docs/sparkr.md index 997ea60fb6cf0..6685b585a393a 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -596,7 +596,7 @@ The following example shows how to save/load a MLlib model by SparkR. # Structured Streaming -SparkR supports the Structured Streaming API (experimental). Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. For more information see the R API on the [Structured Streaming Programming Guide](structured-streaming-programming-guide.html) +SparkR supports the Structured Streaming API. 
Structured Streaming is a scalable and fault-tolerant stream processing engine built on the Spark SQL engine. For more information see the R API on the [Structured Streaming Programming Guide](structured-streaming-programming-guide.html) # R Function Name Conflicts diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 31fcfabb9cacc..de13e281916db 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -827,8 +827,8 @@ df.isStreaming() {% endhighlight %}
    -{% highlight bash %} -Not available. +{% highlight r %} +isStreaming(df) {% endhighlight %}
    @@ -885,6 +885,19 @@ windowedCounts = words.groupBy( ).count() {% endhighlight %} + +
    +{% highlight r %} +words <- ... # streaming DataFrame of schema { timestamp: Timestamp, word: String } + +# Group the data by window and word and compute the count of each group +windowedCounts <- count( + groupBy( + words, + window(words$timestamp, "10 minutes", "5 minutes"), + words$word)) +{% endhighlight %} +
    @@ -959,6 +972,21 @@ windowedCounts = words \ .count() {% endhighlight %} + +
    +{% highlight r %} +words <- ... # streaming DataFrame of schema { timestamp: Timestamp, word: String } + +# Group the data by window and word and compute the count of each group + +words <- withWatermark(words, "timestamp", "10 minutes") +windowedCounts <- count( + groupBy( + words, + window(words$timestamp, "10 minutes", "5 minutes"), + words$word)) +{% endhighlight %} +
    From 0959aa581a399279be3f94214bcdffc6a1b6d60a Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Tue, 9 Jan 2018 16:31:20 +0800 Subject: [PATCH 294/356] [SPARK-23000] Fix Flaky test suite DataSourceWithHiveMetastoreCatalogSuite in Spark 2.3 ## What changes were proposed in this pull request? https://amplab.cs.berkeley.edu/jenkins/job/spark-branch-2.3-test-sbt-hadoop-2.6/ The test suite DataSourceWithHiveMetastoreCatalogSuite of Branch 2.3 always failed in hadoop 2.6 The table `t` exists in `default`, but `runSQLHive` reported the table does not exist. Obviously, Hive client's default database is different. The fix is to clean the environment and use `DEFAULT` as the database. ``` org.apache.spark.sql.execution.QueryExecutionException: FAILED: SemanticException [Error 10001]: Line 1:14 Table not found 't' Stacktrace sbt.ForkMain$ForkError: org.apache.spark.sql.execution.QueryExecutionException: FAILED: SemanticException [Error 10001]: Line 1:14 Table not found 't' at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$runHive$1.apply(HiveClientImpl.scala:699) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$runHive$1.apply(HiveClientImpl.scala:683) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:272) at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:210) at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:209) at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:255) at org.apache.spark.sql.hive.client.HiveClientImpl.runHive(HiveClientImpl.scala:683) at org.apache.spark.sql.hive.client.HiveClientImpl.runSqlHive(HiveClientImpl.scala:673) ``` ## How was this patch tested? N/A Author: gatorsmile Closes #20196 from gatorsmile/testFix. 
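For illustration only (this snippet is not part of the patch), a tiny self-contained sketch of the try/finally guard the fix relies on: whatever happens during cleanup, the session is switched back to the `default` database so that later unqualified statements such as `DESC t` resolve predictably. `runSql` and `dropEverything` below are stand-ins, not Spark APIs.

```scala
// Minimal sketch of the guard pattern behind the fix (hypothetical helper names).
object ResetWithDefaultDb {
  def reset(runSql: String => Unit, dropEverything: () => Unit): Unit = {
    try {
      dropEverything()      // drop test tables/databases (stand-in for the real cleanup)
    } finally {
      runSql("USE default") // always restore the Hive client's current database
    }
  }

  def main(args: Array[String]): Unit = {
    val executed = scala.collection.mutable.ArrayBuffer.empty[String]
    // Even when cleanup fails, "USE default" is still issued.
    try reset(s => executed += s, () => sys.error("cleanup failed")) catch { case _: RuntimeException => }
    assert(executed.toSeq == Seq("USE default"))
  }
}
```
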
--- .../org/apache/spark/sql/hive/client/HiveClientImpl.scala | 6 +++++- .../apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 7b7f4e0f10210..102f40bacc985 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -823,7 +823,8 @@ private[hive] class HiveClientImpl( } def reset(): Unit = withHiveState { - client.getAllTables("default").asScala.foreach { t => + try { + client.getAllTables("default").asScala.foreach { t => logDebug(s"Deleting table $t") val table = client.getTable("default", t) client.getIndexes("default", t, 255).asScala.foreach { index => @@ -837,6 +838,9 @@ private[hive] class HiveClientImpl( logDebug(s"Dropping Database: $db") client.dropDatabase(db, true, false, true) } + } finally { + runSqlHive("USE default") + } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 18137e7ea1d63..cf4ce83124d88 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -146,6 +146,11 @@ class DataSourceWithHiveMetastoreCatalogSuite 'id cast StringType as 'd2 ).coalesce(1) + override def beforeAll(): Unit = { + super.beforeAll() + sparkSession.metadataHive.reset() + } + Seq( "parquet" -> (( "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", From 6a4206ff04746481d7c8e307dfd0d31ff1402555 Mon Sep 17 00:00:00 2001 From: Yinan Li Date: Tue, 9 Jan 2018 01:32:48 -0800 Subject: [PATCH 295/356] [SPARK-22998][K8S] Set missing value for SPARK_MOUNTED_CLASSPATH in the executors ## What changes were proposed in this pull request? The environment variable `SPARK_MOUNTED_CLASSPATH` is referenced in the executor's Dockerfile, where its value is added to the classpath of the executor. However, the scheduler backend code missed setting it when creating the executor pods. This PR fixes it. ## How was this patch tested? Unit tested. vanzin Can you help take a look? Thanks! foxish Author: Yinan Li Closes #20193 from liyinan926/master. 
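For illustration only (not part of the patch), a small sketch of what the missing setting amounts to, using the same fabric8 `EnvVarBuilder` that the executor pod factory already uses. The directory value is a stand-in for whatever `JARS_DOWNLOAD_LOCATION` resolves to in a real deployment.

```scala
import io.fabric8.kubernetes.api.model.{EnvVar, EnvVarBuilder}

object MountedClasspathEnvVar {
  def main(args: Array[String]): Unit = {
    // Illustrative value only; in the patch this comes from the JARS_DOWNLOAD_LOCATION conf.
    val executorJarsDownloadDir = "/var/spark-data/spark-jars"

    // The executor Dockerfile appends $SPARK_MOUNTED_CLASSPATH to the executor's classpath,
    // so the pod spec must define it and point it at the downloaded application jars.
    val mountedClasspath: EnvVar = new EnvVarBuilder()
      .withName("SPARK_MOUNTED_CLASSPATH")
      .withValue(s"$executorJarsDownloadDir/*")
      .build()

    println(s"${mountedClasspath.getName}=${mountedClasspath.getValue}")
  }
}
```
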
--- .../spark/scheduler/cluster/k8s/ExecutorPodFactory.scala | 5 ++++- .../scheduler/cluster/k8s/ExecutorPodFactorySuite.scala | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index 066d7e9f70ca5..bcacb3934d36a 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -94,6 +94,8 @@ private[spark] class ExecutorPodFactory( private val executorCores = sparkConf.getDouble("spark.executor.cores", 1) private val executorLimitCores = sparkConf.get(KUBERNETES_EXECUTOR_LIMIT_CORES) + private val executorJarsDownloadDir = sparkConf.get(JARS_DOWNLOAD_LOCATION) + /** * Configure and construct an executor pod with the given parameters. */ @@ -145,7 +147,8 @@ private[spark] class ExecutorPodFactory( (ENV_EXECUTOR_CORES, math.ceil(executorCores).toInt.toString), (ENV_EXECUTOR_MEMORY, executorMemoryString), (ENV_APPLICATION_ID, applicationId), - (ENV_EXECUTOR_ID, executorId)) ++ executorEnvs) + (ENV_EXECUTOR_ID, executorId), + (ENV_MOUNTED_CLASSPATH, s"$executorJarsDownloadDir/*")) ++ executorEnvs) .map(env => new EnvVarBuilder() .withName(env._1) .withValue(env._2) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala index 884da8aabd880..7cfbe54c95390 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -197,7 +197,8 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef ENV_EXECUTOR_CORES -> "1", ENV_EXECUTOR_MEMORY -> "1g", ENV_APPLICATION_ID -> "dummy", - ENV_EXECUTOR_POD_IP -> null) ++ additionalEnvVars + ENV_EXECUTOR_POD_IP -> null, + ENV_MOUNTED_CLASSPATH -> "/var/spark-data/spark-jars/*") ++ additionalEnvVars assert(executor.getSpec.getContainers.size() === 1) assert(executor.getSpec.getContainers.get(0).getEnv.size() === defaultEnvs.size) From f44ba910f58083458e1133502e193a9d6f2bf766 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 9 Jan 2018 21:48:14 +0800 Subject: [PATCH 296/356] [SPARK-16060][SQL] Support Vectorized ORC Reader ## What changes were proposed in this pull request? This PR adds an ORC columnar-batch reader to native `OrcFileFormat`. Since both Spark `ColumnarBatch` and ORC `RowBatch` are used together, it is faster than the current Spark implementation. This replaces the prior PR, #17924. Also, this PR adds `OrcReadBenchmark` to show the performance improvement. ## How was this patch tested? Pass the existing test cases. Author: Dongjoon Hyun Closes #19943 from dongjoon-hyun/SPARK-16060. 
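For illustration only (not part of the patch), a minimal end-to-end sketch of how the new reader path would be exercised once this change is in. The output path is made up, and the settings shown are assumptions drawn from the diff below: `spark.sql.orc.enableVectorizedReader` is the new flag added to `SQLConf` (default `true`), and `spark.sql.orc.impl` is assumed to be the existing switch between the `hive` and `native` implementations.

```scala
import org.apache.spark.sql.SparkSession

object OrcVectorizedReadExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("orc-vectorized-read")
      .config("spark.sql.orc.impl", "native")                 // use the native OrcFileFormat
      .config("spark.sql.orc.enableVectorizedReader", "true") // default after this patch
      .getOrCreate()

    val path = "/tmp/orc-vectorized-example" // illustrative location

    spark.range(0, 1000000).selectExpr("id", "CAST(id AS STRING) AS s")
      .write.mode("overwrite").orc(path)

    // For simple atomic-typed schemas like this one, the scan can go through
    // OrcColumnarBatchReader and feed ColumnarBatches to whole-stage codegen
    // instead of materializing one row at a time.
    println(spark.read.orc(path).where("id % 2 = 0").count())

    spark.stop()
  }
}
```
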
--- .../apache/spark/sql/internal/SQLConf.scala | 7 + .../orc/OrcColumnarBatchReader.java | 523 ++++++++++++++++++ .../datasources/orc/OrcFileFormat.scala | 75 ++- .../execution/datasources/orc/OrcUtils.scala | 7 +- .../spark/sql/hive/orc/OrcReadBenchmark.scala | 435 +++++++++++++++ 5 files changed, 1022 insertions(+), 25 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java create mode 100644 sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 5c61f10bb71ad..74949db883f7a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -386,6 +386,11 @@ object SQLConf { .checkValues(Set("hive", "native")) .createWithDefault("native") + val ORC_VECTORIZED_READER_ENABLED = buildConf("spark.sql.orc.enableVectorizedReader") + .doc("Enables vectorized orc decoding.") + .booleanConf + .createWithDefault(true) + val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown") .doc("When true, enable filter pushdown for ORC files.") .booleanConf @@ -1183,6 +1188,8 @@ class SQLConf extends Serializable with Logging { def orcCompressionCodec: String = getConf(ORC_COMPRESSION) + def orcVectorizedReaderEnabled: Boolean = getConf(ORC_VECTORIZED_READER_ENABLED) + def parquetCompressionCodec: String = getConf(PARQUET_COMPRESSION) def parquetVectorizedReaderEnabled: Boolean = getConf(PARQUET_VECTORIZED_READER_ENABLED) diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java new file mode 100644 index 0000000000000..5c28d0e6e507a --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java @@ -0,0 +1,523 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.datasources.orc; + +import java.io.IOException; +import java.util.stream.IntStream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; +import org.apache.orc.OrcConf; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; +import org.apache.orc.mapred.OrcInputFormat; +import org.apache.orc.storage.common.type.HiveDecimal; +import org.apache.orc.storage.ql.exec.vector.*; +import org.apache.orc.storage.serde2.io.HiveDecimalWritable; + +import org.apache.spark.memory.MemoryMode; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.execution.vectorized.ColumnVectorUtils; +import org.apache.spark.sql.execution.vectorized.OffHeapColumnVector; +import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector; +import org.apache.spark.sql.execution.vectorized.WritableColumnVector; +import org.apache.spark.sql.types.*; +import org.apache.spark.sql.vectorized.ColumnarBatch; + + +/** + * To support vectorization in WholeStageCodeGen, this reader returns ColumnarBatch. + * After creating, `initialize` and `initBatch` should be called sequentially. + */ +public class OrcColumnarBatchReader extends RecordReader { + + /** + * The default size of batch. We use this value for both ORC and Spark consistently + * because they have different default values like the following. + * + * - ORC's VectorizedRowBatch.DEFAULT_SIZE = 1024 + * - Spark's ColumnarBatch.DEFAULT_BATCH_SIZE = 4 * 1024 + */ + public static final int DEFAULT_SIZE = 4 * 1024; + + // ORC File Reader + private Reader reader; + + // Vectorized ORC Row Batch + private VectorizedRowBatch batch; + + /** + * The column IDs of the physical ORC file schema which are required by this reader. + * -1 means this required column doesn't exist in the ORC file. + */ + private int[] requestedColIds; + + // Record reader from ORC row batch. + private org.apache.orc.RecordReader recordReader; + + private StructField[] requiredFields; + + // The result columnar batch for vectorized execution by whole-stage codegen. + private ColumnarBatch columnarBatch; + + // Writable column vectors of the result columnar batch. + private WritableColumnVector[] columnVectors; + + /** + * The memory mode of the columnarBatch + */ + private final MemoryMode MEMORY_MODE; + + public OrcColumnarBatchReader(boolean useOffHeap) { + MEMORY_MODE = useOffHeap ? MemoryMode.OFF_HEAP : MemoryMode.ON_HEAP; + } + + + @Override + public Void getCurrentKey() throws IOException, InterruptedException { + return null; + } + + @Override + public ColumnarBatch getCurrentValue() throws IOException, InterruptedException { + return columnarBatch; + } + + @Override + public float getProgress() throws IOException, InterruptedException { + return recordReader.getProgress(); + } + + @Override + public boolean nextKeyValue() throws IOException, InterruptedException { + return nextBatch(); + } + + @Override + public void close() throws IOException { + if (columnarBatch != null) { + columnarBatch.close(); + columnarBatch = null; + } + if (recordReader != null) { + recordReader.close(); + recordReader = null; + } + } + + /** + * Initialize ORC file reader and batch record reader. + * Please note that `initBatch` is needed to be called after this. 
+ */ + @Override + public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) + throws IOException, InterruptedException { + FileSplit fileSplit = (FileSplit)inputSplit; + Configuration conf = taskAttemptContext.getConfiguration(); + reader = OrcFile.createReader( + fileSplit.getPath(), + OrcFile.readerOptions(conf) + .maxLength(OrcConf.MAX_FILE_LENGTH.getLong(conf)) + .filesystem(fileSplit.getPath().getFileSystem(conf))); + + Reader.Options options = + OrcInputFormat.buildOptions(conf, reader, fileSplit.getStart(), fileSplit.getLength()); + recordReader = reader.rows(options); + } + + /** + * Initialize columnar batch by setting required schema and partition information. + * With this information, this creates ColumnarBatch with the full schema. + */ + public void initBatch( + TypeDescription orcSchema, + int[] requestedColIds, + StructField[] requiredFields, + StructType partitionSchema, + InternalRow partitionValues) { + batch = orcSchema.createRowBatch(DEFAULT_SIZE); + assert(!batch.selectedInUse); // `selectedInUse` should be initialized with `false`. + + this.requiredFields = requiredFields; + this.requestedColIds = requestedColIds; + assert(requiredFields.length == requestedColIds.length); + + StructType resultSchema = new StructType(requiredFields); + for (StructField f : partitionSchema.fields()) { + resultSchema = resultSchema.add(f); + } + + int capacity = DEFAULT_SIZE; + if (MEMORY_MODE == MemoryMode.OFF_HEAP) { + columnVectors = OffHeapColumnVector.allocateColumns(capacity, resultSchema); + } else { + columnVectors = OnHeapColumnVector.allocateColumns(capacity, resultSchema); + } + columnarBatch = new ColumnarBatch(resultSchema, columnVectors, capacity); + + if (partitionValues.numFields() > 0) { + int partitionIdx = requiredFields.length; + for (int i = 0; i < partitionValues.numFields(); i++) { + ColumnVectorUtils.populate(columnVectors[i + partitionIdx], partitionValues, i); + columnVectors[i + partitionIdx].setIsConstant(); + } + } + + // Initialize the missing columns once. + for (int i = 0; i < requiredFields.length; i++) { + if (requestedColIds[i] == -1) { + columnVectors[i].putNulls(0, columnarBatch.capacity()); + columnVectors[i].setIsConstant(); + } + } + } + + /** + * Return true if there exists more data in the next batch. If exists, prepare the next batch + * by copying from ORC VectorizedRowBatch columns to Spark ColumnarBatch columns. 
+ */ + private boolean nextBatch() throws IOException { + for (WritableColumnVector vector : columnVectors) { + vector.reset(); + } + columnarBatch.setNumRows(0); + + recordReader.nextBatch(batch); + int batchSize = batch.size; + if (batchSize == 0) { + return false; + } + columnarBatch.setNumRows(batchSize); + for (int i = 0; i < requiredFields.length; i++) { + StructField field = requiredFields[i]; + WritableColumnVector toColumn = columnVectors[i]; + + if (requestedColIds[i] >= 0) { + ColumnVector fromColumn = batch.cols[requestedColIds[i]]; + + if (fromColumn.isRepeating) { + putRepeatingValues(batchSize, field, fromColumn, toColumn); + } else if (fromColumn.noNulls) { + putNonNullValues(batchSize, field, fromColumn, toColumn); + } else { + putValues(batchSize, field, fromColumn, toColumn); + } + } + } + return true; + } + + private void putRepeatingValues( + int batchSize, + StructField field, + ColumnVector fromColumn, + WritableColumnVector toColumn) { + if (fromColumn.isNull[0]) { + toColumn.putNulls(0, batchSize); + } else { + DataType type = field.dataType(); + if (type instanceof BooleanType) { + toColumn.putBooleans(0, batchSize, ((LongColumnVector)fromColumn).vector[0] == 1); + } else if (type instanceof ByteType) { + toColumn.putBytes(0, batchSize, (byte)((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof ShortType) { + toColumn.putShorts(0, batchSize, (short)((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof IntegerType || type instanceof DateType) { + toColumn.putInts(0, batchSize, (int)((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof LongType) { + toColumn.putLongs(0, batchSize, ((LongColumnVector)fromColumn).vector[0]); + } else if (type instanceof TimestampType) { + toColumn.putLongs(0, batchSize, + fromTimestampColumnVector((TimestampColumnVector)fromColumn, 0)); + } else if (type instanceof FloatType) { + toColumn.putFloats(0, batchSize, (float)((DoubleColumnVector)fromColumn).vector[0]); + } else if (type instanceof DoubleType) { + toColumn.putDoubles(0, batchSize, ((DoubleColumnVector)fromColumn).vector[0]); + } else if (type instanceof StringType || type instanceof BinaryType) { + BytesColumnVector data = (BytesColumnVector)fromColumn; + WritableColumnVector arrayData = toColumn.getChildColumn(0); + int size = data.vector[0].length; + arrayData.reserve(size); + arrayData.putBytes(0, size, data.vector[0], 0); + for (int index = 0; index < batchSize; index++) { + toColumn.putArray(index, 0, size); + } + } else if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType)type; + putDecimalWritables( + toColumn, + batchSize, + decimalType.precision(), + decimalType.scale(), + ((DecimalColumnVector)fromColumn).vector[0]); + } else { + throw new UnsupportedOperationException("Unsupported Data Type: " + type); + } + } + } + + private void putNonNullValues( + int batchSize, + StructField field, + ColumnVector fromColumn, + WritableColumnVector toColumn) { + DataType type = field.dataType(); + if (type instanceof BooleanType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putBoolean(index, data[index] == 1); + } + } else if (type instanceof ByteType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putByte(index, (byte)data[index]); + } + } else if (type instanceof ShortType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; 
index < batchSize; index++) { + toColumn.putShort(index, (short)data[index]); + } + } else if (type instanceof IntegerType || type instanceof DateType) { + long[] data = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putInt(index, (int)data[index]); + } + } else if (type instanceof LongType) { + toColumn.putLongs(0, batchSize, ((LongColumnVector)fromColumn).vector, 0); + } else if (type instanceof TimestampType) { + TimestampColumnVector data = ((TimestampColumnVector)fromColumn); + for (int index = 0; index < batchSize; index++) { + toColumn.putLong(index, fromTimestampColumnVector(data, index)); + } + } else if (type instanceof FloatType) { + double[] data = ((DoubleColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + toColumn.putFloat(index, (float)data[index]); + } + } else if (type instanceof DoubleType) { + toColumn.putDoubles(0, batchSize, ((DoubleColumnVector)fromColumn).vector, 0); + } else if (type instanceof StringType || type instanceof BinaryType) { + BytesColumnVector data = ((BytesColumnVector)fromColumn); + WritableColumnVector arrayData = toColumn.getChildColumn(0); + int totalNumBytes = IntStream.of(data.length).sum(); + arrayData.reserve(totalNumBytes); + for (int index = 0, pos = 0; index < batchSize; pos += data.length[index], index++) { + arrayData.putBytes(pos, data.length[index], data.vector[index], data.start[index]); + toColumn.putArray(index, pos, data.length[index]); + } + } else if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType)type; + DecimalColumnVector data = ((DecimalColumnVector)fromColumn); + if (decimalType.precision() > Decimal.MAX_LONG_DIGITS()) { + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.reserve(batchSize * 16); + } + for (int index = 0; index < batchSize; index++) { + putDecimalWritable( + toColumn, + index, + decimalType.precision(), + decimalType.scale(), + data.vector[index]); + } + } else { + throw new UnsupportedOperationException("Unsupported Data Type: " + type); + } + } + + private void putValues( + int batchSize, + StructField field, + ColumnVector fromColumn, + WritableColumnVector toColumn) { + DataType type = field.dataType(); + if (type instanceof BooleanType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putBoolean(index, vector[index] == 1); + } + } + } else if (type instanceof ByteType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putByte(index, (byte)vector[index]); + } + } + } else if (type instanceof ShortType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putShort(index, (short)vector[index]); + } + } + } else if (type instanceof IntegerType || type instanceof DateType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putInt(index, (int)vector[index]); + } + } + } else if (type instanceof LongType) { + long[] vector = ((LongColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { 
+ if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putLong(index, vector[index]); + } + } + } else if (type instanceof TimestampType) { + TimestampColumnVector vector = ((TimestampColumnVector)fromColumn); + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putLong(index, fromTimestampColumnVector(vector, index)); + } + } + } else if (type instanceof FloatType) { + double[] vector = ((DoubleColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putFloat(index, (float)vector[index]); + } + } + } else if (type instanceof DoubleType) { + double[] vector = ((DoubleColumnVector)fromColumn).vector; + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + toColumn.putDouble(index, vector[index]); + } + } + } else if (type instanceof StringType || type instanceof BinaryType) { + BytesColumnVector vector = (BytesColumnVector)fromColumn; + WritableColumnVector arrayData = toColumn.getChildColumn(0); + int totalNumBytes = IntStream.of(vector.length).sum(); + arrayData.reserve(totalNumBytes); + for (int index = 0, pos = 0; index < batchSize; pos += vector.length[index], index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + arrayData.putBytes(pos, vector.length[index], vector.vector[index], vector.start[index]); + toColumn.putArray(index, pos, vector.length[index]); + } + } + } else if (type instanceof DecimalType) { + DecimalType decimalType = (DecimalType)type; + HiveDecimalWritable[] vector = ((DecimalColumnVector)fromColumn).vector; + if (decimalType.precision() > Decimal.MAX_LONG_DIGITS()) { + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.reserve(batchSize * 16); + } + for (int index = 0; index < batchSize; index++) { + if (fromColumn.isNull[index]) { + toColumn.putNull(index); + } else { + putDecimalWritable( + toColumn, + index, + decimalType.precision(), + decimalType.scale(), + vector[index]); + } + } + } else { + throw new UnsupportedOperationException("Unsupported Data Type: " + type); + } + } + + /** + * Returns the number of micros since epoch from an element of TimestampColumnVector. + */ + private static long fromTimestampColumnVector(TimestampColumnVector vector, int index) { + return vector.time[index] * 1000L + vector.nanos[index] / 1000L; + } + + /** + * Put a `HiveDecimalWritable` to a `WritableColumnVector`. + */ + private static void putDecimalWritable( + WritableColumnVector toColumn, + int index, + int precision, + int scale, + HiveDecimalWritable decimalWritable) { + HiveDecimal decimal = decimalWritable.getHiveDecimal(); + Decimal value = + Decimal.apply(decimal.bigDecimalValue(), decimal.precision(), decimal.scale()); + value.changePrecision(precision, scale); + + if (precision <= Decimal.MAX_INT_DIGITS()) { + toColumn.putInt(index, (int) value.toUnscaledLong()); + } else if (precision <= Decimal.MAX_LONG_DIGITS()) { + toColumn.putLong(index, value.toUnscaledLong()); + } else { + byte[] bytes = value.toJavaBigDecimal().unscaledValue().toByteArray(); + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.putBytes(index * 16, bytes.length, bytes, 0); + toColumn.putArray(index, index * 16, bytes.length); + } + } + + /** + * Put `HiveDecimalWritable`s to a `WritableColumnVector`. 
+ */ + private static void putDecimalWritables( + WritableColumnVector toColumn, + int size, + int precision, + int scale, + HiveDecimalWritable decimalWritable) { + HiveDecimal decimal = decimalWritable.getHiveDecimal(); + Decimal value = + Decimal.apply(decimal.bigDecimalValue(), decimal.precision(), decimal.scale()); + value.changePrecision(precision, scale); + + if (precision <= Decimal.MAX_INT_DIGITS()) { + toColumn.putInts(0, size, (int) value.toUnscaledLong()); + } else if (precision <= Decimal.MAX_LONG_DIGITS()) { + toColumn.putLongs(0, size, value.toUnscaledLong()); + } else { + byte[] bytes = value.toJavaBigDecimal().unscaledValue().toByteArray(); + WritableColumnVector arrayData = toColumn.getChildColumn(0); + arrayData.reserve(bytes.length); + arrayData.putBytes(0, bytes.length, bytes, 0); + for (int index = 0; index < size; index++) { + toColumn.putArray(index, 0, bytes.length); + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index f7471cd7debce..b8bacfa1838ae 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -118,6 +118,13 @@ class OrcFileFormat } } + override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = { + val conf = sparkSession.sessionState.conf + conf.orcVectorizedReaderEnabled && conf.wholeStageEnabled && + schema.length <= conf.wholeStageMaxNumFields && + schema.forall(_.dataType.isInstanceOf[AtomicType]) + } + override def isSplitable( sparkSession: SparkSession, options: Map[String, String], @@ -139,6 +146,11 @@ class OrcFileFormat } } + val resultSchema = StructType(requiredSchema.fields ++ partitionSchema.fields) + val sqlConf = sparkSession.sessionState.conf + val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled + val enableVectorizedReader = supportBatch(sparkSession, resultSchema) + val broadcastedConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) val isCaseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis @@ -146,8 +158,14 @@ class OrcFileFormat (file: PartitionedFile) => { val conf = broadcastedConf.value.value + val filePath = new Path(new URI(file.filePath)) + + val fs = filePath.getFileSystem(conf) + val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) + val reader = OrcFile.createReader(filePath, readerOptions) + val requestedColIdsOrEmptyFile = OrcUtils.requestedColumnIds( - isCaseSensitive, dataSchema, requiredSchema, new Path(new URI(file.filePath)), conf) + isCaseSensitive, dataSchema, requiredSchema, reader, conf) if (requestedColIdsOrEmptyFile.isEmpty) { Iterator.empty @@ -155,29 +173,46 @@ class OrcFileFormat val requestedColIds = requestedColIdsOrEmptyFile.get assert(requestedColIds.length == requiredSchema.length, "[BUG] requested column IDs do not match required schema") - conf.set(OrcConf.INCLUDE_COLUMNS.getAttribute, + val taskConf = new Configuration(conf) + taskConf.set(OrcConf.INCLUDE_COLUMNS.getAttribute, requestedColIds.filter(_ != -1).sorted.mkString(",")) - val fileSplit = - new FileSplit(new Path(new URI(file.filePath)), file.start, file.length, Array.empty) + val fileSplit = new FileSplit(filePath, file.start, file.length, Array.empty) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) - val 
taskAttemptContext = new TaskAttemptContextImpl(conf, attemptId) - - val orcRecordReader = new OrcInputFormat[OrcStruct] - .createRecordReader(fileSplit, taskAttemptContext) - val iter = new RecordReaderIterator[OrcStruct](orcRecordReader) - Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) - - val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes - val unsafeProjection = GenerateUnsafeProjection.generate(fullSchema, fullSchema) - val deserializer = new OrcDeserializer(dataSchema, requiredSchema, requestedColIds) - - if (partitionSchema.length == 0) { - iter.map(value => unsafeProjection(deserializer.deserialize(value))) + val taskAttemptContext = new TaskAttemptContextImpl(taskConf, attemptId) + + val taskContext = Option(TaskContext.get()) + if (enableVectorizedReader) { + val batchReader = + new OrcColumnarBatchReader(enableOffHeapColumnVector && taskContext.isDefined) + batchReader.initialize(fileSplit, taskAttemptContext) + batchReader.initBatch( + reader.getSchema, + requestedColIds, + requiredSchema.fields, + partitionSchema, + file.partitionValues) + + val iter = new RecordReaderIterator(batchReader) + Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) + iter.asInstanceOf[Iterator[InternalRow]] } else { - val joinedRow = new JoinedRow() - iter.map(value => - unsafeProjection(joinedRow(deserializer.deserialize(value), file.partitionValues))) + val orcRecordReader = new OrcInputFormat[OrcStruct] + .createRecordReader(fileSplit, taskAttemptContext) + val iter = new RecordReaderIterator[OrcStruct](orcRecordReader) + Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => iter.close())) + + val fullSchema = requiredSchema.toAttributes ++ partitionSchema.toAttributes + val unsafeProjection = GenerateUnsafeProjection.generate(fullSchema, fullSchema) + val deserializer = new OrcDeserializer(dataSchema, requiredSchema, requestedColIds) + + if (partitionSchema.length == 0) { + iter.map(value => unsafeProjection(deserializer.deserialize(value))) + } else { + val joinedRow = new JoinedRow() + iter.map(value => + unsafeProjection(joinedRow(deserializer.deserialize(value), file.partitionValues))) + } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala index b03ee06d04a16..13a23996f4ade 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -21,7 +21,7 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} -import org.apache.orc.{OrcFile, TypeDescription} +import org.apache.orc.{OrcFile, Reader, TypeDescription} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging @@ -80,11 +80,8 @@ object OrcUtils extends Logging { isCaseSensitive: Boolean, dataSchema: StructType, requiredSchema: StructType, - file: Path, + reader: Reader, conf: Configuration): Option[Array[Int]] = { - val fs = file.getFileSystem(conf) - val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val reader = OrcFile.createReader(file, readerOptions) val orcFieldNames = reader.getSchema.getFieldNames.asScala if (orcFieldNames.isEmpty) { // SPARK-8501: Some old empty ORC files always have an empty schema stored in their footer. 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala new file mode 100644 index 0000000000000..37ed846acd1eb --- /dev/null +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala @@ -0,0 +1,435 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.orc + +import java.io.File + +import scala.util.{Random, Try} + +import org.apache.spark.SparkConf +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ +import org.apache.spark.util.{Benchmark, Utils} + + +/** + * Benchmark to measure ORC read performance. + * + * This is in `sql/hive` module in order to compare `sql/core` and `sql/hive` ORC data sources. + */ +// scalastyle:off line.size.limit +object OrcReadBenchmark { + val conf = new SparkConf() + conf.set("orc.compression", "snappy") + + private val spark = SparkSession.builder() + .master("local[1]") + .appName("OrcReadBenchmark") + .config(conf) + .getOrCreate() + + // Set default configs. Individual cases will change them if necessary. 
+ spark.conf.set(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key, "true") + + def withTempPath(f: File => Unit): Unit = { + val path = Utils.createTempDir() + path.delete() + try f(path) finally Utils.deleteRecursively(path) + } + + def withTempTable(tableNames: String*)(f: => Unit): Unit = { + try f finally tableNames.foreach(spark.catalog.dropTempView) + } + + def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + val (keys, values) = pairs.unzip + val currentValues = keys.map(key => Try(spark.conf.get(key)).toOption) + (keys, values).zipped.foreach(spark.conf.set) + try f finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => spark.conf.set(key, value) + case (key, None) => spark.conf.unset(key) + } + } + } + + private val NATIVE_ORC_FORMAT = classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat].getCanonicalName + private val HIVE_ORC_FORMAT = classOf[org.apache.spark.sql.hive.orc.OrcFileFormat].getCanonicalName + + private def prepareTable(dir: File, df: DataFrame, partition: Option[String] = None): Unit = { + val dirORC = dir.getCanonicalPath + + if (partition.isDefined) { + df.write.partitionBy(partition.get).orc(dirORC) + } else { + df.write.orc(dirORC) + } + + spark.read.format(NATIVE_ORC_FORMAT).load(dirORC).createOrReplaceTempView("nativeOrcTable") + spark.read.format(HIVE_ORC_FORMAT).load(dirORC).createOrReplaceTempView("hiveOrcTable") + } + + def numericScanBenchmark(values: Int, dataType: DataType): Unit = { + val sqlBenchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1")) + + sqlBenchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + sqlBenchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + + sqlBenchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1192 / 1221 13.2 75.8 1.0X + Native ORC Vectorized 161 / 170 97.5 10.3 7.4X + Hive built-in ORC 1399 / 1413 11.2 89.0 0.9X + + SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1287 / 1333 12.2 81.8 1.0X + Native ORC Vectorized 164 / 172 95.6 10.5 7.8X + Hive built-in ORC 1629 / 1650 9.7 103.6 0.8X + + SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1304 / 1388 12.1 82.9 1.0X + Native ORC Vectorized 227 / 240 69.3 14.4 5.7X + Hive built-in ORC 1866 / 1867 8.4 118.6 0.7X + + SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 
1331 / 1357 11.8 84.6 1.0X + Native ORC Vectorized 289 / 297 54.4 18.4 4.6X + Hive built-in ORC 1922 / 1929 8.2 122.2 0.7X + + SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1410 / 1428 11.2 89.7 1.0X + Native ORC Vectorized 328 / 335 48.0 20.8 4.3X + Hive built-in ORC 1929 / 2012 8.2 122.6 0.7X + + SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1467 / 1485 10.7 93.3 1.0X + Native ORC Vectorized 402 / 411 39.1 25.6 3.6X + Hive built-in ORC 2023 / 2042 7.8 128.6 0.7X + */ + sqlBenchmark.run() + } + } + } + + def intStringScanBenchmark(values: Int): Unit = { + val benchmark = new Benchmark("Int and String Scan", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") + + prepareTable( + dir, + spark.sql("SELECT CAST(value AS INT) AS c1, CAST(value as STRING) AS c2 FROM t1")) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2729 / 2744 3.8 260.2 1.0X + Native ORC Vectorized 1318 / 1344 8.0 125.7 2.1X + Hive built-in ORC 3731 / 3782 2.8 355.8 0.7X + */ + benchmark.run() + } + } + } + + def partitionTableScanBenchmark(values: Int): Unit = { + val benchmark = new Benchmark("Partitioned Table", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + spark.range(values).map(_ => Random.nextLong).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id FROM t1"), Some("p")) + + benchmark.addCase("Read data column - Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Read data column - Native ORC Vectorized") { _ => + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Read data column - Hive built-in ORC") { _ => + spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() + } + + benchmark.addCase("Read partition column - Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Read partition column - Native ORC Vectorized") { _ => + spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Read partition column - Hive built-in ORC") { _ => + spark.sql("SELECT sum(p) FROM hiveOrcTable").collect() + } + + benchmark.addCase("Read both columns - 
Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Read both columns - Native ORC Vectorized") { _ => + spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Read both columns - Hive built-in ORC") { _ => + spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Read data column - Native ORC MR 1531 / 1536 10.3 97.4 1.0X + Read data column - Native ORC Vectorized 295 / 298 53.3 18.8 5.2X + Read data column - Hive built-in ORC 2125 / 2126 7.4 135.1 0.7X + Read partition column - Native ORC MR 1049 / 1062 15.0 66.7 1.5X + Read partition column - Native ORC Vectorized 54 / 57 290.1 3.4 28.2X + Read partition column - Hive built-in ORC 1282 / 1291 12.3 81.5 1.2X + Read both columns - Native ORC MR 1594 / 1598 9.9 101.3 1.0X + Read both columns - Native ORC Vectorized 332 / 336 47.4 21.1 4.6X + Read both columns - Hive built-in ORC 2145 / 2187 7.3 136.4 0.7X + */ + benchmark.run() + } + } + } + + def repeatedStringScanBenchmark(values: Int): Unit = { + val benchmark = new Benchmark("Repeated String", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + spark.range(values).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql("SELECT CAST((id % 200) + 10000 as STRING) AS c1 FROM t1")) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() + } + + benchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1325 / 1328 7.9 126.4 1.0X + Native ORC Vectorized 320 / 330 32.8 30.5 4.1X + Hive built-in ORC 1971 / 1972 5.3 188.0 0.7X + */ + benchmark.run() + } + } + } + + def stringWithNullsScanBenchmark(values: Int, fractionOfNulls: Double): Unit = { + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + spark.range(values).createOrReplaceTempView("t1") + + prepareTable( + dir, + spark.sql( + s"SELECT IF(RAND(1) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c1, " + + s"IF(RAND(2) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c2 FROM t1")) + + val benchmark = new Benchmark(s"String with Nulls Scan ($fractionOfNulls%)", values) + + benchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + } + + benchmark.addCase("Native ORC Vectorized") { _ => + spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + + 
benchmark.addCase("Hive built-in ORC") { _ => + spark.sql("SELECT SUM(LENGTH(c2)) FROM hiveOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2553 / 2554 4.1 243.4 1.0X + Native ORC Vectorized 953 / 954 11.0 90.9 2.7X + Hive built-in ORC 3875 / 3898 2.7 369.6 0.7X + + String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2389 / 2408 4.4 227.8 1.0X + Native ORC Vectorized 1208 / 1209 8.7 115.2 2.0X + Hive built-in ORC 2940 / 2952 3.6 280.4 0.8X + + String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1295 / 1311 8.1 123.5 1.0X + Native ORC Vectorized 449 / 457 23.4 42.8 2.9X + Hive built-in ORC 1649 / 1660 6.4 157.3 0.8X + */ + benchmark.run() + } + } + } + + def columnsBenchmark(values: Int, width: Int): Unit = { + val sqlBenchmark = new Benchmark(s"SQL Single Column Scan from $width columns", values) + + withTempPath { dir => + withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { + import spark.implicits._ + val middle = width / 2 + val selectExpr = (1 to width).map(i => s"value as c$i") + spark.range(values).map(_ => Random.nextLong).toDF() + .selectExpr(selectExpr: _*).createOrReplaceTempView("t1") + + prepareTable(dir, spark.sql("SELECT * FROM t1")) + + sqlBenchmark.addCase("Native ORC MR") { _ => + withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + } + } + + sqlBenchmark.addCase("Native ORC Vectorized") { _ => + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + } + + sqlBenchmark.addCase("Hive built-in ORC") { _ => + spark.sql(s"SELECT sum(c$middle) FROM hiveOrcTable").collect() + } + + /* + Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 + Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + + SQL Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 1103 / 1124 1.0 1052.0 1.0X + Native ORC Vectorized 92 / 100 11.4 87.9 12.0X + Hive built-in ORC 383 / 390 2.7 365.4 2.9X + + SQL Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 2245 / 2250 0.5 2141.0 1.0X + Native ORC Vectorized 157 / 165 6.7 150.2 14.3X + Hive built-in ORC 587 / 593 1.8 559.4 3.8X + + SQL Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + ------------------------------------------------------------------------------------------------ + Native ORC MR 3343 / 3350 0.3 3188.3 1.0X + Native ORC Vectorized 265 / 280 3.9 253.2 12.6X + Hive built-in ORC 828 / 842 1.3 789.8 4.0X + */ + sqlBenchmark.run() + } + } + } + + def main(args: Array[String]): Unit = { + Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType => + 
numericScanBenchmark(1024 * 1024 * 15, dataType) + } + intStringScanBenchmark(1024 * 1024 * 10) + partitionTableScanBenchmark(1024 * 1024 * 15) + repeatedStringScanBenchmark(1024 * 1024 * 10) + for (fractionOfNulls <- List(0.0, 0.50, 0.95)) { + stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls) + } + columnsBenchmark(1024 * 1024 * 1, 100) + columnsBenchmark(1024 * 1024 * 1, 200) + columnsBenchmark(1024 * 1024 * 1, 300) + } +} +// scalastyle:on line.size.limit From 2250cb75b99d257e698fe5418a51d8cddb4d5104 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Tue, 9 Jan 2018 21:58:55 +0800 Subject: [PATCH 297/356] [SPARK-22981][SQL] Fix incorrect results of Casting Struct to String ## What changes were proposed in this pull request? This pr fixed the issue when casting structs into strings; ``` scala> val df = Seq(((1, "a"), 0), ((2, "b"), 0)).toDF("a", "b") scala> df.write.saveAsTable("t") scala> sql("SELECT CAST(a AS STRING) FROM t").show +-------------------+ | a| +-------------------+ |[0,1,1800000001,61]| |[0,2,1800000001,62]| +-------------------+ ``` This pr modified the result into; ``` +------+ | a| +------+ |[1, a]| |[2, b]| +------+ ``` ## How was this patch tested? Added tests in `CastSuite`. Author: Takeshi Yamamuro Closes #20176 from maropu/SPARK-22981. --- .../spark/sql/catalyst/expressions/Cast.scala | 71 +++++++++++++++++++ .../sql/catalyst/expressions/CastSuite.scala | 16 +++++ 2 files changed, 87 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index f2de4c8e30bec..f21aa1e9e3135 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -259,6 +259,29 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String builder.append("]") builder.build() }) + case StructType(fields) => + buildCast[InternalRow](_, row => { + val builder = new UTF8StringBuilder + builder.append("[") + if (row.numFields > 0) { + val st = fields.map(_.dataType) + val toUTF8StringFuncs = st.map(castToString) + if (!row.isNullAt(0)) { + builder.append(toUTF8StringFuncs(0)(row.get(0, st(0))).asInstanceOf[UTF8String]) + } + var i = 1 + while (i < row.numFields) { + builder.append(",") + if (!row.isNullAt(i)) { + builder.append(" ") + builder.append(toUTF8StringFuncs(i)(row.get(i, st(i))).asInstanceOf[UTF8String]) + } + i += 1 + } + } + builder.append("]") + builder.build() + }) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } @@ -732,6 +755,41 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String """.stripMargin } + private def writeStructToStringBuilder( + st: Seq[DataType], + row: String, + buffer: String, + ctx: CodegenContext): String = { + val structToStringCode = st.zipWithIndex.map { case (ft, i) => + val fieldToStringCode = castToStringCode(ft, ctx) + val field = ctx.freshName("field") + val fieldStr = ctx.freshName("fieldStr") + s""" + |${if (i != 0) s"""$buffer.append(",");""" else ""} + |if (!$row.isNullAt($i)) { + | ${if (i != 0) s"""$buffer.append(" ");""" else ""} + | + | // Append $i field into the string buffer + | ${ctx.javaType(ft)} $field = ${ctx.getValue(row, ft, s"$i")}; + | UTF8String $fieldStr = null; + | ${fieldToStringCode(field, fieldStr, null /* resultIsNull won't be used */)} + | $buffer.append($fieldStr); + |} + """.stripMargin + 
} + + val writeStructCode = ctx.splitExpressions( + expressions = structToStringCode, + funcName = "fieldToString", + arguments = ("InternalRow", row) :: (classOf[UTF8StringBuilder].getName, buffer) :: Nil) + + s""" + |$buffer.append("["); + |$writeStructCode + |$buffer.append("]"); + """.stripMargin + } + private[this] def castToStringCode(from: DataType, ctx: CodegenContext): CastFunction = { from match { case BinaryType => @@ -765,6 +823,19 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String |$evPrim = $buffer.build(); """.stripMargin } + case StructType(fields) => + (c, evPrim, evNull) => { + val row = ctx.freshName("row") + val buffer = ctx.freshName("buffer") + val bufferClass = classOf[UTF8StringBuilder].getName + val writeStructCode = writeStructToStringBuilder(fields.map(_.dataType), row, buffer, ctx) + s""" + |InternalRow $row = $c; + |$bufferClass $buffer = new $bufferClass(); + |$writeStructCode + |$evPrim = $buffer.build(); + """.stripMargin + } case _ => (c, evPrim, evNull) => s"$evPrim = UTF8String.fromString(String.valueOf($c));" } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala index 1445bb8a97d40..5b25bdf907c3a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CastSuite.scala @@ -906,4 +906,20 @@ class CastSuite extends SparkFunSuite with ExpressionEvalHelper { StringType) checkEvaluation(ret5, "[1 -> [1, 2, 3], 2 -> [4, 5, 6]]") } + + test("SPARK-22981 Cast struct to string") { + val ret1 = cast(Literal.create((1, "a", 0.1)), StringType) + checkEvaluation(ret1, "[1, a, 0.1]") + val ret2 = cast(Literal.create(Tuple3[Int, String, String](1, null, "a")), StringType) + checkEvaluation(ret2, "[1,, a]") + val ret3 = cast(Literal.create( + (Date.valueOf("2014-12-03"), Timestamp.valueOf("2014-12-03 15:05:00"))), StringType) + checkEvaluation(ret3, "[2014-12-03, 2014-12-03 15:05:00]") + val ret4 = cast(Literal.create(((1, "a"), 5, 0.1)), StringType) + checkEvaluation(ret4, "[[1, a], 5, 0.1]") + val ret5 = cast(Literal.create((Seq(1, 2, 3), "a", 0.1)), StringType) + checkEvaluation(ret5, "[[1, 2, 3], a, 0.1]") + val ret6 = cast(Literal.create((1, Map(1 -> "a", 2 -> "b", 3 -> "c"))), StringType) + checkEvaluation(ret6, "[1, [1 -> a, 2 -> b, 3 -> c]]") + } } From 96ba217a06fbe1dad703447d7058cb7841653861 Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Wed, 10 Jan 2018 10:15:27 +0800 Subject: [PATCH 298/356] [SPARK-23005][CORE] Improve RDD.take on small number of partitions ## What changes were proposed in this pull request? In current implementation of RDD.take, we overestimate the number of partitions we need to try by 50%: `(1.5 * num * partsScanned / buf.size).toInt` However, when the number is small, the result of `.toInt` is not what we want. E.g, 2.9 will become 2, which should be 3. Use Math.ceil to fix the problem. Also clean up the code in RDD.scala. ## How was this patch tested? Unit test Author: Wang Gengliang Closes #20200 from gengliangwang/Take. 
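To make the rounding problem concrete, a quick illustration of the two expressions (the 2.9 estimate is the example from the description above):

```
// Illustrative arithmetic only: toInt truncates the partition estimate, Math.ceil rounds it up.
val estimate = 2.9
estimate.toInt             // 2, one partition short of what take() actually needs
math.ceil(estimate).toInt  // 3, the behaviour after this change
```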
--- .../main/scala/org/apache/spark/rdd/RDD.scala | 27 +++++++++---------- .../spark/sql/execution/SparkPlan.scala | 5 ++-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index 8798dfc925362..7859781e98223 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -150,7 +150,7 @@ abstract class RDD[T: ClassTag]( val id: Int = sc.newRddId() /** A friendly name for this RDD */ - @transient var name: String = null + @transient var name: String = _ /** Assign a name to this RDD */ def setName(_name: String): this.type = { @@ -224,8 +224,8 @@ abstract class RDD[T: ClassTag]( // Our dependencies and partitions will be gotten by calling subclass's methods below, and will // be overwritten when we're checkpointed - private var dependencies_ : Seq[Dependency[_]] = null - @transient private var partitions_ : Array[Partition] = null + private var dependencies_ : Seq[Dependency[_]] = _ + @transient private var partitions_ : Array[Partition] = _ /** An Option holding our checkpoint RDD, if we are checkpointed */ private def checkpointRDD: Option[CheckpointRDD[T]] = checkpointData.flatMap(_.checkpointRDD) @@ -297,7 +297,7 @@ abstract class RDD[T: ClassTag]( private[spark] def getNarrowAncestors: Seq[RDD[_]] = { val ancestors = new mutable.HashSet[RDD[_]] - def visit(rdd: RDD[_]) { + def visit(rdd: RDD[_]): Unit = { val narrowDependencies = rdd.dependencies.filter(_.isInstanceOf[NarrowDependency[_]]) val narrowParents = narrowDependencies.map(_.rdd) val narrowParentsNotVisited = narrowParents.filterNot(ancestors.contains) @@ -449,7 +449,7 @@ abstract class RDD[T: ClassTag]( if (shuffle) { /** Distributes elements evenly across output partitions, starting from a random partition. */ val distributePartition = (index: Int, items: Iterator[T]) => { - var position = (new Random(hashing.byteswap32(index))).nextInt(numPartitions) + var position = new Random(hashing.byteswap32(index)).nextInt(numPartitions) items.map { t => // Note that the hash code of the key will just be the key itself. The HashPartitioner // will mod it with the number of total partitions. @@ -951,7 +951,7 @@ abstract class RDD[T: ClassTag]( def collectPartition(p: Int): Array[T] = { sc.runJob(this, (iter: Iterator[T]) => iter.toArray, Seq(p)).head } - (0 until partitions.length).iterator.flatMap(i => collectPartition(i)) + partitions.indices.iterator.flatMap(i => collectPartition(i)) } /** @@ -1338,6 +1338,7 @@ abstract class RDD[T: ClassTag]( // The number of partitions to try in this iteration. It is ok for this number to be // greater than totalParts because we actually cap it at totalParts in runJob. var numPartsToTry = 1L + val left = num - buf.size if (partsScanned > 0) { // If we didn't find any rows after the previous iteration, quadruple and retry. 
// Otherwise, interpolate the number of partitions we need to try, but overestimate @@ -1345,13 +1346,12 @@ abstract class RDD[T: ClassTag]( if (buf.isEmpty) { numPartsToTry = partsScanned * scaleUpFactor } else { - // the left side of max is >=1 whenever partsScanned >= 2 - numPartsToTry = Math.max((1.5 * num * partsScanned / buf.size).toInt - partsScanned, 1) + // As left > 0, numPartsToTry is always >= 1 + numPartsToTry = Math.ceil(1.5 * left * partsScanned / buf.size).toInt numPartsToTry = Math.min(numPartsToTry, partsScanned * scaleUpFactor) } } - val left = num - buf.size val p = partsScanned.until(math.min(partsScanned + numPartsToTry, totalParts).toInt) val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, p) @@ -1677,8 +1677,7 @@ abstract class RDD[T: ClassTag]( // an RDD and its parent in every batch, in which case the parent may never be checkpointed // and its lineage never truncated, leading to OOMs in the long run (SPARK-6847). private val checkpointAllMarkedAncestors = - Option(sc.getLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS)) - .map(_.toBoolean).getOrElse(false) + Option(sc.getLocalProperty(RDD.CHECKPOINT_ALL_MARKED_ANCESTORS)).exists(_.toBoolean) /** Returns the first parent RDD */ protected[spark] def firstParent[U: ClassTag]: RDD[U] = { @@ -1686,7 +1685,7 @@ abstract class RDD[T: ClassTag]( } /** Returns the jth parent RDD: e.g. rdd.parent[T](0) is equivalent to rdd.firstParent[T] */ - protected[spark] def parent[U: ClassTag](j: Int) = { + protected[spark] def parent[U: ClassTag](j: Int): RDD[U] = { dependencies(j).rdd.asInstanceOf[RDD[U]] } @@ -1754,7 +1753,7 @@ abstract class RDD[T: ClassTag]( * collected. Subclasses of RDD may override this method for implementing their own cleaning * logic. See [[org.apache.spark.rdd.UnionRDD]] for an example. */ - protected def clearDependencies() { + protected def clearDependencies(): Unit = { dependencies_ = null } @@ -1790,7 +1789,7 @@ abstract class RDD[T: ClassTag]( val lastDepStrings = debugString(lastDep.rdd, prefix, lastDep.isInstanceOf[ShuffleDependency[_, _, _]], true) - (frontDepStrings ++ lastDepStrings) + frontDepStrings ++ lastDepStrings } } // The first RDD in the dependency stack has no parents, so no need for a +- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index 82300efc01632..398758a3331b4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -351,8 +351,9 @@ abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializ if (buf.isEmpty) { numPartsToTry = partsScanned * limitScaleUpFactor } else { - // the left side of max is >=1 whenever partsScanned >= 2 - numPartsToTry = Math.max((1.5 * n * partsScanned / buf.size).toInt - partsScanned, 1) + val left = n - buf.size + // As left > 0, numPartsToTry is always >= 1 + numPartsToTry = Math.ceil(1.5 * left * partsScanned / buf.size).toInt numPartsToTry = Math.min(numPartsToTry, partsScanned * limitScaleUpFactor) } } From 6f169ca9e1444fe8fd1ab6f3fbf0a8be1670f1b5 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 10 Jan 2018 10:20:34 +0800 Subject: [PATCH 299/356] [MINOR] fix a typo in BroadcastJoinSuite ## What changes were proposed in this pull request? `BroadcastNestedLoopJoinExec` should be `BroadcastHashJoinExec` ## How was this patch tested? N/A Author: Wenchen Fan Closes #20202 from cloud-fan/typo. 
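For context on why this mattered: with two identical type patterns the second `case` arm can never match, so the assertions intended for `BroadcastHashJoinExec` were never exercised. A minimal illustration with stand-in types (these are not Spark classes):

```
// Stand-in types, not Spark's: a duplicated type pattern makes the second arm dead code
// (scalac normally flags it as unreachable), which is how the missing
// BroadcastHashJoinExec branch went unnoticed.
sealed trait Plan
final class NestedLoop extends Plan
final class HashJoin extends Plan

def describe(p: Plan): String = p match {
  case _: NestedLoop => "nested loop"
  case _: NestedLoop => "never reached"  // same pattern repeated, like the original typo
  case _: HashJoin   => "hash join"
}
```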
--- .../apache/spark/sql/execution/joins/BroadcastJoinSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala index 6da46ea3480b3..0bcd54e1fceab 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala @@ -318,7 +318,7 @@ class BroadcastJoinSuite extends QueryTest with SQLTestUtils { case b: BroadcastNestedLoopJoinExec => assert(b.getClass.getSimpleName === joinMethod) assert(b.buildSide === buildSide) - case b: BroadcastNestedLoopJoinExec => + case b: BroadcastHashJoinExec => assert(b.getClass.getSimpleName === joinMethod) assert(b.buildSide === buildSide) case w: WholeStageCodegenExec => From 7bcc2666810cefc85dfa0d6679ac7a0de9e23154 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 10 Jan 2018 14:00:07 +0900 Subject: [PATCH 300/356] [SPARK-23018][PYTHON] Fix createDataFrame from Pandas timestamp series assignment ## What changes were proposed in this pull request? This fixes createDataFrame from Pandas to only assign modified timestamp series back to a copied version of the Pandas DataFrame. Previously, if the Pandas DataFrame was only a reference (e.g. a slice of another) each series will still get assigned back to the reference even if it is not a modified timestamp column. This caused the following warning "SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame." ## How was this patch tested? existing tests Author: Bryan Cutler Closes #20213 from BryanCutler/pyspark-createDataFrame-copy-slice-warn-SPARK-23018. --- python/pyspark/sql/session.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 6052fa9e84096..3e4574729a631 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -459,21 +459,23 @@ def _convert_from_pandas(self, pdf, schema, timezone): # TODO: handle nested timestamps, such as ArrayType(TimestampType())? 
if isinstance(field.dataType, TimestampType): s = _check_series_convert_timestamps_tz_local(pdf[field.name], timezone) - if not copied and s is not pdf[field.name]: - # Copy once if the series is modified to prevent the original Pandas - # DataFrame from being updated - pdf = pdf.copy() - copied = True - pdf[field.name] = s + if s is not pdf[field.name]: + if not copied: + # Copy once if the series is modified to prevent the original + # Pandas DataFrame from being updated + pdf = pdf.copy() + copied = True + pdf[field.name] = s else: for column, series in pdf.iteritems(): - s = _check_series_convert_timestamps_tz_local(pdf[column], timezone) - if not copied and s is not pdf[column]: - # Copy once if the series is modified to prevent the original Pandas - # DataFrame from being updated - pdf = pdf.copy() - copied = True - pdf[column] = s + s = _check_series_convert_timestamps_tz_local(series, timezone) + if s is not series: + if not copied: + # Copy once if the series is modified to prevent the original + # Pandas DataFrame from being updated + pdf = pdf.copy() + copied = True + pdf[column] = s # Convert pandas.DataFrame to list of numpy records np_records = pdf.to_records(index=False) From e5998372487af20114e160264a594957344ff433 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 10 Jan 2018 14:55:24 +0900 Subject: [PATCH 301/356] [SPARK-23009][PYTHON] Fix for non-str col names to createDataFrame from Pandas ## What changes were proposed in this pull request? This the case when calling `SparkSession.createDataFrame` using a Pandas DataFrame that has non-str column labels. The column name conversion logic to handle non-string or unicode in python2 is: ``` if column is not any type of string: name = str(column) else if column is unicode in Python 2: name = column.encode('utf-8') ``` ## How was this patch tested? Added a new test with a Pandas DataFrame that has int column labels Author: Bryan Cutler Closes #20210 from BryanCutler/python-createDataFrame-int-col-error-SPARK-23009. 
--- python/pyspark/sql/session.py | 4 +++- python/pyspark/sql/tests.py | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/session.py b/python/pyspark/sql/session.py index 3e4574729a631..604021c1f45cc 100644 --- a/python/pyspark/sql/session.py +++ b/python/pyspark/sql/session.py @@ -648,7 +648,9 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=Tr # If no schema supplied by user then get the names of columns only if schema is None: - schema = [x.encode('utf-8') if not isinstance(x, str) else x for x in data.columns] + schema = [str(x) if not isinstance(x, basestring) else + (x.encode('utf-8') if not isinstance(x, str) else x) + for x in data.columns] if self.conf.get("spark.sql.execution.arrow.enabled", "false").lower() == "true" \ and len(data) > 0: diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 13576ff57001b..80a94a91a87b3 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3532,6 +3532,15 @@ def test_toPandas_with_array_type(self): self.assertTrue(expected[r][e] == result_arrow[r][e] and result[r][e] == result_arrow[r][e]) + def test_createDataFrame_with_int_col_names(self): + import numpy as np + import pandas as pd + pdf = pd.DataFrame(np.random.rand(4, 2)) + df, df_arrow = self._createDataFrame_toggle(pdf) + pdf_col_names = [str(c) for c in pdf.columns] + self.assertEqual(pdf_col_names, df.columns) + self.assertEqual(pdf_col_names, df_arrow.columns) + @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed") class PandasUDFTests(ReusedSQLTestCase): From edf0a48c2ec696b92ed6a96dcee6eeb1a046b20b Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 10 Jan 2018 15:01:11 +0800 Subject: [PATCH 302/356] [SPARK-22982] Remove unsafe asynchronous close() call from FileDownloadChannel ## What changes were proposed in this pull request? This patch fixes a severe asynchronous IO bug in Spark's Netty-based file transfer code. At a high-level, the problem is that an unsafe asynchronous `close()` of a pipe's source channel creates a race condition where file transfer code closes a file descriptor then attempts to read from it. If the closed file descriptor's number has been reused by an `open()` call then this invalid read may cause unrelated file operations to return incorrect results. **One manifestation of this problem is incorrect query results.** For a high-level overview of how file download works, take a look at the control flow in `NettyRpcEnv.openChannel()`: this code creates a pipe to buffer results, then submits an asynchronous stream request to a lower-level TransportClient. The callback passes received data to the sink end of the pipe. The source end of the pipe is passed back to the caller of `openChannel()`. Thus `openChannel()` returns immediately and callers interact with the returned pipe source channel. Because the underlying stream request is asynchronous, errors may occur after `openChannel()` has returned and after that method's caller has started to `read()` from the returned channel. For example, if a client requests an invalid stream from a remote server then the "stream does not exist" error may not be received from the remote server until after `openChannel()` has returned. 
In order to be able to propagate the "stream does not exist" error to the file-fetching application thread, this code wraps the pipe's source channel in a special `FileDownloadChannel` which adds a `setError(t: Throwable)` method, then calls this `setError()` method in the FileDownloadCallback's `onFailure` method.

It is possible for `FileDownloadChannel`'s `read()` and `setError()` methods to be called concurrently from different threads: the `setError()` method is called from within the Netty RPC system's stream callback handlers, while the `read()` methods are called from higher-level application code performing remote stream reads.

The problem lies in `setError()`: the existing code closed the wrapped pipe source channel. Because `read()` and `setError()` occur in different threads, this means it is possible for one thread to be calling `source.read()` while another asynchronously calls `source.close()`. Java's IO libraries do not guarantee that this will be safe and, in fact, it's possible for these operations to interleave in such a way that a lower-level `read()` system call occurs right after a `close()` call. In the best case, this fails as a read of a closed file descriptor; in the worst case, the file descriptor number has been re-used by an intervening `open()` operation and the read corrupts the result of an unrelated file IO operation being performed by a different thread.

The solution here is to remove the `stream.close()` call in `onError()`: the thread that is performing the `read()` calls is responsible for closing the stream in a `finally` block, so there's no need to close it here. If that thread is blocked in a `read()` then it will become unblocked when the sink end of the pipe is closed in `FileDownloadCallback.onFailure()`.

After making this change, we also need to refine the `read()` method to always check for a `setError()` result, even if the underlying channel `read()` call has succeeded.

This patch also makes a slight cleanup to a dodgy-looking `catch e: Exception` block to use a safer `try-finally` error handling idiom.

This bug was introduced in SPARK-11956 / #9941 and is present in Spark 1.6.0+.

## How was this patch tested?

This fix was tested manually against a workload which non-deterministically hit this bug.

Author: Josh Rosen

Closes #20179 from JoshRosen/SPARK-22982-fix-unsafe-async-io-in-file-download-channel.
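To make the threading contract concrete, here is a minimal, hypothetical sketch of the pattern (simplified names, not the actual `NettyRpcEnv` classes): only the reading thread ever closes the source channel, the callback thread merely records the error, and `read()` re-checks that flag even when the underlying read succeeds.

```scala
import java.nio.ByteBuffer
import java.nio.channels.{Pipe, ReadableByteChannel}

class ErrorPropagatingChannel(source: Pipe.SourceChannel) extends ReadableByteChannel {
  @volatile private var error: Throwable = _

  // Called from the callback/RPC thread; note that it does NOT touch `source`.
  def setError(t: Throwable): Unit = { error = t }

  override def read(dst: ByteBuffer): Int = {
    val n = try source.read(dst) catch {
      // Prefer the remote error over any exception triggered by the pipe close.
      case readErr: Throwable => if (error != null) throw error else throw readErr
    }
    // Even a successful read re-checks the flag, mirroring the fix above.
    if (error != null) throw error
    n
  }

  override def isOpen: Boolean = source.isOpen
  override def close(): Unit = source.close() // closed only by the reading thread
}
```

In the real code the callback additionally closes the pipe's sink, which is what unblocks a reader stuck in `read()`.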
--- .../apache/spark/rpc/netty/NettyRpcEnv.scala | 37 +++++++++++-------- .../shuffle/IndexShuffleBlockResolver.scala | 21 +++++++++-- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index f951591e02a5c..a2936d6ad539c 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -332,16 +332,14 @@ private[netty] class NettyRpcEnv( val pipe = Pipe.open() val source = new FileDownloadChannel(pipe.source()) - try { + Utils.tryWithSafeFinallyAndFailureCallbacks(block = { val client = downloadClient(parsedUri.getHost(), parsedUri.getPort()) val callback = new FileDownloadCallback(pipe.sink(), source, client) client.stream(parsedUri.getPath(), callback) - } catch { - case e: Exception => - pipe.sink().close() - source.close() - throw e - } + })(catchBlock = { + pipe.sink().close() + source.close() + }) source } @@ -370,24 +368,33 @@ private[netty] class NettyRpcEnv( fileDownloadFactory.createClient(host, port) } - private class FileDownloadChannel(source: ReadableByteChannel) extends ReadableByteChannel { + private class FileDownloadChannel(source: Pipe.SourceChannel) extends ReadableByteChannel { @volatile private var error: Throwable = _ def setError(e: Throwable): Unit = { + // This setError callback is invoked by internal RPC threads in order to propagate remote + // exceptions to application-level threads which are reading from this channel. When an + // RPC error occurs, the RPC system will call setError() and then will close the + // Pipe.SinkChannel corresponding to the other end of the `source` pipe. Closing of the pipe + // sink will cause `source.read()` operations to return EOF, unblocking the application-level + // reading thread. Thus there is no need to actually call `source.close()` here in the + // onError() callback and, in fact, calling it here would be dangerous because the close() + // would be asynchronous with respect to the read() call and could trigger race-conditions + // that lead to data corruption. See the PR for SPARK-22982 for more details on this topic. error = e - source.close() } override def read(dst: ByteBuffer): Int = { Try(source.read(dst)) match { + // See the documentation above in setError(): if an RPC error has occurred then setError() + // will be called to propagate the RPC error and then `source`'s corresponding + // Pipe.SinkChannel will be closed, unblocking this read. 
In that case, we want to propagate + // the remote RPC exception (and not any exceptions triggered by the pipe close, such as + // ChannelClosedException), hence this `error != null` check: + case _ if error != null => throw error case Success(bytesRead) => bytesRead - case Failure(readErr) => - if (error != null) { - throw error - } else { - throw readErr - } + case Failure(readErr) => throw readErr } } diff --git a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala index 15540485170d0..266ee42e39cca 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala @@ -18,8 +18,8 @@ package org.apache.spark.shuffle import java.io._ - -import com.google.common.io.ByteStreams +import java.nio.channels.Channels +import java.nio.file.Files import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.internal.Logging @@ -196,11 +196,24 @@ private[spark] class IndexShuffleBlockResolver( // find out the consolidated file, then the offset within that from our index val indexFile = getIndexFile(blockId.shuffleId, blockId.mapId) - val in = new DataInputStream(new FileInputStream(indexFile)) + // SPARK-22982: if this FileInputStream's position is seeked forward by another piece of code + // which is incorrectly using our file descriptor then this code will fetch the wrong offsets + // (which may cause a reducer to be sent a different reducer's data). The explicit position + // checks added here were a useful debugging aid during SPARK-22982 and may help prevent this + // class of issue from re-occurring in the future which is why they are left here even though + // SPARK-22982 is fixed. + val channel = Files.newByteChannel(indexFile.toPath) + channel.position(blockId.reduceId * 8) + val in = new DataInputStream(Channels.newInputStream(channel)) try { - ByteStreams.skipFully(in, blockId.reduceId * 8) val offset = in.readLong() val nextOffset = in.readLong() + val actualPosition = channel.position() + val expectedPosition = blockId.reduceId * 8 + 16 + if (actualPosition != expectedPosition) { + throw new Exception(s"SPARK-22982: Incorrect channel position after index file reads: " + + s"expected $expectedPosition but actual position was $actualPosition.") + } new FileSegmentManagedBuffer( transportConf, getDataFile(blockId.shuffleId, blockId.mapId), From eaac60a1e20e29084b7151ffca964cfaa5ba99d1 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 10 Jan 2018 15:16:27 +0800 Subject: [PATCH 303/356] [SPARK-16060][SQL][FOLLOW-UP] add a wrapper solution for vectorized orc reader ## What changes were proposed in this pull request? This is mostly from https://github.com/apache/spark/pull/13775 The wrapper solution is pretty good for string/binary type, as the ORC column vector doesn't keep bytes in a continuous memory region, and has a significant overhead when copying the data to Spark columnar batch. For other cases, the wrapper solution is almost same with the current solution. I think we can treat the wrapper solution as a baseline and keep improving the writing to Spark solution. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #20205 from cloud-fan/orc. 
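A short, hypothetical usage sketch of the internal conf introduced by this patch (`spark.sql.orc.copyBatchToSpark`); the file path and column name are invented, and it assumes the native, vectorized ORC reader is in use:

```scala
// Run in spark-shell, where `spark` is the active SparkSession.
// The ORC path and column name below are placeholders.
spark.read.orc("/tmp/strings.orc").createOrReplaceTempView("t")

// Default after this patch: ORC column vectors are wrapped rather than copied,
// which mainly helps string/binary columns.
spark.sql("SELECT sum(length(c1)) FROM t").collect()

// Baseline for comparison: force a copy into Spark's own column vectors.
spark.conf.set("spark.sql.orc.copyBatchToSpark", "true")
spark.sql("SELECT sum(length(c1)) FROM t").collect()
```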
--- .../apache/spark/sql/internal/SQLConf.scala | 7 + .../datasources/orc/OrcColumnVector.java | 251 ++++++++++++++++++ .../orc/OrcColumnarBatchReader.java | 106 ++++++-- .../datasources/orc/OrcFileFormat.scala | 6 +- .../spark/sql/hive/orc/OrcReadBenchmark.scala | 236 ++++++++++------ 5 files changed, 490 insertions(+), 116 deletions(-) create mode 100644 sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 74949db883f7a..36e802a9faa6f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -391,6 +391,13 @@ object SQLConf { .booleanConf .createWithDefault(true) + val ORC_COPY_BATCH_TO_SPARK = buildConf("spark.sql.orc.copyBatchToSpark") + .doc("Whether or not to copy the ORC columnar batch to Spark columnar batch in the " + + "vectorized ORC reader.") + .internal() + .booleanConf + .createWithDefault(false) + val ORC_FILTER_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.filterPushdown") .doc("When true, enable filter pushdown for ORC files.") .booleanConf diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java new file mode 100644 index 0000000000000..f94c55d860304 --- /dev/null +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.orc; + +import java.math.BigDecimal; + +import org.apache.orc.storage.ql.exec.vector.*; + +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.Decimal; +import org.apache.spark.sql.types.TimestampType; +import org.apache.spark.unsafe.types.UTF8String; + +/** + * A column vector class wrapping Hive's ColumnVector. Because Spark ColumnarBatch only accepts + * Spark's vectorized.ColumnVector, this column vector is used to adapt Hive ColumnVector with + * Spark ColumnarVector. 
+ */ +public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVector { + private ColumnVector baseData; + private LongColumnVector longData; + private DoubleColumnVector doubleData; + private BytesColumnVector bytesData; + private DecimalColumnVector decimalData; + private TimestampColumnVector timestampData; + final private boolean isTimestamp; + + private int batchSize; + + OrcColumnVector(DataType type, ColumnVector vector) { + super(type); + + if (type instanceof TimestampType) { + isTimestamp = true; + } else { + isTimestamp = false; + } + + baseData = vector; + if (vector instanceof LongColumnVector) { + longData = (LongColumnVector) vector; + } else if (vector instanceof DoubleColumnVector) { + doubleData = (DoubleColumnVector) vector; + } else if (vector instanceof BytesColumnVector) { + bytesData = (BytesColumnVector) vector; + } else if (vector instanceof DecimalColumnVector) { + decimalData = (DecimalColumnVector) vector; + } else if (vector instanceof TimestampColumnVector) { + timestampData = (TimestampColumnVector) vector; + } else { + throw new UnsupportedOperationException(); + } + } + + public void setBatchSize(int batchSize) { + this.batchSize = batchSize; + } + + @Override + public void close() { + + } + + @Override + public int numNulls() { + if (baseData.isRepeating) { + if (baseData.isNull[0]) { + return batchSize; + } else { + return 0; + } + } else if (baseData.noNulls) { + return 0; + } else { + int count = 0; + for (int i = 0; i < batchSize; i++) { + if (baseData.isNull[i]) count++; + } + return count; + } + } + + /* A helper method to get the row index in a column. */ + private int getRowIndex(int rowId) { + return baseData.isRepeating ? 0 : rowId; + } + + @Override + public boolean isNullAt(int rowId) { + return baseData.isNull[getRowIndex(rowId)]; + } + + @Override + public boolean getBoolean(int rowId) { + return longData.vector[getRowIndex(rowId)] == 1; + } + + @Override + public boolean[] getBooleans(int rowId, int count) { + boolean[] res = new boolean[count]; + for (int i = 0; i < count; i++) { + res[i] = getBoolean(rowId + i); + } + return res; + } + + @Override + public byte getByte(int rowId) { + return (byte) longData.vector[getRowIndex(rowId)]; + } + + @Override + public byte[] getBytes(int rowId, int count) { + byte[] res = new byte[count]; + for (int i = 0; i < count; i++) { + res[i] = getByte(rowId + i); + } + return res; + } + + @Override + public short getShort(int rowId) { + return (short) longData.vector[getRowIndex(rowId)]; + } + + @Override + public short[] getShorts(int rowId, int count) { + short[] res = new short[count]; + for (int i = 0; i < count; i++) { + res[i] = getShort(rowId + i); + } + return res; + } + + @Override + public int getInt(int rowId) { + return (int) longData.vector[getRowIndex(rowId)]; + } + + @Override + public int[] getInts(int rowId, int count) { + int[] res = new int[count]; + for (int i = 0; i < count; i++) { + res[i] = getInt(rowId + i); + } + return res; + } + + @Override + public long getLong(int rowId) { + int index = getRowIndex(rowId); + if (isTimestamp) { + return timestampData.time[index] * 1000 + timestampData.nanos[index] / 1000; + } else { + return longData.vector[index]; + } + } + + @Override + public long[] getLongs(int rowId, int count) { + long[] res = new long[count]; + for (int i = 0; i < count; i++) { + res[i] = getLong(rowId + i); + } + return res; + } + + @Override + public float getFloat(int rowId) { + return (float) doubleData.vector[getRowIndex(rowId)]; + } + + @Override + 
public float[] getFloats(int rowId, int count) { + float[] res = new float[count]; + for (int i = 0; i < count; i++) { + res[i] = getFloat(rowId + i); + } + return res; + } + + @Override + public double getDouble(int rowId) { + return doubleData.vector[getRowIndex(rowId)]; + } + + @Override + public double[] getDoubles(int rowId, int count) { + double[] res = new double[count]; + for (int i = 0; i < count; i++) { + res[i] = getDouble(rowId + i); + } + return res; + } + + @Override + public int getArrayLength(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public int getArrayOffset(int rowId) { + throw new UnsupportedOperationException(); + } + + @Override + public Decimal getDecimal(int rowId, int precision, int scale) { + BigDecimal data = decimalData.vector[getRowIndex(rowId)].getHiveDecimal().bigDecimalValue(); + return Decimal.apply(data, precision, scale); + } + + @Override + public UTF8String getUTF8String(int rowId) { + int index = getRowIndex(rowId); + BytesColumnVector col = bytesData; + return UTF8String.fromBytes(col.vector[index], col.start[index], col.length[index]); + } + + @Override + public byte[] getBinary(int rowId) { + int index = getRowIndex(rowId); + byte[] binary = new byte[bytesData.length[index]]; + System.arraycopy(bytesData.vector[index], bytesData.start[index], binary, 0, binary.length); + return binary; + } + + @Override + public org.apache.spark.sql.vectorized.ColumnVector arrayData() { + throw new UnsupportedOperationException(); + } + + @Override + public org.apache.spark.sql.vectorized.ColumnVector getChildColumn(int ordinal) { + throw new UnsupportedOperationException(); + } +} diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java index 5c28d0e6e507a..36fdf2bdf84d2 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnarBatchReader.java @@ -51,13 +51,13 @@ public class OrcColumnarBatchReader extends RecordReader { /** - * The default size of batch. We use this value for both ORC and Spark consistently - * because they have different default values like the following. + * The default size of batch. We use this value for ORC reader to make it consistent with Spark's + * columnar batch, because their default batch sizes are different like the following: * * - ORC's VectorizedRowBatch.DEFAULT_SIZE = 1024 * - Spark's ColumnarBatch.DEFAULT_BATCH_SIZE = 4 * 1024 */ - public static final int DEFAULT_SIZE = 4 * 1024; + private static final int DEFAULT_SIZE = 4 * 1024; // ORC File Reader private Reader reader; @@ -82,13 +82,18 @@ public class OrcColumnarBatchReader extends RecordReader { // Writable column vectors of the result columnar batch. private WritableColumnVector[] columnVectors; - /** - * The memory mode of the columnarBatch - */ + // The wrapped ORC column vectors. It should be null if `copyToSpark` is true. + private org.apache.spark.sql.vectorized.ColumnVector[] orcVectorWrappers; + + // The memory mode of the columnarBatch private final MemoryMode MEMORY_MODE; - public OrcColumnarBatchReader(boolean useOffHeap) { + // Whether or not to copy the ORC columnar batch to Spark columnar batch. + private final boolean copyToSpark; + + public OrcColumnarBatchReader(boolean useOffHeap, boolean copyToSpark) { MEMORY_MODE = useOffHeap ? 
MemoryMode.OFF_HEAP : MemoryMode.ON_HEAP; + this.copyToSpark = copyToSpark; } @@ -167,27 +172,61 @@ public void initBatch( } int capacity = DEFAULT_SIZE; - if (MEMORY_MODE == MemoryMode.OFF_HEAP) { - columnVectors = OffHeapColumnVector.allocateColumns(capacity, resultSchema); - } else { - columnVectors = OnHeapColumnVector.allocateColumns(capacity, resultSchema); - } - columnarBatch = new ColumnarBatch(resultSchema, columnVectors, capacity); - if (partitionValues.numFields() > 0) { - int partitionIdx = requiredFields.length; - for (int i = 0; i < partitionValues.numFields(); i++) { - ColumnVectorUtils.populate(columnVectors[i + partitionIdx], partitionValues, i); - columnVectors[i + partitionIdx].setIsConstant(); + if (copyToSpark) { + if (MEMORY_MODE == MemoryMode.OFF_HEAP) { + columnVectors = OffHeapColumnVector.allocateColumns(capacity, resultSchema); + } else { + columnVectors = OnHeapColumnVector.allocateColumns(capacity, resultSchema); } - } - // Initialize the missing columns once. - for (int i = 0; i < requiredFields.length; i++) { - if (requestedColIds[i] == -1) { - columnVectors[i].putNulls(0, columnarBatch.capacity()); - columnVectors[i].setIsConstant(); + // Initialize the missing columns once. + for (int i = 0; i < requiredFields.length; i++) { + if (requestedColIds[i] == -1) { + columnVectors[i].putNulls(0, capacity); + columnVectors[i].setIsConstant(); + } + } + + if (partitionValues.numFields() > 0) { + int partitionIdx = requiredFields.length; + for (int i = 0; i < partitionValues.numFields(); i++) { + ColumnVectorUtils.populate(columnVectors[i + partitionIdx], partitionValues, i); + columnVectors[i + partitionIdx].setIsConstant(); + } + } + + columnarBatch = new ColumnarBatch(resultSchema, columnVectors, capacity); + } else { + // Just wrap the ORC column vector instead of copying it to Spark column vector. + orcVectorWrappers = new org.apache.spark.sql.vectorized.ColumnVector[resultSchema.length()]; + + for (int i = 0; i < requiredFields.length; i++) { + DataType dt = requiredFields[i].dataType(); + int colId = requestedColIds[i]; + // Initialize the missing columns once. + if (colId == -1) { + OnHeapColumnVector missingCol = new OnHeapColumnVector(capacity, dt); + missingCol.putNulls(0, capacity); + missingCol.setIsConstant(); + orcVectorWrappers[i] = missingCol; + } else { + orcVectorWrappers[i] = new OrcColumnVector(dt, batch.cols[colId]); + } } + + if (partitionValues.numFields() > 0) { + int partitionIdx = requiredFields.length; + for (int i = 0; i < partitionValues.numFields(); i++) { + DataType dt = partitionSchema.fields()[i].dataType(); + OnHeapColumnVector partitionCol = new OnHeapColumnVector(capacity, dt); + ColumnVectorUtils.populate(partitionCol, partitionValues, i); + partitionCol.setIsConstant(); + orcVectorWrappers[partitionIdx + i] = partitionCol; + } + } + + columnarBatch = new ColumnarBatch(resultSchema, orcVectorWrappers, capacity); } } @@ -196,17 +235,26 @@ public void initBatch( * by copying from ORC VectorizedRowBatch columns to Spark ColumnarBatch columns. 
*/ private boolean nextBatch() throws IOException { - for (WritableColumnVector vector : columnVectors) { - vector.reset(); - } - columnarBatch.setNumRows(0); - recordReader.nextBatch(batch); int batchSize = batch.size; if (batchSize == 0) { return false; } columnarBatch.setNumRows(batchSize); + + if (!copyToSpark) { + for (int i = 0; i < requiredFields.length; i++) { + if (requestedColIds[i] != -1) { + ((OrcColumnVector) orcVectorWrappers[i]).setBatchSize(batchSize); + } + } + return true; + } + + for (WritableColumnVector vector : columnVectors) { + vector.reset(); + } + for (int i = 0; i < requiredFields.length; i++) { StructField field = requiredFields[i]; WritableColumnVector toColumn = columnVectors[i]; diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index b8bacfa1838ae..2dd314d165348 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.util.SerializableConfiguration @@ -150,6 +151,7 @@ class OrcFileFormat val sqlConf = sparkSession.sessionState.conf val enableOffHeapColumnVector = sqlConf.offHeapColumnVectorEnabled val enableVectorizedReader = supportBatch(sparkSession, resultSchema) + val copyToSpark = sparkSession.sessionState.conf.getConf(SQLConf.ORC_COPY_BATCH_TO_SPARK) val broadcastedConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) @@ -183,8 +185,8 @@ class OrcFileFormat val taskContext = Option(TaskContext.get()) if (enableVectorizedReader) { - val batchReader = - new OrcColumnarBatchReader(enableOffHeapColumnVector && taskContext.isDefined) + val batchReader = new OrcColumnarBatchReader( + enableOffHeapColumnVector && taskContext.isDefined, copyToSpark) batchReader.initialize(fileSplit, taskAttemptContext) batchReader.initBatch( reader.getSchema, diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala index 37ed846acd1eb..bf6efa7c4c08c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala @@ -86,7 +86,7 @@ object OrcReadBenchmark { } def numericScanBenchmark(values: Int, dataType: DataType): Unit = { - val sqlBenchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values) + val benchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -95,61 +95,73 @@ object OrcReadBenchmark { prepareTable(dir, spark.sql(s"SELECT CAST(value as ${dataType.sql}) id FROM t1")) - sqlBenchmark.addCase("Native ORC MR") { _ => + benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() } } - sqlBenchmark.addCase("Native ORC Vectorized") 
{ _ => + benchmark.addCase("Native ORC Vectorized") { _ => spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() } - sqlBenchmark.addCase("Hive built-in ORC") { _ => + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1192 / 1221 13.2 75.8 1.0X - Native ORC Vectorized 161 / 170 97.5 10.3 7.4X - Hive built-in ORC 1399 / 1413 11.2 89.0 0.9X + Native ORC MR 1135 / 1171 13.9 72.2 1.0X + Native ORC Vectorized 152 / 163 103.4 9.7 7.5X + Native ORC Vectorized with copy 149 / 162 105.4 9.5 7.6X + Hive built-in ORC 1380 / 1384 11.4 87.7 0.8X SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1287 / 1333 12.2 81.8 1.0X - Native ORC Vectorized 164 / 172 95.6 10.5 7.8X - Hive built-in ORC 1629 / 1650 9.7 103.6 0.8X + Native ORC MR 1182 / 1244 13.3 75.2 1.0X + Native ORC Vectorized 145 / 156 108.7 9.2 8.2X + Native ORC Vectorized with copy 148 / 158 106.4 9.4 8.0X + Hive built-in ORC 1591 / 1636 9.9 101.2 0.7X SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1304 / 1388 12.1 82.9 1.0X - Native ORC Vectorized 227 / 240 69.3 14.4 5.7X - Hive built-in ORC 1866 / 1867 8.4 118.6 0.7X + Native ORC MR 1271 / 1271 12.4 80.8 1.0X + Native ORC Vectorized 206 / 212 76.3 13.1 6.2X + Native ORC Vectorized with copy 200 / 213 78.8 12.7 6.4X + Hive built-in ORC 1776 / 1787 8.9 112.9 0.7X SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1331 / 1357 11.8 84.6 1.0X - Native ORC Vectorized 289 / 297 54.4 18.4 4.6X - Hive built-in ORC 1922 / 1929 8.2 122.2 0.7X + Native ORC MR 1344 / 1355 11.7 85.4 1.0X + Native ORC Vectorized 258 / 268 61.0 16.4 5.2X + Native ORC Vectorized with copy 252 / 257 62.4 16.0 5.3X + Hive built-in ORC 1818 / 1823 8.7 115.6 0.7X SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1410 / 1428 11.2 89.7 1.0X - Native ORC Vectorized 328 / 335 48.0 20.8 4.3X - Hive built-in ORC 1929 / 2012 8.2 122.6 0.7X + Native ORC MR 1333 / 1352 11.8 84.8 1.0X + Native ORC Vectorized 310 / 324 50.7 19.7 4.3X + Native ORC Vectorized with copy 312 / 320 50.4 19.9 4.3X + Hive built-in ORC 1904 / 1918 8.3 121.0 0.7X SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1467 / 1485 10.7 93.3 1.0X - Native ORC Vectorized 402 / 411 39.1 25.6 3.6X - Hive built-in 
ORC 2023 / 2042 7.8 128.6 0.7X + Native ORC MR 1408 / 1585 11.2 89.5 1.0X + Native ORC Vectorized 359 / 368 43.8 22.8 3.9X + Native ORC Vectorized with copy 364 / 371 43.2 23.2 3.9X + Hive built-in ORC 1881 / 1954 8.4 119.6 0.7X */ - sqlBenchmark.run() + benchmark.run() } } } @@ -176,19 +188,26 @@ object OrcReadBenchmark { spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() } + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(c1), sum(length(c2)) FROM nativeOrcTable").collect() + } + } + benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 2729 / 2744 3.8 260.2 1.0X - Native ORC Vectorized 1318 / 1344 8.0 125.7 2.1X - Hive built-in ORC 3731 / 3782 2.8 355.8 0.7X + Native ORC MR 2566 / 2592 4.1 244.7 1.0X + Native ORC Vectorized 1098 / 1113 9.6 104.7 2.3X + Native ORC Vectorized with copy 1527 / 1593 6.9 145.6 1.7X + Hive built-in ORC 3561 / 3705 2.9 339.6 0.7X */ benchmark.run() } @@ -205,63 +224,84 @@ object OrcReadBenchmark { prepareTable(dir, spark.sql("SELECT value % 2 AS p, value AS id FROM t1"), Some("p")) - benchmark.addCase("Read data column - Native ORC MR") { _ => + benchmark.addCase("Data column - Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() } } - benchmark.addCase("Read data column - Native ORC Vectorized") { _ => + benchmark.addCase("Data column - Native ORC Vectorized") { _ => spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() } - benchmark.addCase("Read data column - Hive built-in ORC") { _ => + benchmark.addCase("Data column - Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Data column - Hive built-in ORC") { _ => spark.sql("SELECT sum(id) FROM hiveOrcTable").collect() } - benchmark.addCase("Read partition column - Native ORC MR") { _ => + benchmark.addCase("Partition column - Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() } } - benchmark.addCase("Read partition column - Native ORC Vectorized") { _ => + benchmark.addCase("Partition column - Native ORC Vectorized") { _ => spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() } - benchmark.addCase("Read partition column - Hive built-in ORC") { _ => + benchmark.addCase("Partition column - Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(p) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Partition column - Hive built-in ORC") { _ => spark.sql("SELECT sum(p) FROM hiveOrcTable").collect() } - benchmark.addCase("Read both columns - Native ORC MR") { _ => + benchmark.addCase("Both columns - Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { 
spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() } } - benchmark.addCase("Read both columns - Native ORC Vectorized") { _ => + benchmark.addCase("Both columns - Native ORC Vectorized") { _ => spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() } - benchmark.addCase("Read both columns - Hive built-in ORC") { _ => + benchmark.addCase("Both column - Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(p), sum(id) FROM nativeOrcTable").collect() + } + } + + benchmark.addCase("Both columns - Hive built-in ORC") { _ => spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Read data column - Native ORC MR 1531 / 1536 10.3 97.4 1.0X - Read data column - Native ORC Vectorized 295 / 298 53.3 18.8 5.2X - Read data column - Hive built-in ORC 2125 / 2126 7.4 135.1 0.7X - Read partition column - Native ORC MR 1049 / 1062 15.0 66.7 1.5X - Read partition column - Native ORC Vectorized 54 / 57 290.1 3.4 28.2X - Read partition column - Hive built-in ORC 1282 / 1291 12.3 81.5 1.2X - Read both columns - Native ORC MR 1594 / 1598 9.9 101.3 1.0X - Read both columns - Native ORC Vectorized 332 / 336 47.4 21.1 4.6X - Read both columns - Hive built-in ORC 2145 / 2187 7.3 136.4 0.7X + Data only - Native ORC MR 1447 / 1457 10.9 92.0 1.0X + Data only - Native ORC Vectorized 256 / 266 61.4 16.3 5.6X + Data only - Native ORC Vectorized with copy 263 / 273 59.8 16.7 5.5X + Data only - Hive built-in ORC 1960 / 1988 8.0 124.6 0.7X + Partition only - Native ORC MR 1039 / 1043 15.1 66.0 1.4X + Partition only - Native ORC Vectorized 48 / 53 326.6 3.1 30.1X + Partition only - Native ORC Vectorized with copy 48 / 53 328.4 3.0 30.2X + Partition only - Hive built-in ORC 1234 / 1242 12.7 78.4 1.2X + Both columns - Native ORC MR 1465 / 1475 10.7 93.1 1.0X + Both columns - Native ORC Vectorized 292 / 301 53.9 18.6 5.0X + Both column - Native ORC Vectorized with copy 348 / 354 45.1 22.2 4.2X + Both columns - Hive built-in ORC 2051 / 2060 7.7 130.4 0.7X */ benchmark.run() } @@ -287,19 +327,26 @@ object OrcReadBenchmark { spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() } + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT sum(length(c1)) FROM nativeOrcTable").collect() + } + } + benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1325 / 1328 7.9 126.4 1.0X - Native ORC Vectorized 320 / 330 32.8 30.5 4.1X - Hive built-in ORC 1971 / 1972 5.3 188.0 0.7X + Native ORC MR 1271 / 1278 8.3 121.2 1.0X + Native ORC Vectorized 200 / 212 52.4 19.1 6.4X + Native ORC 
Vectorized with copy 342 / 347 30.7 32.6 3.7X + Hive built-in ORC 1874 / 2105 5.6 178.7 0.7X */ benchmark.run() } @@ -331,32 +378,42 @@ object OrcReadBenchmark { "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() } + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql("SELECT SUM(LENGTH(c2)) FROM nativeOrcTable " + + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() + } + } + benchmark.addCase("Hive built-in ORC") { _ => spark.sql("SELECT SUM(LENGTH(c2)) FROM hiveOrcTable " + "WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 2553 / 2554 4.1 243.4 1.0X - Native ORC Vectorized 953 / 954 11.0 90.9 2.7X - Hive built-in ORC 3875 / 3898 2.7 369.6 0.7X + Native ORC MR 2394 / 2886 4.4 228.3 1.0X + Native ORC Vectorized 699 / 729 15.0 66.7 3.4X + Native ORC Vectorized with copy 959 / 1025 10.9 91.5 2.5X + Hive built-in ORC 3899 / 3901 2.7 371.9 0.6X String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 2389 / 2408 4.4 227.8 1.0X - Native ORC Vectorized 1208 / 1209 8.7 115.2 2.0X - Hive built-in ORC 2940 / 2952 3.6 280.4 0.8X + Native ORC MR 2234 / 2255 4.7 213.1 1.0X + Native ORC Vectorized 854 / 869 12.3 81.4 2.6X + Native ORC Vectorized with copy 1099 / 1128 9.5 104.8 2.0X + Hive built-in ORC 2767 / 2793 3.8 263.9 0.8X String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1295 / 1311 8.1 123.5 1.0X - Native ORC Vectorized 449 / 457 23.4 42.8 2.9X - Hive built-in ORC 1649 / 1660 6.4 157.3 0.8X + Native ORC MR 1166 / 1202 9.0 111.2 1.0X + Native ORC Vectorized 338 / 345 31.1 32.2 3.5X + Native ORC Vectorized with copy 418 / 428 25.1 39.9 2.8X + Hive built-in ORC 1730 / 1761 6.1 164.9 0.7X */ benchmark.run() } @@ -364,7 +421,7 @@ object OrcReadBenchmark { } def columnsBenchmark(values: Int, width: Int): Unit = { - val sqlBenchmark = new Benchmark(s"SQL Single Column Scan from $width columns", values) + val benchmark = new Benchmark(s"Single Column Scan from $width columns", values) withTempPath { dir => withTempTable("t1", "nativeOrcTable", "hiveOrcTable") { @@ -376,43 +433,52 @@ object OrcReadBenchmark { prepareTable(dir, spark.sql("SELECT * FROM t1")) - sqlBenchmark.addCase("Native ORC MR") { _ => + benchmark.addCase("Native ORC MR") { _ => withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") { spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() } } - sqlBenchmark.addCase("Native ORC Vectorized") { _ => + benchmark.addCase("Native ORC Vectorized") { _ => spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() } - sqlBenchmark.addCase("Hive built-in ORC") { _ => + benchmark.addCase("Native ORC Vectorized with copy") { _ => + withSQLConf(SQLConf.ORC_COPY_BATCH_TO_SPARK.key -> "true") { + spark.sql(s"SELECT sum(c$middle) FROM nativeOrcTable").collect() + } + } + + 
benchmark.addCase("Hive built-in ORC") { _ => spark.sql(s"SELECT sum(c$middle) FROM hiveOrcTable").collect() } /* - Java HotSpot(TM) 64-Bit Server VM 1.8.0_152-b16 on Mac OS X 10.13.2 - Intel(R) Core(TM) i7-4770HQ CPU @ 2.20GHz + Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1 + Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz - SQL Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 1103 / 1124 1.0 1052.0 1.0X - Native ORC Vectorized 92 / 100 11.4 87.9 12.0X - Hive built-in ORC 383 / 390 2.7 365.4 2.9X + Native ORC MR 1050 / 1053 1.0 1001.1 1.0X + Native ORC Vectorized 95 / 101 11.0 90.9 11.0X + Native ORC Vectorized with copy 95 / 102 11.0 90.9 11.0X + Hive built-in ORC 348 / 358 3.0 331.8 3.0X - SQL Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 2245 / 2250 0.5 2141.0 1.0X - Native ORC Vectorized 157 / 165 6.7 150.2 14.3X - Hive built-in ORC 587 / 593 1.8 559.4 3.8X + Native ORC MR 2099 / 2108 0.5 2002.1 1.0X + Native ORC Vectorized 179 / 187 5.8 171.1 11.7X + Native ORC Vectorized with copy 176 / 188 6.0 167.6 11.9X + Hive built-in ORC 562 / 581 1.9 535.9 3.7X - SQL Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative + Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------ - Native ORC MR 3343 / 3350 0.3 3188.3 1.0X - Native ORC Vectorized 265 / 280 3.9 253.2 12.6X - Hive built-in ORC 828 / 842 1.3 789.8 4.0X + Native ORC MR 3221 / 3246 0.3 3071.4 1.0X + Native ORC Vectorized 312 / 322 3.4 298.0 10.3X + Native ORC Vectorized with copy 306 / 320 3.4 291.6 10.5X + Hive built-in ORC 815 / 824 1.3 777.3 4.0X */ - sqlBenchmark.run() + benchmark.run() } } } From 70bcc9d5ae33d6669bb5c97db29087ccead770fb Mon Sep 17 00:00:00 2001 From: sethah Date: Tue, 9 Jan 2018 23:32:47 -0800 Subject: [PATCH 304/356] [SPARK-22993][ML] Clarify HasCheckpointInterval param doc ## What changes were proposed in this pull request? Add a note to the `HasCheckpointInterval` parameter doc that clarifies that this setting is ignored when no checkpoint directory has been set on the spark context. ## How was this patch tested? No tests necessary, just a doc update. Author: sethah Closes #20188 from sethah/als_checkpoint_doc. --- R/pkg/R/mllib_recommendation.R | 2 ++ R/pkg/R/mllib_tree.R | 6 ++++++ .../apache/spark/ml/param/shared/SharedParamsCodeGen.scala | 4 +++- .../org/apache/spark/ml/param/shared/sharedParams.scala | 4 ++-- python/pyspark/ml/param/_shared_params_code_gen.py | 5 +++-- python/pyspark/ml/param/shared.py | 4 ++-- 6 files changed, 18 insertions(+), 7 deletions(-) diff --git a/R/pkg/R/mllib_recommendation.R b/R/pkg/R/mllib_recommendation.R index fa794249085d7..5441c4a4022a9 100644 --- a/R/pkg/R/mllib_recommendation.R +++ b/R/pkg/R/mllib_recommendation.R @@ -48,6 +48,8 @@ setClass("ALSModel", representation(jobj = "jobj")) #' @param numUserBlocks number of user blocks used to parallelize computation (> 0). 
#' @param numItemBlocks number of item blocks used to parallelize computation (> 0). #' @param checkpointInterval number of checkpoint intervals (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param ... additional argument(s) passed to the method. #' @return \code{spark.als} returns a fitted ALS model. #' @rdname spark.als diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index 89a58bf0aadae..4e5ddf22ee16d 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -161,6 +161,8 @@ print.summary.decisionTree <- function(x) { #' >= 1. #' @param minInfoGain Minimum information gain for a split to be considered at a tree node. #' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. #' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching @@ -382,6 +384,8 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' @param minInstancesPerNode Minimum number of instances each child must have after split. #' @param minInfoGain Minimum information gain for a split to be considered at a tree node. #' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. #' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching @@ -595,6 +599,8 @@ setMethod("write.ml", signature(object = "RandomForestClassificationModel", path #' @param minInstancesPerNode Minimum number of instances each child must have after split. #' @param minInfoGain Minimum information gain for a split to be considered at a tree node. #' @param checkpointInterval Param for set checkpoint interval (>= 1) or disable checkpoint (-1). +#' Note: this setting will be ignored if the checkpoint directory is not +#' set. #' @param maxMemoryInMB Maximum memory in MB allocated to histogram aggregation. #' @param cacheNodeIds If FALSE, the algorithm will pass trees to executors to match instances with #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index a5d57a15317e6..6ad44af9ef7eb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -63,7 +63,9 @@ private[shared] object SharedParamsCodeGen { ParamDesc[Array[String]]("outputCols", "output column names"), ParamDesc[Int]("checkpointInterval", "set checkpoint interval (>= 1) or " + "disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed " + - "every 10 iterations", isValid = "(interval: Int) => interval == -1 || interval >= 1"), + "every 10 iterations. 
Note: this setting will be ignored if the checkpoint directory " + + "is not set in the SparkContext", + isValid = "(interval: Int) => interval == -1 || interval >= 1"), ParamDesc[Boolean]("fitIntercept", "whether to fit an intercept term", Some("true")), ParamDesc[String]("handleInvalid", "how to handle invalid entries. Options are skip (which " + "will filter out rows with bad values), or error (which will throw an error). More " + diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index 13425dacc9f18..be8b2f273164b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -282,10 +282,10 @@ trait HasOutputCols extends Params { trait HasCheckpointInterval extends Params { /** - * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. + * Param for set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. * @group param */ - final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations", (interval: Int) => interval == -1 || interval >= 1) + final val checkpointInterval: IntParam = new IntParam(this, "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext", (interval: Int) => interval == -1 || interval >= 1) /** @group getParam */ final def getCheckpointInterval: Int = $(checkpointInterval) diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index d55d209d09398..1d0f60acc6983 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -121,8 +121,9 @@ def get$Name(self): ("outputCol", "output column name.", "self.uid + '__output'", "TypeConverters.toString"), ("numFeatures", "number of features.", None, "TypeConverters.toInt"), ("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " + - "E.g. 10 means that the cache will get checkpointed every 10 iterations.", None, - "TypeConverters.toInt"), + "E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: " + + "this setting will be ignored if the checkpoint directory is not set in the SparkContext.", + None, "TypeConverters.toInt"), ("seed", "random seed.", "hash(type(self).__name__)", "TypeConverters.toInt"), ("tol", "the convergence tolerance for iterative algorithms (>= 0).", None, "TypeConverters.toFloat"), diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index e5c5ddfba6c1f..813f7a59f3fd1 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -281,10 +281,10 @@ def getNumFeatures(self): class HasCheckpointInterval(Params): """ - Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. 
+ Mixin for param checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. """ - checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.", typeConverter=TypeConverters.toInt) + checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.", typeConverter=TypeConverters.toInt) def __init__(self): super(HasCheckpointInterval, self).__init__() From f340b6b3066033d40b7e163fd5fb68e9820adfb1 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 10 Jan 2018 00:45:47 -0800 Subject: [PATCH 305/356] [SPARK-22997] Add additional defenses against use of freed MemoryBlocks ## What changes were proposed in this pull request? This patch modifies Spark's `MemoryAllocator` implementations so that `free(MemoryBlock)` mutates the passed block to clear pointers (in the off-heap case) or null out references to backing `long[]` arrays (in the on-heap case). The goal of this change is to add an extra layer of defense against use-after-free bugs because currently it's hard to detect corruption caused by blind writes to freed memory blocks. ## How was this patch tested? New unit tests in `PlatformSuite`, including new tests for existing functionality because we did not have sufficient mutation coverage of the on-heap memory allocator's pooling logic. Author: Josh Rosen Closes #20191 from JoshRosen/SPARK-22997-add-defenses-against-use-after-free-bugs-in-memory-allocator. 
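To make the description above concrete, here is a toy sketch of the defensive pattern (the `ToyBlock`/`ToyAllocator` names are invented, not Spark's `MemoryBlock` or `MemoryAllocator`): `free()` poisons the block and drops its reference to the backing storage, so use-after-free and double-free fail fast instead of silently writing into memory that may have been recycled.

```scala
final class ToyBlock(var backing: Array[Long], var pageNumber: Int = -1)

object ToyAllocator {
  // Stand-in for a "freed in allocator" sentinel page number.
  private val FreedPageNumber = -3

  def allocate(words: Int): ToyBlock = new ToyBlock(new Array[Long](words))

  def free(block: ToyBlock): Unit = {
    assert(block.pageNumber != FreedPageNumber, "double free detected")
    assert(block.backing != null, "block already lost its backing storage")
    block.pageNumber = FreedPageNumber // poison: a second free() now trips the assert
    block.backing = null               // any later read/write fails fast with an NPE
  }
}
```

A stale reference kept past `free()` now throws on its next access, which is far easier to debug than corruption of a page that has been handed out to another consumer.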
--- .../unsafe/memory/HeapMemoryAllocator.java | 35 +++++++++---- .../spark/unsafe/memory/MemoryBlock.java | 21 +++++++- .../unsafe/memory/UnsafeMemoryAllocator.java | 11 ++++ .../spark/unsafe/PlatformUtilSuite.java | 50 ++++++++++++++++++- .../spark/memory/TaskMemoryManager.java | 13 ++++- .../spark/memory/TaskMemoryManagerSuite.java | 29 +++++++++++ 6 files changed, 146 insertions(+), 13 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java index cc9cc429643ad..3acfe3696cb1e 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java @@ -31,8 +31,7 @@ public class HeapMemoryAllocator implements MemoryAllocator { @GuardedBy("this") - private final Map>> bufferPoolsBySize = - new HashMap<>(); + private final Map>> bufferPoolsBySize = new HashMap<>(); private static final int POOLING_THRESHOLD_BYTES = 1024 * 1024; @@ -49,13 +48,14 @@ private boolean shouldPool(long size) { public MemoryBlock allocate(long size) throws OutOfMemoryError { if (shouldPool(size)) { synchronized (this) { - final LinkedList> pool = bufferPoolsBySize.get(size); + final LinkedList> pool = bufferPoolsBySize.get(size); if (pool != null) { while (!pool.isEmpty()) { - final WeakReference blockReference = pool.pop(); - final MemoryBlock memory = blockReference.get(); - if (memory != null) { - assert (memory.size() == size); + final WeakReference arrayReference = pool.pop(); + final long[] array = arrayReference.get(); + if (array != null) { + assert (array.length * 8L >= size); + MemoryBlock memory = new MemoryBlock(array, Platform.LONG_ARRAY_OFFSET, size); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); } @@ -76,18 +76,35 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { @Override public void free(MemoryBlock memory) { + assert (memory.obj != null) : + "baseObject was null; are you trying to use the on-heap allocator to free off-heap memory?"; + assert (memory.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + "page has already been freed"; + assert ((memory.pageNumber == MemoryBlock.NO_PAGE_NUMBER) + || (memory.pageNumber == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : + "TMM-allocated pages must first be freed via TMM.freePage(), not directly in allocator free()"; + final long size = memory.size(); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); } + + // Mark the page as freed (so we can detect double-frees). + memory.pageNumber = MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER; + + // As an additional layer of defense against use-after-free bugs, we mutate the + // MemoryBlock to null out its reference to the long[] array. 
+ long[] array = (long[]) memory.obj; + memory.setObjAndOffset(null, 0); + if (shouldPool(size)) { synchronized (this) { - LinkedList> pool = bufferPoolsBySize.get(size); + LinkedList> pool = bufferPoolsBySize.get(size); if (pool == null) { pool = new LinkedList<>(); bufferPoolsBySize.put(size, pool); } - pool.add(new WeakReference<>(memory)); + pool.add(new WeakReference<>(array)); } } else { // Do nothing diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java index cd1d378bc1470..c333857358d30 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/MemoryBlock.java @@ -26,6 +26,25 @@ */ public class MemoryBlock extends MemoryLocation { + /** Special `pageNumber` value for pages which were not allocated by TaskMemoryManagers */ + public static final int NO_PAGE_NUMBER = -1; + + /** + * Special `pageNumber` value for marking pages that have been freed in the TaskMemoryManager. + * We set `pageNumber` to this value in TaskMemoryManager.freePage() so that MemoryAllocator + * can detect if pages which were allocated by TaskMemoryManager have been freed in the TMM + * before being passed to MemoryAllocator.free() (it is an error to allocate a page in + * TaskMemoryManager and then directly free it in a MemoryAllocator without going through + * the TMM freePage() call). + */ + public static final int FREED_IN_TMM_PAGE_NUMBER = -2; + + /** + * Special `pageNumber` value for pages that have been freed by the MemoryAllocator. This allows + * us to detect double-frees. + */ + public static final int FREED_IN_ALLOCATOR_PAGE_NUMBER = -3; + private final long length; /** @@ -33,7 +52,7 @@ public class MemoryBlock extends MemoryLocation { * TaskMemoryManager. This field is public so that it can be modified by the TaskMemoryManager, * which lives in a different package. */ - public int pageNumber = -1; + public int pageNumber = NO_PAGE_NUMBER; public MemoryBlock(@Nullable Object obj, long offset, long length) { super(obj, offset); diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java index 55bcdf1ed7b06..4368fb615ba1e 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/UnsafeMemoryAllocator.java @@ -38,9 +38,20 @@ public MemoryBlock allocate(long size) throws OutOfMemoryError { public void free(MemoryBlock memory) { assert (memory.obj == null) : "baseObject not null; are you trying to use the off-heap allocator to free on-heap memory?"; + assert (memory.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + "page has already been freed"; + assert ((memory.pageNumber == MemoryBlock.NO_PAGE_NUMBER) + || (memory.pageNumber == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : + "TMM-allocated pages must be freed via TMM.freePage(), not directly in allocator free()"; + if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { memory.fill(MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); } Platform.freeMemory(memory.offset); + // As an additional layer of defense against use-after-free bugs, we mutate the + // MemoryBlock to reset its pointer. + memory.offset = 0; + // Mark the page as freed (so we can detect double-frees). 
+ memory.pageNumber = MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER; } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java index 4b141339ec816..62854837b05ed 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java @@ -62,6 +62,52 @@ public void overlappingCopyMemory() { } } + @Test + public void onHeapMemoryAllocatorPoolingReUsesLongArrays() { + MemoryBlock block1 = MemoryAllocator.HEAP.allocate(1024 * 1024); + Object baseObject1 = block1.getBaseObject(); + MemoryAllocator.HEAP.free(block1); + MemoryBlock block2 = MemoryAllocator.HEAP.allocate(1024 * 1024); + Object baseObject2 = block2.getBaseObject(); + Assert.assertSame(baseObject1, baseObject2); + MemoryAllocator.HEAP.free(block2); + } + + @Test + public void freeingOnHeapMemoryBlockResetsBaseObjectAndOffset() { + MemoryBlock block = MemoryAllocator.HEAP.allocate(1024); + Assert.assertNotNull(block.getBaseObject()); + MemoryAllocator.HEAP.free(block); + Assert.assertNull(block.getBaseObject()); + Assert.assertEquals(0, block.getBaseOffset()); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, block.pageNumber); + } + + @Test + public void freeingOffHeapMemoryBlockResetsOffset() { + MemoryBlock block = MemoryAllocator.UNSAFE.allocate(1024); + Assert.assertNull(block.getBaseObject()); + Assert.assertNotEquals(0, block.getBaseOffset()); + MemoryAllocator.UNSAFE.free(block); + Assert.assertNull(block.getBaseObject()); + Assert.assertEquals(0, block.getBaseOffset()); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, block.pageNumber); + } + + @Test(expected = AssertionError.class) + public void onHeapMemoryAllocatorThrowsAssertionErrorOnDoubleFree() { + MemoryBlock block = MemoryAllocator.HEAP.allocate(1024); + MemoryAllocator.HEAP.free(block); + MemoryAllocator.HEAP.free(block); + } + + @Test(expected = AssertionError.class) + public void offHeapMemoryAllocatorThrowsAssertionErrorOnDoubleFree() { + MemoryBlock block = MemoryAllocator.UNSAFE.allocate(1024); + MemoryAllocator.UNSAFE.free(block); + MemoryAllocator.UNSAFE.free(block); + } + @Test public void memoryDebugFillEnabledInTest() { Assert.assertTrue(MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED); @@ -71,9 +117,11 @@ public void memoryDebugFillEnabledInTest() { MemoryAllocator.MEMORY_DEBUG_FILL_CLEAN_VALUE); MemoryBlock onheap1 = MemoryAllocator.HEAP.allocate(1024 * 1024); + Object onheap1BaseObject = onheap1.getBaseObject(); + long onheap1BaseOffset = onheap1.getBaseOffset(); MemoryAllocator.HEAP.free(onheap1); Assert.assertEquals( - Platform.getByte(onheap1.getBaseObject(), onheap1.getBaseOffset()), + Platform.getByte(onheap1BaseObject, onheap1BaseOffset), MemoryAllocator.MEMORY_DEBUG_FILL_FREED_VALUE); MemoryBlock onheap2 = MemoryAllocator.HEAP.allocate(1024 * 1024); Assert.assertEquals( diff --git a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java index e8d3730daa7a4..632d718062212 100644 --- a/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java +++ b/core/src/main/java/org/apache/spark/memory/TaskMemoryManager.java @@ -321,8 +321,12 @@ public MemoryBlock allocatePage(long size, MemoryConsumer consumer) { * Free a block of memory allocated via {@link TaskMemoryManager#allocatePage}. 
*/ public void freePage(MemoryBlock page, MemoryConsumer consumer) { - assert (page.pageNumber != -1) : + assert (page.pageNumber != MemoryBlock.NO_PAGE_NUMBER) : "Called freePage() on memory that wasn't allocated with allocatePage()"; + assert (page.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) : + "Called freePage() on a memory block that has already been freed"; + assert (page.pageNumber != MemoryBlock.FREED_IN_TMM_PAGE_NUMBER) : + "Called freePage() on a memory block that has already been freed"; assert(allocatedPages.get(page.pageNumber)); pageTable[page.pageNumber] = null; synchronized (this) { @@ -332,6 +336,10 @@ public void freePage(MemoryBlock page, MemoryConsumer consumer) { logger.trace("Freed page number {} ({} bytes)", page.pageNumber, page.size()); } long pageSize = page.size(); + // Clear the page number before passing the block to the MemoryAllocator's free(). + // Doing this allows the MemoryAllocator to detect when a TaskMemoryManager-managed + // page has been inappropriately directly freed without calling TMM.freePage(). + page.pageNumber = MemoryBlock.FREED_IN_TMM_PAGE_NUMBER; memoryManager.tungstenMemoryAllocator().free(page); releaseExecutionMemory(pageSize, consumer); } @@ -358,7 +366,7 @@ public long encodePageNumberAndOffset(MemoryBlock page, long offsetInPage) { @VisibleForTesting public static long encodePageNumberAndOffset(int pageNumber, long offsetInPage) { - assert (pageNumber != -1) : "encodePageNumberAndOffset called with invalid page"; + assert (pageNumber >= 0) : "encodePageNumberAndOffset called with invalid page"; return (((long) pageNumber) << OFFSET_BITS) | (offsetInPage & MASK_LONG_LOWER_51_BITS); } @@ -424,6 +432,7 @@ public long cleanUpAllAllocatedMemory() { for (MemoryBlock page : pageTable) { if (page != null) { logger.debug("unreleased page: " + page + " in task " + taskAttemptId); + page.pageNumber = MemoryBlock.FREED_IN_TMM_PAGE_NUMBER; memoryManager.tungstenMemoryAllocator().free(page); } } diff --git a/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java b/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java index 46b0516e36141..a0664b30d6cc2 100644 --- a/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java +++ b/core/src/test/java/org/apache/spark/memory/TaskMemoryManagerSuite.java @@ -21,6 +21,7 @@ import org.junit.Test; import org.apache.spark.SparkConf; +import org.apache.spark.unsafe.memory.MemoryAllocator; import org.apache.spark.unsafe.memory.MemoryBlock; public class TaskMemoryManagerSuite { @@ -68,6 +69,34 @@ public void encodePageNumberAndOffsetOnHeap() { Assert.assertEquals(64, manager.getOffsetInPage(encodedAddress)); } + @Test + public void freeingPageSetsPageNumberToSpecialConstant() { + final TaskMemoryManager manager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + final MemoryConsumer c = new TestMemoryConsumer(manager, MemoryMode.ON_HEAP); + final MemoryBlock dataPage = manager.allocatePage(256, c); + c.freePage(dataPage); + Assert.assertEquals(MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER, dataPage.pageNumber); + } + + @Test(expected = AssertionError.class) + public void freeingPageDirectlyInAllocatorTriggersAssertionError() { + final TaskMemoryManager manager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + final MemoryConsumer c = new TestMemoryConsumer(manager, MemoryMode.ON_HEAP); + final MemoryBlock dataPage = 
manager.allocatePage(256, c); + MemoryAllocator.HEAP.free(dataPage); + } + + @Test(expected = AssertionError.class) + public void callingFreePageOnDirectlyAllocatedPageTriggersAssertionError() { + final TaskMemoryManager manager = new TaskMemoryManager( + new TestMemoryManager(new SparkConf().set("spark.memory.offHeap.enabled", "false")), 0); + final MemoryConsumer c = new TestMemoryConsumer(manager, MemoryMode.ON_HEAP); + final MemoryBlock dataPage = MemoryAllocator.HEAP.allocate(256); + manager.freePage(dataPage, c); + } + @Test public void cooperativeSpilling() { final TestMemoryManager memoryManager = new TestMemoryManager(new SparkConf()); From 344e3aab87178e45957333479a07e07f202ca1fd Mon Sep 17 00:00:00 2001 From: Wang Gengliang Date: Wed, 10 Jan 2018 09:44:30 -0800 Subject: [PATCH 306/356] [SPARK-23019][CORE] Wait until SparkContext.stop() finished in SparkLauncherSuite ## What changes were proposed in this pull request? In current code ,the function `waitFor` call https://github.com/apache/spark/blob/cfcd746689c2b84824745fa6d327ffb584c7a17d/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java#L155 only wait until DAGScheduler is stopped, while SparkContext.clearActiveContext may not be called yet. https://github.com/apache/spark/blob/1c9f95cb771ac78775a77edd1abfeb2d8ae2a124/core/src/main/scala/org/apache/spark/SparkContext.scala#L1924 Thus, in the Jenkins test https://amplab.cs.berkeley.edu/jenkins/job/spark-branch-2.3-test-maven-hadoop-2.6/ , `JdbcRDDSuite` failed because the previous test `SparkLauncherSuite` exit before SparkContext.stop() is finished. To repo: ``` $ build/sbt > project core > testOnly *SparkLauncherSuite *JavaJdbcRDDSuite ``` To Fix: Wait for a reasonable amount of time to avoid creating two active SparkContext in JVM in SparkLauncherSuite. Can' come up with any better solution for now. ## How was this patch tested? Unit test Author: Wang Gengliang Closes #20221 from gengliangwang/SPARK-23019. --- .../java/org/apache/spark/launcher/SparkLauncherSuite.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index c2261c204cd45..9d2f563b2e367 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.concurrent.TimeUnit; import org.junit.Test; import static org.junit.Assert.*; @@ -133,6 +134,10 @@ public void testInProcessLauncher() throws Exception { p.put(e.getKey(), e.getValue()); } System.setProperties(p); + // Here DAGScheduler is stopped, while SparkContext.clearActiveContext may not be called yet. + // Wait for a reasonable amount of time to avoid creating two active SparkContext in JVM. + // See SPARK-23019 and SparkContext.stop() for details. + TimeUnit.MILLISECONDS.sleep(500); } } From 9b33dfc408de986f4203bb0ac0c3f5c56effd69d Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Wed, 10 Jan 2018 14:25:04 -0800 Subject: [PATCH 307/356] [SPARK-22951][SQL] fix aggregation after dropDuplicates on empty data frames ## What changes were proposed in this pull request? (courtesy of liancheng) Spark SQL supports both global aggregation and grouping aggregation. 
Global aggregation always return a single row with the initial aggregation state as the output, even there are zero input rows. Spark implements this by simply checking the number of grouping keys and treats an aggregation as a global aggregation if it has zero grouping keys. However, this simple principle drops the ball in the following case: ```scala spark.emptyDataFrame.dropDuplicates().agg(count($"*") as "c").show() // +---+ // | c | // +---+ // | 1 | // +---+ ``` The reason is that: 1. `df.dropDuplicates()` is roughly translated into something equivalent to: ```scala val allColumns = df.columns.map { col } df.groupBy(allColumns: _*).agg(allColumns.head, allColumns.tail: _*) ``` This translation is implemented in the rule `ReplaceDeduplicateWithAggregate`. 2. `spark.emptyDataFrame` contains zero columns and zero rows. Therefore, rule `ReplaceDeduplicateWithAggregate` makes a confusing transformation roughly equivalent to the following one: ```scala spark.emptyDataFrame.dropDuplicates() => spark.emptyDataFrame.groupBy().agg(Map.empty[String, String]) ``` The above transformation is confusing because the resulting aggregate operator contains no grouping keys (because `emptyDataFrame` contains no columns), and gets recognized as a global aggregation. As a result, Spark SQL allocates a single row filled by the initial aggregation state and uses it as the output, and returns a wrong result. To fix this issue, this PR tweaks `ReplaceDeduplicateWithAggregate` by appending a literal `1` to the grouping key list of the resulting `Aggregate` operator when the input plan contains zero output columns. In this way, `spark.emptyDataFrame.dropDuplicates()` is now translated into a grouping aggregation, roughly depicted as: ```scala spark.emptyDataFrame.dropDuplicates() => spark.emptyDataFrame.groupBy(lit(1)).agg(Map.empty[String, String]) ``` Which is now properly treated as a grouping aggregation and returns the correct answer. ## How was this patch tested? New unit tests added Author: Feng Liu Closes #20174 from liufengdb/fix-duplicate. --- .../sql/catalyst/optimizer/Optimizer.scala | 8 ++++++- .../optimizer/ReplaceOperatorSuite.scala | 10 +++++++- .../spark/sql/DataFrameAggregateSuite.scala | 24 +++++++++++++++++-- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index df0af8264a329..c794ba8619322 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -1222,7 +1222,13 @@ object ReplaceDeduplicateWithAggregate extends Rule[LogicalPlan] { Alias(new First(attr).toAggregateExpression(), attr.name)(attr.exprId) } } - Aggregate(keys, aggCols, child) + // SPARK-22951: Physical aggregate operators distinguishes global aggregation and grouping + // aggregations by checking the number of grouping keys. The key difference here is that a + // global aggregation always returns at least one row even if there are no input rows. Here + // we append a literal when the grouping key list is empty so that the result aggregate + // operator is properly treated as a grouping aggregation. 
+ val nonemptyKeys = if (keys.isEmpty) Literal(1) :: Nil else keys + Aggregate(nonemptyKeys, aggCols, child) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala index 0fa1aaeb9e164..e9701ffd2c54b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/ReplaceOperatorSuite.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{Alias, Not} +import org.apache.spark.sql.catalyst.expressions.{Alias, Literal, Not} import org.apache.spark.sql.catalyst.expressions.aggregate.First import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftSemi, PlanTest} import org.apache.spark.sql.catalyst.plans.logical._ @@ -198,6 +198,14 @@ class ReplaceOperatorSuite extends PlanTest { comparePlans(optimized, correctAnswer) } + test("add one grouping key if necessary when replace Deduplicate with Aggregate") { + val input = LocalRelation() + val query = Deduplicate(Seq.empty, input) // dropDuplicates() + val optimized = Optimize.execute(query.analyze) + val correctAnswer = Aggregate(Seq(Literal(1)), input.output, input) + comparePlans(optimized, correctAnswer) + } + test("don't replace streaming Deduplicate") { val input = LocalRelation(Seq('a.int, 'b.int), isStreaming = true) val attrA = input.output(0) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 06848e4d2b297..e7776e36702ad 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql import scala.util.Random +import org.apache.spark.sql.catalyst.expressions.{Alias, Literal} +import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.execution.WholeStageCodegenExec import org.apache.spark.sql.execution.aggregate.{HashAggregateExec, ObjectHashAggregateExec, SortAggregateExec} import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec @@ -27,7 +29,7 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.test.SQLTestData.DecimalData -import org.apache.spark.sql.types.{Decimal, DecimalType} +import org.apache.spark.sql.types.DecimalType case class Fact(date: Int, hour: Int, minute: Int, room_name: String, temp: Double) @@ -456,7 +458,6 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { test("null moments") { val emptyTableData = Seq.empty[(Int, Int)].toDF("a", "b") - checkAnswer( emptyTableData.agg(variance('a), var_samp('a), var_pop('a), skewness('a), kurtosis('a)), Row(null, null, null, null, null)) @@ -666,4 +667,23 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { assert(exchangePlans.length == 1) } } + + Seq(true, false).foreach { codegen => + test("SPARK-22951: dropDuplicates on empty dataFrames should produce correct aggregate " + + s"results when codegen is enabled: $codegen") { + 
withSQLConf((SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, codegen.toString)) { + // explicit global aggregations + val emptyAgg = Map.empty[String, String] + checkAnswer(spark.emptyDataFrame.agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.groupBy().agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.groupBy().agg(count("*")), Seq(Row(0))) + checkAnswer(spark.emptyDataFrame.dropDuplicates().agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.dropDuplicates().groupBy().agg(emptyAgg), Seq(Row())) + checkAnswer(spark.emptyDataFrame.dropDuplicates().groupBy().agg(count("*")), Seq(Row(0))) + + // global aggregation is converted to grouping aggregation: + assert(spark.emptyDataFrame.dropDuplicates().count() == 0) + } + } + } } From a6647ffbf7a312a3e119a9beef90880cc915aa60 Mon Sep 17 00:00:00 2001 From: Mingjie Tang Date: Thu, 11 Jan 2018 11:51:03 +0800 Subject: [PATCH 308/356] [SPARK-22587] Spark job fails if fs.defaultFS and application jar are different url ## What changes were proposed in this pull request? Two filesystems comparing does not consider the authority of URI. This is specific for WASB file storage system, where userInfo is honored to differentiate filesystems. For example: wasbs://user1xyz.net, wasbs://user2xyz.net would consider as two filesystem. Therefore, we have to add the authority to compare two filesystem, and two filesystem with different authority can not be the same FS. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Mingjie Tang Closes #19885 from merlintang/EAR-7377. --- .../org/apache/spark/deploy/yarn/Client.scala | 24 +++++++++++--- .../spark/deploy/yarn/ClientSuite.scala | 33 +++++++++++++++++++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 15328d08b3b5c..8cd3cd9746a3a 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1421,15 +1421,20 @@ private object Client extends Logging { } /** - * Return whether the two file systems are the same. + * Return whether two URI represent file system are the same */ - private def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = { - val srcUri = srcFs.getUri() - val dstUri = destFs.getUri() + private[spark] def compareUri(srcUri: URI, dstUri: URI): Boolean = { + if (srcUri.getScheme() == null || srcUri.getScheme() != dstUri.getScheme()) { return false } + val srcAuthority = srcUri.getAuthority() + val dstAuthority = dstUri.getAuthority() + if (srcAuthority != null && !srcAuthority.equalsIgnoreCase(dstAuthority)) { + return false + } + var srcHost = srcUri.getHost() var dstHost = dstUri.getHost() @@ -1447,6 +1452,17 @@ private object Client extends Logging { } Objects.equal(srcHost, dstHost) && srcUri.getPort() == dstUri.getPort() + + } + + /** + * Return whether the two file systems are the same. 
+ */ + protected def compareFs(srcFs: FileSystem, destFs: FileSystem): Boolean = { + val srcUri = srcFs.getUri() + val dstUri = destFs.getUri() + + compareUri(srcUri, dstUri) } /** diff --git a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala index 9d5f5eb621118..7fa597167f3f0 100644 --- a/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala +++ b/resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/ClientSuite.scala @@ -357,6 +357,39 @@ class ClientSuite extends SparkFunSuite with Matchers { sparkConf.get(SECONDARY_JARS) should be (Some(Seq(new File(jar2.toURI).getName))) } + private val matching = Seq( + ("files URI match test1", "file:///file1", "file:///file2"), + ("files URI match test2", "file:///c:file1", "file://c:file2"), + ("files URI match test3", "file://host/file1", "file://host/file2"), + ("wasb URI match test", "wasb://bucket1@user", "wasb://bucket1@user/"), + ("hdfs URI match test", "hdfs:/path1", "hdfs:/path1") + ) + + matching.foreach { t => + test(t._1) { + assert(Client.compareUri(new URI(t._2), new URI(t._3)), + s"No match between ${t._2} and ${t._3}") + } + } + + private val unmatching = Seq( + ("files URI unmatch test1", "file:///file1", "file://host/file2"), + ("files URI unmatch test2", "file://host/file1", "file:///file2"), + ("files URI unmatch test3", "file://host/file1", "file://host2/file2"), + ("wasb URI unmatch test1", "wasb://bucket1@user", "wasb://bucket2@user/"), + ("wasb URI unmatch test2", "wasb://bucket1@user", "wasb://bucket1@user2/"), + ("s3 URI unmatch test", "s3a://user@pass:bucket1/", "s3a://user2@pass2:bucket1/"), + ("hdfs URI unmatch test1", "hdfs://namenode1/path1", "hdfs://namenode1:8080/path2"), + ("hdfs URI unmatch test2", "hdfs://namenode1:8020/path1", "hdfs://namenode1:8080/path2") + ) + + unmatching.foreach { t => + test(t._1) { + assert(!Client.compareUri(new URI(t._2), new URI(t._3)), + s"match between ${t._2} and ${t._3}") + } + } + object Fixtures { val knownDefYarnAppCP: Seq[String] = From 87c98de8b23f0e978958fc83677fdc4c339b7e6a Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 11 Jan 2018 18:17:34 +0800 Subject: [PATCH 309/356] [SPARK-23001][SQL] Fix NullPointerException when DESC a database with NULL description ## What changes were proposed in this pull request? When users' DB description is NULL, users might hit `NullPointerException`. This PR is to fix the issue. ## How was this patch tested? Added test cases Author: gatorsmile Closes #20215 from gatorsmile/SPARK-23001. 
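Editor's note: the fix below relies on a standard Scala idiom for values that may come back as `null` from a Java API: wrap them in `Option(...)`, which turns `null` into `None`, and supply a safe default with `getOrElse`. A small illustrative sketch follows; `RawDatabase` is an invented stand-in and is not part of Spark or Hive.

```scala
// Illustrative only: RawDatabase stands in for the Hive database object whose
// description may be null when the database was created without a comment.
final case class RawDatabase(name: String, description: String)

object NullDescriptionSketch {
  // Option(null) evaluates to None, so a null description never propagates further.
  def describe(db: RawDatabase): String = Option(db.description).getOrElse("")

  def main(args: Array[String]): Unit = {
    assert(describe(RawDatabase("dbWithNullDesc", null)) == "")
    assert(describe(RawDatabase("db2", "my comment")) == "my comment")
  }
}
```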
--- .../apache/spark/sql/hive/client/HiveClientImpl.scala | 2 +- .../apache/spark/sql/hive/HiveExternalCatalogSuite.scala | 6 ++++++ .../org/apache/spark/sql/hive/client/VersionsSuite.scala | 9 +++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 102f40bacc985..4b923f5235a90 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -330,7 +330,7 @@ private[hive] class HiveClientImpl( Option(client.getDatabase(dbName)).map { d => CatalogDatabase( name = d.getName, - description = d.getDescription, + description = Option(d.getDescription).getOrElse(""), locationUri = CatalogUtils.stringToURI(d.getLocationUri), properties = Option(d.getParameters).map(_.asScala.toMap).orNull) }.getOrElse(throw new NoSuchDatabaseException(dbName)) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala index 2e35fdeba464d..0a522b6a11c80 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogSuite.scala @@ -107,4 +107,10 @@ class HiveExternalCatalogSuite extends ExternalCatalogSuite { .filter(_.contains("Num Buckets")).head assert(bucketString.contains("10")) } + + test("SPARK-23001: NullPointerException when running desc database") { + val catalog = newBasicCatalog() + catalog.createDatabase(newDb("dbWithNullDesc").copy(description = null), ignoreIfExists = false) + assert(catalog.getDatabase("dbWithNullDesc").description == "") + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 94473a08dd317..ff90e9dda5f7c 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -163,6 +163,15 @@ class VersionsSuite extends SparkFunSuite with Logging { client.createDatabase(tempDB, ignoreIfExists = true) } + test(s"$version: createDatabase with null description") { + withTempDir { tmpDir => + val dbWithNullDesc = + CatalogDatabase("dbWithNullDesc", description = null, tmpDir.toURI, Map()) + client.createDatabase(dbWithNullDesc, ignoreIfExists = true) + assert(client.getDatabase("dbWithNullDesc").description == "") + } + } + test(s"$version: setCurrentDatabase") { client.setCurrentDatabase("default") } From 1c70da3bfbb4016e394de2c73eb0db7cdd9a6968 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 11 Jan 2018 19:41:48 +0800 Subject: [PATCH 310/356] [SPARK-20657][CORE] Speed up rendering of the stages page. There are two main changes to speed up rendering of the tasks list when rendering the stage page. The first one makes the code only load the tasks being shown in the current page of the tasks table, and information related to only those tasks. One side-effect of this change is that the graph that shows task-related events now only shows events for the tasks in the current page, instead of the previously hardcoded limit of "events for the first 1000 tasks". That ends up helping with readability, though. 
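Editor's note: roughly, the paging described in the previous paragraph boils down to sorting by the requested column and materializing only the `offset`/`length` window the table is actually showing. The sketch below illustrates just that idea over a plain Scala collection; `TaskRow` and `pageOf` are invented for the example, while the real change achieves the equivalent with indexed key-value store views (the `skip(offset).max(length)` calls in the diff further down).

```scala
// Illustrative sketch of page-at-a-time task listing; not the real AppStatusStore code.
final case class TaskRow(taskId: Long, durationMs: Long, executorId: String)

object TaskPagingSketch {
  // Return only the rows belonging to the requested page, in the requested sort order.
  def pageOf(
      tasks: Seq[TaskRow],
      key: TaskRow => Long,
      ascending: Boolean,
      offset: Int,
      length: Int): Seq[TaskRow] = {
    val ordered = tasks.sortBy(key)
    val directed = if (ascending) ordered else ordered.reverse
    directed.slice(offset, offset + length)
  }

  def main(args: Array[String]): Unit = {
    val tasks = (1L to 100L).map(i => TaskRow(i, durationMs = i * 10, executorId = s"exec-${i % 4}"))
    // Second page of 20 tasks, slowest first.
    val page = pageOf(tasks, _.durationMs, ascending = false, offset = 20, length = 20)
    assert(page.size == 20)
    assert(page.head.durationMs == 800) // ranks 21..40 by decreasing duration
  }
}
```

Sorting in the sketch is in-memory; the interesting part of the real change, described next, is getting the same effect from an on-disk store without scanning every task.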
To make sorting efficient when using a disk store, the task wrapper was extended to include many new indices, one for each of the sortable columns in the UI, and metrics for which quantiles are calculated. The second changes the way metric quantiles are calculated for stages. Instead of using the "Distribution" class to process data for all task metrics, which requires scanning all tasks of a stage, the code now uses the KVStore "skip()" functionality to only read tasks that contain interesting information for the quantiles that are desired. This is still not cheap; because there are many metrics that the UI and API track, the code needs to scan the index for each metric to gather the information. Savings come mainly from skipping deserialization when using the disk store, but the in-memory code also seems to be faster than before (most probably because of other changes in this patch). To make subsequent calls faster, some quantiles are cached in the status store. This makes UIs much faster after the first time a stage has been loaded. With the above changes, a lot of code in the UI layer could be simplified. Author: Marcelo Vanzin Closes #20013 from vanzin/SPARK-20657. --- .../apache/spark/util/kvstore/LevelDB.java | 1 + .../spark/status/AppStatusListener.scala | 57 +- .../apache/spark/status/AppStatusStore.scala | 389 +++++--- .../apache/spark/status/AppStatusUtils.scala | 68 ++ .../org/apache/spark/status/LiveEntity.scala | 344 ++++--- .../spark/status/api/v1/StagesResource.scala | 3 +- .../org/apache/spark/status/api/v1/api.scala | 3 + .../org/apache/spark/status/storeTypes.scala | 327 ++++++- .../apache/spark/ui/jobs/ExecutorTable.scala | 4 +- .../org/apache/spark/ui/jobs/JobPage.scala | 2 +- .../org/apache/spark/ui/jobs/StagePage.scala | 919 ++++++------------ ...mmary_w__custom_quantiles_expectation.json | 3 + ...sk_summary_w_shuffle_read_expectation.json | 3 + ...k_summary_w_shuffle_write_expectation.json | 3 + .../spark/status/AppStatusListenerSuite.scala | 105 +- .../spark/status/AppStatusStoreSuite.scala | 104 ++ .../org/apache/spark/ui/StagePageSuite.scala | 10 +- scalastyle-config.xml | 2 +- 18 files changed, 1361 insertions(+), 986 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala create mode 100644 core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java index 4f9e10ca20066..0e491efac9181 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java @@ -83,6 +83,7 @@ public LevelDB(File path, KVStoreSerializer serializer) throws Exception { if (versionData != null) { long version = serializer.deserializeLong(versionData); if (version != STORE_VERSION) { + close(); throw new UnsupportedStoreVersionException(); } } else { diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala index 88b75ddd5993a..b4edcf23abc09 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusListener.scala @@ -377,6 +377,10 @@ private[spark] class AppStatusListener( Option(liveStages.get((event.stageId, event.stageAttemptId))).foreach { stage => stage.activeTasks += 1 stage.firstLaunchTime = 
math.min(stage.firstLaunchTime, event.taskInfo.launchTime) + + val locality = event.taskInfo.taskLocality.toString() + val count = stage.localitySummary.getOrElse(locality, 0L) + 1L + stage.localitySummary = stage.localitySummary ++ Map(locality -> count) maybeUpdate(stage, now) stage.jobs.foreach { job => @@ -433,7 +437,7 @@ private[spark] class AppStatusListener( } task.errorMessage = errorMessage val delta = task.updateMetrics(event.taskMetrics) - update(task, now) + update(task, now, last = true) delta }.orNull @@ -450,7 +454,7 @@ private[spark] class AppStatusListener( Option(liveStages.get((event.stageId, event.stageAttemptId))).foreach { stage => if (metricsDelta != null) { - stage.metrics.update(metricsDelta) + stage.metrics = LiveEntityHelpers.addMetrics(stage.metrics, metricsDelta) } stage.activeTasks -= 1 stage.completedTasks += completedDelta @@ -486,7 +490,7 @@ private[spark] class AppStatusListener( esummary.failedTasks += failedDelta esummary.killedTasks += killedDelta if (metricsDelta != null) { - esummary.metrics.update(metricsDelta) + esummary.metrics = LiveEntityHelpers.addMetrics(esummary.metrics, metricsDelta) } maybeUpdate(esummary, now) @@ -604,11 +608,11 @@ private[spark] class AppStatusListener( maybeUpdate(task, now) Option(liveStages.get((sid, sAttempt))).foreach { stage => - stage.metrics.update(delta) + stage.metrics = LiveEntityHelpers.addMetrics(stage.metrics, delta) maybeUpdate(stage, now) val esummary = stage.executorSummary(event.execId) - esummary.metrics.update(delta) + esummary.metrics = LiveEntityHelpers.addMetrics(esummary.metrics, delta) maybeUpdate(esummary, now) } } @@ -690,7 +694,7 @@ private[spark] class AppStatusListener( // can update the executor information too. liveRDDs.get(block.rddId).foreach { rdd => if (updatedStorageLevel.isDefined) { - rdd.storageLevel = updatedStorageLevel.get + rdd.setStorageLevel(updatedStorageLevel.get) } val partition = rdd.partition(block.name) @@ -814,7 +818,7 @@ private[spark] class AppStatusListener( /** Update a live entity only if it hasn't been updated in the last configured period. */ private def maybeUpdate(entity: LiveEntity, now: Long): Unit = { - if (liveUpdatePeriodNs >= 0 && now - entity.lastWriteTime > liveUpdatePeriodNs) { + if (live && liveUpdatePeriodNs >= 0 && now - entity.lastWriteTime > liveUpdatePeriodNs) { update(entity, now) } } @@ -865,7 +869,7 @@ private[spark] class AppStatusListener( } stages.foreach { s => - val key = s.id + val key = Array(s.info.stageId, s.info.attemptId) kvstore.delete(s.getClass(), key) val execSummaries = kvstore.view(classOf[ExecutorStageSummaryWrapper]) @@ -885,15 +889,15 @@ private[spark] class AppStatusListener( .asScala tasks.foreach { t => - kvstore.delete(t.getClass(), t.info.taskId) + kvstore.delete(t.getClass(), t.taskId) } // Check whether there are remaining attempts for the same stage. If there aren't, then // also delete the RDD graph data. val remainingAttempts = kvstore.view(classOf[StageDataWrapper]) .index("stageId") - .first(s.stageId) - .last(s.stageId) + .first(s.info.stageId) + .last(s.info.stageId) .closeableIterator() val hasMoreAttempts = try { @@ -905,8 +909,10 @@ private[spark] class AppStatusListener( } if (!hasMoreAttempts) { - kvstore.delete(classOf[RDDOperationGraphWrapper], s.stageId) + kvstore.delete(classOf[RDDOperationGraphWrapper], s.info.stageId) } + + cleanupCachedQuantiles(key) } } @@ -919,9 +925,9 @@ private[spark] class AppStatusListener( // Try to delete finished tasks only. 
val toDelete = KVUtils.viewToSeq(view, countToDelete) { t => - !live || t.info.status != TaskState.RUNNING.toString() + !live || t.status != TaskState.RUNNING.toString() } - toDelete.foreach { t => kvstore.delete(t.getClass(), t.info.taskId) } + toDelete.foreach { t => kvstore.delete(t.getClass(), t.taskId) } stage.savedTasks.addAndGet(-toDelete.size) // If there are more running tasks than the configured limit, delete running tasks. This @@ -930,13 +936,34 @@ private[spark] class AppStatusListener( val remaining = countToDelete - toDelete.size if (remaining > 0) { val runningTasksToDelete = view.max(remaining).iterator().asScala.toList - runningTasksToDelete.foreach { t => kvstore.delete(t.getClass(), t.info.taskId) } + runningTasksToDelete.foreach { t => kvstore.delete(t.getClass(), t.taskId) } stage.savedTasks.addAndGet(-remaining) } + + // On live applications, cleanup any cached quantiles for the stage. This makes sure that + // quantiles will be recalculated after tasks are replaced with newer ones. + // + // This is not needed in the SHS since caching only happens after the event logs are + // completely processed. + if (live) { + cleanupCachedQuantiles(stageKey) + } } stage.cleaning = false } + private def cleanupCachedQuantiles(stageKey: Array[Int]): Unit = { + val cachedQuantiles = kvstore.view(classOf[CachedQuantile]) + .index("stage") + .first(stageKey) + .last(stageKey) + .asScala + .toList + cachedQuantiles.foreach { q => + kvstore.delete(q.getClass(), q.id) + } + } + /** * Remove at least (retainedSize / 10) items to reduce friction. Because tracking may be done * asynchronously, this method may return 0 in case enough items have been deleted already. diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala index 5a942f5284018..efc28538a33db 100644 --- a/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala +++ b/core/src/main/scala/org/apache/spark/status/AppStatusStore.scala @@ -24,7 +24,7 @@ import scala.collection.JavaConverters._ import org.apache.spark.{JobExecutionStatus, SparkConf} import org.apache.spark.status.api.v1 import org.apache.spark.ui.scope._ -import org.apache.spark.util.Distribution +import org.apache.spark.util.{Distribution, Utils} import org.apache.spark.util.kvstore.{InMemoryStore, KVStore} /** @@ -98,7 +98,11 @@ private[spark] class AppStatusStore( val it = store.view(classOf[StageDataWrapper]).index("stageId").reverse().first(stageId) .closeableIterator() try { - it.next().info + if (it.hasNext()) { + it.next().info + } else { + throw new NoSuchElementException(s"No stage with id $stageId") + } } finally { it.close() } @@ -110,107 +114,238 @@ private[spark] class AppStatusStore( if (details) stageWithDetails(stage) else stage } + def taskCount(stageId: Int, stageAttemptId: Int): Long = { + store.count(classOf[TaskDataWrapper], "stage", Array(stageId, stageAttemptId)) + } + + def localitySummary(stageId: Int, stageAttemptId: Int): Map[String, Long] = { + store.read(classOf[StageDataWrapper], Array(stageId, stageAttemptId)).locality + } + + /** + * Calculates a summary of the task metrics for the given stage attempt, returning the + * requested quantiles for the recorded metrics. + * + * This method can be expensive if the requested quantiles are not cached; the method + * will only cache certain quantiles (every 0.05 step), so it's recommended to stick to + * those to avoid expensive scans of all task data. 
+ */ def taskSummary( stageId: Int, stageAttemptId: Int, - quantiles: Array[Double]): v1.TaskMetricDistributions = { - - val stage = Array(stageId, stageAttemptId) - - val rawMetrics = store.view(classOf[TaskDataWrapper]) - .index("stage") - .first(stage) - .last(stage) - .asScala - .flatMap(_.info.taskMetrics) - .toList - .view - - def metricQuantiles(f: v1.TaskMetrics => Double): IndexedSeq[Double] = - Distribution(rawMetrics.map { d => f(d) }).get.getQuantiles(quantiles) - - // We need to do a lot of similar munging to nested metrics here. For each one, - // we want (a) extract the values for nested metrics (b) make a distribution for each metric - // (c) shove the distribution into the right field in our return type and (d) only return - // a result if the option is defined for any of the tasks. MetricHelper is a little util - // to make it a little easier to deal w/ all of the nested options. Mostly it lets us just - // implement one "build" method, which just builds the quantiles for each field. - - val inputMetrics = - new MetricHelper[v1.InputMetrics, v1.InputMetricDistributions](rawMetrics, quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.InputMetrics = raw.inputMetrics - - def build: v1.InputMetricDistributions = new v1.InputMetricDistributions( - bytesRead = submetricQuantiles(_.bytesRead), - recordsRead = submetricQuantiles(_.recordsRead) - ) - }.build - - val outputMetrics = - new MetricHelper[v1.OutputMetrics, v1.OutputMetricDistributions](rawMetrics, quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.OutputMetrics = raw.outputMetrics - - def build: v1.OutputMetricDistributions = new v1.OutputMetricDistributions( - bytesWritten = submetricQuantiles(_.bytesWritten), - recordsWritten = submetricQuantiles(_.recordsWritten) - ) - }.build - - val shuffleReadMetrics = - new MetricHelper[v1.ShuffleReadMetrics, v1.ShuffleReadMetricDistributions](rawMetrics, - quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.ShuffleReadMetrics = - raw.shuffleReadMetrics - - def build: v1.ShuffleReadMetricDistributions = new v1.ShuffleReadMetricDistributions( - readBytes = submetricQuantiles { s => s.localBytesRead + s.remoteBytesRead }, - readRecords = submetricQuantiles(_.recordsRead), - remoteBytesRead = submetricQuantiles(_.remoteBytesRead), - remoteBytesReadToDisk = submetricQuantiles(_.remoteBytesReadToDisk), - remoteBlocksFetched = submetricQuantiles(_.remoteBlocksFetched), - localBlocksFetched = submetricQuantiles(_.localBlocksFetched), - totalBlocksFetched = submetricQuantiles { s => - s.localBlocksFetched + s.remoteBlocksFetched - }, - fetchWaitTime = submetricQuantiles(_.fetchWaitTime) - ) - }.build - - val shuffleWriteMetrics = - new MetricHelper[v1.ShuffleWriteMetrics, v1.ShuffleWriteMetricDistributions](rawMetrics, - quantiles) { - def getSubmetrics(raw: v1.TaskMetrics): v1.ShuffleWriteMetrics = - raw.shuffleWriteMetrics - - def build: v1.ShuffleWriteMetricDistributions = new v1.ShuffleWriteMetricDistributions( - writeBytes = submetricQuantiles(_.bytesWritten), - writeRecords = submetricQuantiles(_.recordsWritten), - writeTime = submetricQuantiles(_.writeTime) - ) - }.build - - new v1.TaskMetricDistributions( + unsortedQuantiles: Array[Double]): Option[v1.TaskMetricDistributions] = { + val stageKey = Array(stageId, stageAttemptId) + val quantiles = unsortedQuantiles.sorted + + // We don't know how many tasks remain in the store that actually have metrics. So scan one + // metric and count how many valid tasks there are. 
Use skip() instead of next() since it's + // cheaper for disk stores (avoids deserialization). + val count = { + Utils.tryWithResource( + store.view(classOf[TaskDataWrapper]) + .parent(stageKey) + .index(TaskIndexNames.EXEC_RUN_TIME) + .first(0L) + .closeableIterator() + ) { it => + var _count = 0L + while (it.hasNext()) { + _count += 1 + it.skip(1) + } + _count + } + } + + if (count <= 0) { + return None + } + + // Find out which quantiles are already cached. The data in the store must match the expected + // task count to be considered, otherwise it will be re-scanned and overwritten. + val cachedQuantiles = quantiles.filter(shouldCacheQuantile).flatMap { q => + val qkey = Array(stageId, stageAttemptId, quantileToString(q)) + asOption(store.read(classOf[CachedQuantile], qkey)).filter(_.taskCount == count) + } + + // If there are no missing quantiles, return the data. Otherwise, just compute everything + // to make the code simpler. + if (cachedQuantiles.size == quantiles.size) { + def toValues(fn: CachedQuantile => Double): IndexedSeq[Double] = cachedQuantiles.map(fn) + + val distributions = new v1.TaskMetricDistributions( + quantiles = quantiles, + executorDeserializeTime = toValues(_.executorDeserializeTime), + executorDeserializeCpuTime = toValues(_.executorDeserializeCpuTime), + executorRunTime = toValues(_.executorRunTime), + executorCpuTime = toValues(_.executorCpuTime), + resultSize = toValues(_.resultSize), + jvmGcTime = toValues(_.jvmGcTime), + resultSerializationTime = toValues(_.resultSerializationTime), + gettingResultTime = toValues(_.gettingResultTime), + schedulerDelay = toValues(_.schedulerDelay), + peakExecutionMemory = toValues(_.peakExecutionMemory), + memoryBytesSpilled = toValues(_.memoryBytesSpilled), + diskBytesSpilled = toValues(_.diskBytesSpilled), + inputMetrics = new v1.InputMetricDistributions( + toValues(_.bytesRead), + toValues(_.recordsRead)), + outputMetrics = new v1.OutputMetricDistributions( + toValues(_.bytesWritten), + toValues(_.recordsWritten)), + shuffleReadMetrics = new v1.ShuffleReadMetricDistributions( + toValues(_.shuffleReadBytes), + toValues(_.shuffleRecordsRead), + toValues(_.shuffleRemoteBlocksFetched), + toValues(_.shuffleLocalBlocksFetched), + toValues(_.shuffleFetchWaitTime), + toValues(_.shuffleRemoteBytesRead), + toValues(_.shuffleRemoteBytesReadToDisk), + toValues(_.shuffleTotalBlocksFetched)), + shuffleWriteMetrics = new v1.ShuffleWriteMetricDistributions( + toValues(_.shuffleWriteBytes), + toValues(_.shuffleWriteRecords), + toValues(_.shuffleWriteTime))) + + return Some(distributions) + } + + // Compute quantiles by scanning the tasks in the store. This is not really stable for live + // stages (e.g. the number of recorded tasks may change while this code is running), but should + // stabilize once the stage finishes. It's also slow, especially with disk stores. 
+ val indices = quantiles.map { q => math.min((q * count).toLong, count - 1) } + + def scanTasks(index: String)(fn: TaskDataWrapper => Long): IndexedSeq[Double] = { + Utils.tryWithResource( + store.view(classOf[TaskDataWrapper]) + .parent(stageKey) + .index(index) + .first(0L) + .closeableIterator() + ) { it => + var last = Double.NaN + var currentIdx = -1L + indices.map { idx => + if (idx == currentIdx) { + last + } else { + val diff = idx - currentIdx + currentIdx = idx + if (it.skip(diff - 1)) { + last = fn(it.next()).toDouble + last + } else { + Double.NaN + } + } + }.toIndexedSeq + } + } + + val computedQuantiles = new v1.TaskMetricDistributions( quantiles = quantiles, - executorDeserializeTime = metricQuantiles(_.executorDeserializeTime), - executorDeserializeCpuTime = metricQuantiles(_.executorDeserializeCpuTime), - executorRunTime = metricQuantiles(_.executorRunTime), - executorCpuTime = metricQuantiles(_.executorCpuTime), - resultSize = metricQuantiles(_.resultSize), - jvmGcTime = metricQuantiles(_.jvmGcTime), - resultSerializationTime = metricQuantiles(_.resultSerializationTime), - memoryBytesSpilled = metricQuantiles(_.memoryBytesSpilled), - diskBytesSpilled = metricQuantiles(_.diskBytesSpilled), - inputMetrics = inputMetrics, - outputMetrics = outputMetrics, - shuffleReadMetrics = shuffleReadMetrics, - shuffleWriteMetrics = shuffleWriteMetrics - ) + executorDeserializeTime = scanTasks(TaskIndexNames.DESER_TIME) { t => + t.executorDeserializeTime + }, + executorDeserializeCpuTime = scanTasks(TaskIndexNames.DESER_CPU_TIME) { t => + t.executorDeserializeCpuTime + }, + executorRunTime = scanTasks(TaskIndexNames.EXEC_RUN_TIME) { t => t.executorRunTime }, + executorCpuTime = scanTasks(TaskIndexNames.EXEC_CPU_TIME) { t => t.executorCpuTime }, + resultSize = scanTasks(TaskIndexNames.RESULT_SIZE) { t => t.resultSize }, + jvmGcTime = scanTasks(TaskIndexNames.GC_TIME) { t => t.jvmGcTime }, + resultSerializationTime = scanTasks(TaskIndexNames.SER_TIME) { t => + t.resultSerializationTime + }, + gettingResultTime = scanTasks(TaskIndexNames.GETTING_RESULT_TIME) { t => + t.gettingResultTime + }, + schedulerDelay = scanTasks(TaskIndexNames.SCHEDULER_DELAY) { t => t.schedulerDelay }, + peakExecutionMemory = scanTasks(TaskIndexNames.PEAK_MEM) { t => t.peakExecutionMemory }, + memoryBytesSpilled = scanTasks(TaskIndexNames.MEM_SPILL) { t => t.memoryBytesSpilled }, + diskBytesSpilled = scanTasks(TaskIndexNames.DISK_SPILL) { t => t.diskBytesSpilled }, + inputMetrics = new v1.InputMetricDistributions( + scanTasks(TaskIndexNames.INPUT_SIZE) { t => t.inputBytesRead }, + scanTasks(TaskIndexNames.INPUT_RECORDS) { t => t.inputRecordsRead }), + outputMetrics = new v1.OutputMetricDistributions( + scanTasks(TaskIndexNames.OUTPUT_SIZE) { t => t.outputBytesWritten }, + scanTasks(TaskIndexNames.OUTPUT_RECORDS) { t => t.outputRecordsWritten }), + shuffleReadMetrics = new v1.ShuffleReadMetricDistributions( + scanTasks(TaskIndexNames.SHUFFLE_TOTAL_READS) { m => + m.shuffleLocalBytesRead + m.shuffleRemoteBytesRead + }, + scanTasks(TaskIndexNames.SHUFFLE_READ_RECORDS) { t => t.shuffleRecordsRead }, + scanTasks(TaskIndexNames.SHUFFLE_REMOTE_BLOCKS) { t => t.shuffleRemoteBlocksFetched }, + scanTasks(TaskIndexNames.SHUFFLE_LOCAL_BLOCKS) { t => t.shuffleLocalBlocksFetched }, + scanTasks(TaskIndexNames.SHUFFLE_READ_TIME) { t => t.shuffleFetchWaitTime }, + scanTasks(TaskIndexNames.SHUFFLE_REMOTE_READS) { t => t.shuffleRemoteBytesRead }, + scanTasks(TaskIndexNames.SHUFFLE_REMOTE_READS_TO_DISK) { t => + 
t.shuffleRemoteBytesReadToDisk + }, + scanTasks(TaskIndexNames.SHUFFLE_TOTAL_BLOCKS) { m => + m.shuffleLocalBlocksFetched + m.shuffleRemoteBlocksFetched + }), + shuffleWriteMetrics = new v1.ShuffleWriteMetricDistributions( + scanTasks(TaskIndexNames.SHUFFLE_WRITE_SIZE) { t => t.shuffleBytesWritten }, + scanTasks(TaskIndexNames.SHUFFLE_WRITE_RECORDS) { t => t.shuffleRecordsWritten }, + scanTasks(TaskIndexNames.SHUFFLE_WRITE_TIME) { t => t.shuffleWriteTime })) + + // Go through the computed quantiles and cache the values that match the caching criteria. + computedQuantiles.quantiles.zipWithIndex + .filter { case (q, _) => quantiles.contains(q) && shouldCacheQuantile(q) } + .foreach { case (q, idx) => + val cached = new CachedQuantile(stageId, stageAttemptId, quantileToString(q), count, + executorDeserializeTime = computedQuantiles.executorDeserializeTime(idx), + executorDeserializeCpuTime = computedQuantiles.executorDeserializeCpuTime(idx), + executorRunTime = computedQuantiles.executorRunTime(idx), + executorCpuTime = computedQuantiles.executorCpuTime(idx), + resultSize = computedQuantiles.resultSize(idx), + jvmGcTime = computedQuantiles.jvmGcTime(idx), + resultSerializationTime = computedQuantiles.resultSerializationTime(idx), + gettingResultTime = computedQuantiles.gettingResultTime(idx), + schedulerDelay = computedQuantiles.schedulerDelay(idx), + peakExecutionMemory = computedQuantiles.peakExecutionMemory(idx), + memoryBytesSpilled = computedQuantiles.memoryBytesSpilled(idx), + diskBytesSpilled = computedQuantiles.diskBytesSpilled(idx), + + bytesRead = computedQuantiles.inputMetrics.bytesRead(idx), + recordsRead = computedQuantiles.inputMetrics.recordsRead(idx), + + bytesWritten = computedQuantiles.outputMetrics.bytesWritten(idx), + recordsWritten = computedQuantiles.outputMetrics.recordsWritten(idx), + + shuffleReadBytes = computedQuantiles.shuffleReadMetrics.readBytes(idx), + shuffleRecordsRead = computedQuantiles.shuffleReadMetrics.readRecords(idx), + shuffleRemoteBlocksFetched = + computedQuantiles.shuffleReadMetrics.remoteBlocksFetched(idx), + shuffleLocalBlocksFetched = computedQuantiles.shuffleReadMetrics.localBlocksFetched(idx), + shuffleFetchWaitTime = computedQuantiles.shuffleReadMetrics.fetchWaitTime(idx), + shuffleRemoteBytesRead = computedQuantiles.shuffleReadMetrics.remoteBytesRead(idx), + shuffleRemoteBytesReadToDisk = + computedQuantiles.shuffleReadMetrics.remoteBytesReadToDisk(idx), + shuffleTotalBlocksFetched = computedQuantiles.shuffleReadMetrics.totalBlocksFetched(idx), + + shuffleWriteBytes = computedQuantiles.shuffleWriteMetrics.writeBytes(idx), + shuffleWriteRecords = computedQuantiles.shuffleWriteMetrics.writeRecords(idx), + shuffleWriteTime = computedQuantiles.shuffleWriteMetrics.writeTime(idx)) + store.write(cached) + } + + Some(computedQuantiles) } + /** + * Whether to cache information about a specific metric quantile. We cache quantiles at every 0.05 + * step, which covers the default values used both in the API and in the stages page. 
+ */ + private def shouldCacheQuantile(q: Double): Boolean = (math.round(q * 100) % 5) == 0 + + private def quantileToString(q: Double): String = math.round(q * 100).toString + def taskList(stageId: Int, stageAttemptId: Int, maxTasks: Int): Seq[v1.TaskData] = { val stageKey = Array(stageId, stageAttemptId) store.view(classOf[TaskDataWrapper]).index("stage").first(stageKey).last(stageKey).reverse() - .max(maxTasks).asScala.map(_.info).toSeq.reverse + .max(maxTasks).asScala.map(_.toApi).toSeq.reverse } def taskList( @@ -219,18 +354,43 @@ private[spark] class AppStatusStore( offset: Int, length: Int, sortBy: v1.TaskSorting): Seq[v1.TaskData] = { + val (indexName, ascending) = sortBy match { + case v1.TaskSorting.ID => + (None, true) + case v1.TaskSorting.INCREASING_RUNTIME => + (Some(TaskIndexNames.EXEC_RUN_TIME), true) + case v1.TaskSorting.DECREASING_RUNTIME => + (Some(TaskIndexNames.EXEC_RUN_TIME), false) + } + taskList(stageId, stageAttemptId, offset, length, indexName, ascending) + } + + def taskList( + stageId: Int, + stageAttemptId: Int, + offset: Int, + length: Int, + sortBy: Option[String], + ascending: Boolean): Seq[v1.TaskData] = { val stageKey = Array(stageId, stageAttemptId) val base = store.view(classOf[TaskDataWrapper]) val indexed = sortBy match { - case v1.TaskSorting.ID => + case Some(index) => + base.index(index).parent(stageKey) + + case _ => + // Sort by ID, which is the "stage" index. base.index("stage").first(stageKey).last(stageKey) - case v1.TaskSorting.INCREASING_RUNTIME => - base.index("runtime").first(stageKey ++ Array(-1L)).last(stageKey ++ Array(Long.MaxValue)) - case v1.TaskSorting.DECREASING_RUNTIME => - base.index("runtime").first(stageKey ++ Array(Long.MaxValue)).last(stageKey ++ Array(-1L)) - .reverse() } - indexed.skip(offset).max(length).asScala.map(_.info).toSeq + + val ordered = if (ascending) indexed else indexed.reverse() + ordered.skip(offset).max(length).asScala.map(_.toApi).toSeq + } + + def executorSummary(stageId: Int, attemptId: Int): Map[String, v1.ExecutorStageSummary] = { + val stageKey = Array(stageId, attemptId) + store.view(classOf[ExecutorStageSummaryWrapper]).index("stage").first(stageKey).last(stageKey) + .asScala.map { exec => (exec.executorId -> exec.info) }.toMap } def rddList(cachedOnly: Boolean = true): Seq[v1.RDDStorageInfo] = { @@ -256,12 +416,6 @@ private[spark] class AppStatusStore( .map { t => (t.taskId, t) } .toMap - val stageKey = Array(stage.stageId, stage.attemptId) - val execs = store.view(classOf[ExecutorStageSummaryWrapper]).index("stage").first(stageKey) - .last(stageKey).closeableIterator().asScala - .map { exec => (exec.executorId -> exec.info) } - .toMap - new v1.StageData( stage.status, stage.stageId, @@ -295,7 +449,7 @@ private[spark] class AppStatusStore( stage.rddIds, stage.accumulatorUpdates, Some(tasks), - Some(execs), + Some(executorSummary(stage.stageId, stage.attemptId)), stage.killedTasksSummary) } @@ -352,22 +506,3 @@ private[spark] object AppStatusStore { } } - -/** - * Helper for getting distributions from nested metric types. 
- */ -private abstract class MetricHelper[I, O]( - rawMetrics: Seq[v1.TaskMetrics], - quantiles: Array[Double]) { - - def getSubmetrics(raw: v1.TaskMetrics): I - - def build: O - - val data: Seq[I] = rawMetrics.map(getSubmetrics) - - /** applies the given function to all input metrics, and returns the quantiles */ - def submetricQuantiles(f: I => Double): IndexedSeq[Double] = { - Distribution(data.map { d => f(d) }).get.getQuantiles(quantiles) - } -} diff --git a/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala b/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala new file mode 100644 index 0000000000000..341bd4e0cd016 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/status/AppStatusUtils.scala @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import org.apache.spark.status.api.v1.{TaskData, TaskMetrics} + +private[spark] object AppStatusUtils { + + def schedulerDelay(task: TaskData): Long = { + if (task.taskMetrics.isDefined && task.duration.isDefined) { + val m = task.taskMetrics.get + schedulerDelay(task.launchTime.getTime(), fetchStart(task), task.duration.get, + m.executorDeserializeTime, m.resultSerializationTime, m.executorRunTime) + } else { + 0L + } + } + + def gettingResultTime(task: TaskData): Long = { + gettingResultTime(task.launchTime.getTime(), fetchStart(task), task.duration.getOrElse(-1L)) + } + + def schedulerDelay( + launchTime: Long, + fetchStart: Long, + duration: Long, + deserializeTime: Long, + serializeTime: Long, + runTime: Long): Long = { + math.max(0, duration - runTime - deserializeTime - serializeTime - + gettingResultTime(launchTime, fetchStart, duration)) + } + + def gettingResultTime(launchTime: Long, fetchStart: Long, duration: Long): Long = { + if (fetchStart > 0) { + if (duration > 0) { + launchTime + duration - fetchStart + } else { + System.currentTimeMillis() - fetchStart + } + } else { + 0L + } + } + + private def fetchStart(task: TaskData): Long = { + if (task.resultFetchStart.isDefined) { + task.resultFetchStart.get.getTime() + } else { + -1 + } + } +} diff --git a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala index 305c2fafa6aac..4295e664e131c 100644 --- a/core/src/main/scala/org/apache/spark/status/LiveEntity.scala +++ b/core/src/main/scala/org/apache/spark/status/LiveEntity.scala @@ -22,6 +22,8 @@ import java.util.concurrent.atomic.AtomicInteger import scala.collection.mutable.HashMap +import com.google.common.collect.Interners + import org.apache.spark.JobExecutionStatus import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler.{AccumulableInfo, StageInfo, TaskInfo} @@ -119,7 +121,9 @@ private class LiveTask( 
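// A quick sanity check of the two AppStatusUtils helpers added above, using made-up numbers
// (illustrative only; the object is private[spark], so a real caller would live in the same
// package): a task launched at t = 1000 ms with a total duration of 500 ms, whose result fetch
// started at t = 1400 ms, and which reported 300 ms run time, 50 ms deserialization and
// 20 ms result serialization.
//
//   gettingResultTime(1000L, 1400L, 500L)               // 1000 + 500 - 1400 = 100 ms
//   schedulerDelay(1000L, 1400L, 500L, 50L, 20L, 300L)  // max(0, 500 - 300 - 50 - 20 - 100) = 30 ms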
import LiveEntityHelpers._ - private var recordedMetrics: v1.TaskMetrics = null + // The task metrics use a special value when no metrics have been reported. The special value is + // checked when calculating indexed values when writing to the store (see [[TaskDataWrapper]]). + private var metrics: v1.TaskMetrics = createMetrics(default = -1L) var errorMessage: Option[String] = None @@ -129,8 +133,8 @@ private class LiveTask( */ def updateMetrics(metrics: TaskMetrics): v1.TaskMetrics = { if (metrics != null) { - val old = recordedMetrics - recordedMetrics = new v1.TaskMetrics( + val old = this.metrics + val newMetrics = createMetrics( metrics.executorDeserializeTime, metrics.executorDeserializeCpuTime, metrics.executorRunTime, @@ -141,73 +145,35 @@ private class LiveTask( metrics.memoryBytesSpilled, metrics.diskBytesSpilled, metrics.peakExecutionMemory, - new v1.InputMetrics( - metrics.inputMetrics.bytesRead, - metrics.inputMetrics.recordsRead), - new v1.OutputMetrics( - metrics.outputMetrics.bytesWritten, - metrics.outputMetrics.recordsWritten), - new v1.ShuffleReadMetrics( - metrics.shuffleReadMetrics.remoteBlocksFetched, - metrics.shuffleReadMetrics.localBlocksFetched, - metrics.shuffleReadMetrics.fetchWaitTime, - metrics.shuffleReadMetrics.remoteBytesRead, - metrics.shuffleReadMetrics.remoteBytesReadToDisk, - metrics.shuffleReadMetrics.localBytesRead, - metrics.shuffleReadMetrics.recordsRead), - new v1.ShuffleWriteMetrics( - metrics.shuffleWriteMetrics.bytesWritten, - metrics.shuffleWriteMetrics.writeTime, - metrics.shuffleWriteMetrics.recordsWritten)) - if (old != null) calculateMetricsDelta(recordedMetrics, old) else recordedMetrics + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.remoteBlocksFetched, + metrics.shuffleReadMetrics.localBlocksFetched, + metrics.shuffleReadMetrics.fetchWaitTime, + metrics.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.remoteBytesReadToDisk, + metrics.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.writeTime, + metrics.shuffleWriteMetrics.recordsWritten) + + this.metrics = newMetrics + + // Only calculate the delta if the old metrics contain valid information, otherwise + // the new metrics are the delta. + if (old.executorDeserializeTime >= 0L) { + subtractMetrics(newMetrics, old) + } else { + newMetrics + } } else { null } } - /** - * Return a new TaskMetrics object containing the delta of the various fields of the given - * metrics objects. This is currently targeted at updating stage data, so it does not - * necessarily calculate deltas for all the fields. 
- */ - private def calculateMetricsDelta( - metrics: v1.TaskMetrics, - old: v1.TaskMetrics): v1.TaskMetrics = { - val shuffleWriteDelta = new v1.ShuffleWriteMetrics( - metrics.shuffleWriteMetrics.bytesWritten - old.shuffleWriteMetrics.bytesWritten, - 0L, - metrics.shuffleWriteMetrics.recordsWritten - old.shuffleWriteMetrics.recordsWritten) - - val shuffleReadDelta = new v1.ShuffleReadMetrics( - 0L, 0L, 0L, - metrics.shuffleReadMetrics.remoteBytesRead - old.shuffleReadMetrics.remoteBytesRead, - metrics.shuffleReadMetrics.remoteBytesReadToDisk - - old.shuffleReadMetrics.remoteBytesReadToDisk, - metrics.shuffleReadMetrics.localBytesRead - old.shuffleReadMetrics.localBytesRead, - metrics.shuffleReadMetrics.recordsRead - old.shuffleReadMetrics.recordsRead) - - val inputDelta = new v1.InputMetrics( - metrics.inputMetrics.bytesRead - old.inputMetrics.bytesRead, - metrics.inputMetrics.recordsRead - old.inputMetrics.recordsRead) - - val outputDelta = new v1.OutputMetrics( - metrics.outputMetrics.bytesWritten - old.outputMetrics.bytesWritten, - metrics.outputMetrics.recordsWritten - old.outputMetrics.recordsWritten) - - new v1.TaskMetrics( - 0L, 0L, - metrics.executorRunTime - old.executorRunTime, - metrics.executorCpuTime - old.executorCpuTime, - 0L, 0L, 0L, - metrics.memoryBytesSpilled - old.memoryBytesSpilled, - metrics.diskBytesSpilled - old.diskBytesSpilled, - 0L, - inputDelta, - outputDelta, - shuffleReadDelta, - shuffleWriteDelta) - } - override protected def doUpdate(): Any = { val duration = if (info.finished) { info.duration @@ -215,22 +181,48 @@ private class LiveTask( info.timeRunning(lastUpdateTime.getOrElse(System.currentTimeMillis())) } - val task = new v1.TaskData( + new TaskDataWrapper( info.taskId, info.index, info.attemptNumber, - new Date(info.launchTime), - if (info.gettingResult) Some(new Date(info.gettingResultTime)) else None, - Some(duration), - info.executorId, - info.host, - info.status, - info.taskLocality.toString(), + info.launchTime, + if (info.gettingResult) info.gettingResultTime else -1L, + duration, + weakIntern(info.executorId), + weakIntern(info.host), + weakIntern(info.status), + weakIntern(info.taskLocality.toString()), info.speculative, newAccumulatorInfos(info.accumulables), errorMessage, - Option(recordedMetrics)) - new TaskDataWrapper(task, stageId, stageAttemptId) + + metrics.executorDeserializeTime, + metrics.executorDeserializeCpuTime, + metrics.executorRunTime, + metrics.executorCpuTime, + metrics.resultSize, + metrics.jvmGcTime, + metrics.resultSerializationTime, + metrics.memoryBytesSpilled, + metrics.diskBytesSpilled, + metrics.peakExecutionMemory, + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.remoteBlocksFetched, + metrics.shuffleReadMetrics.localBlocksFetched, + metrics.shuffleReadMetrics.fetchWaitTime, + metrics.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.remoteBytesReadToDisk, + metrics.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.writeTime, + metrics.shuffleWriteMetrics.recordsWritten, + + stageId, + stageAttemptId) } } @@ -313,50 +305,19 @@ private class LiveExecutor(val executorId: String, _addTime: Long) extends LiveE } -/** Metrics tracked per stage (both total and per executor). 
*/ -private class MetricsTracker { - var executorRunTime = 0L - var executorCpuTime = 0L - var inputBytes = 0L - var inputRecords = 0L - var outputBytes = 0L - var outputRecords = 0L - var shuffleReadBytes = 0L - var shuffleReadRecords = 0L - var shuffleWriteBytes = 0L - var shuffleWriteRecords = 0L - var memoryBytesSpilled = 0L - var diskBytesSpilled = 0L - - def update(delta: v1.TaskMetrics): Unit = { - executorRunTime += delta.executorRunTime - executorCpuTime += delta.executorCpuTime - inputBytes += delta.inputMetrics.bytesRead - inputRecords += delta.inputMetrics.recordsRead - outputBytes += delta.outputMetrics.bytesWritten - outputRecords += delta.outputMetrics.recordsWritten - shuffleReadBytes += delta.shuffleReadMetrics.localBytesRead + - delta.shuffleReadMetrics.remoteBytesRead - shuffleReadRecords += delta.shuffleReadMetrics.recordsRead - shuffleWriteBytes += delta.shuffleWriteMetrics.bytesWritten - shuffleWriteRecords += delta.shuffleWriteMetrics.recordsWritten - memoryBytesSpilled += delta.memoryBytesSpilled - diskBytesSpilled += delta.diskBytesSpilled - } - -} - private class LiveExecutorStageSummary( stageId: Int, attemptId: Int, executorId: String) extends LiveEntity { + import LiveEntityHelpers._ + var taskTime = 0L var succeededTasks = 0 var failedTasks = 0 var killedTasks = 0 - val metrics = new MetricsTracker() + var metrics = createMetrics(default = 0L) override protected def doUpdate(): Any = { val info = new v1.ExecutorStageSummary( @@ -364,14 +325,14 @@ private class LiveExecutorStageSummary( failedTasks, succeededTasks, killedTasks, - metrics.inputBytes, - metrics.inputRecords, - metrics.outputBytes, - metrics.outputRecords, - metrics.shuffleReadBytes, - metrics.shuffleReadRecords, - metrics.shuffleWriteBytes, - metrics.shuffleWriteRecords, + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.remoteBytesRead + metrics.shuffleReadMetrics.localBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.recordsWritten, metrics.memoryBytesSpilled, metrics.diskBytesSpilled) new ExecutorStageSummaryWrapper(stageId, attemptId, executorId, info) @@ -402,7 +363,9 @@ private class LiveStage extends LiveEntity { var firstLaunchTime = Long.MaxValue - val metrics = new MetricsTracker() + var localitySummary: Map[String, Long] = Map() + + var metrics = createMetrics(default = 0L) val executorSummaries = new HashMap[String, LiveExecutorStageSummary]() @@ -435,14 +398,14 @@ private class LiveStage extends LiveEntity { info.completionTime.map(new Date(_)), info.failureReason, - metrics.inputBytes, - metrics.inputRecords, - metrics.outputBytes, - metrics.outputRecords, - metrics.shuffleReadBytes, - metrics.shuffleReadRecords, - metrics.shuffleWriteBytes, - metrics.shuffleWriteRecords, + metrics.inputMetrics.bytesRead, + metrics.inputMetrics.recordsRead, + metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten, + metrics.shuffleReadMetrics.localBytesRead + metrics.shuffleReadMetrics.remoteBytesRead, + metrics.shuffleReadMetrics.recordsRead, + metrics.shuffleWriteMetrics.bytesWritten, + metrics.shuffleWriteMetrics.recordsWritten, metrics.memoryBytesSpilled, metrics.diskBytesSpilled, @@ -459,13 +422,15 @@ private class LiveStage extends LiveEntity { } override protected def doUpdate(): Any = { - new StageDataWrapper(toApi(), jobIds) + new StageDataWrapper(toApi(), jobIds, 
localitySummary) } } private class LiveRDDPartition(val blockName: String) { + import LiveEntityHelpers._ + // Pointers used by RDDPartitionSeq. @volatile var prev: LiveRDDPartition = null @volatile var next: LiveRDDPartition = null @@ -485,7 +450,7 @@ private class LiveRDDPartition(val blockName: String) { diskUsed: Long): Unit = { value = new v1.RDDPartitionInfo( blockName, - storageLevel, + weakIntern(storageLevel), memoryUsed, diskUsed, executors) @@ -495,6 +460,8 @@ private class LiveRDDPartition(val blockName: String) { private class LiveRDDDistribution(exec: LiveExecutor) { + import LiveEntityHelpers._ + val executorId = exec.executorId var memoryUsed = 0L var diskUsed = 0L @@ -508,7 +475,7 @@ private class LiveRDDDistribution(exec: LiveExecutor) { def toApi(): v1.RDDDataDistribution = { if (lastUpdate == null) { lastUpdate = new v1.RDDDataDistribution( - exec.hostPort, + weakIntern(exec.hostPort), memoryUsed, exec.maxMemory - exec.memoryUsed, diskUsed, @@ -524,7 +491,9 @@ private class LiveRDDDistribution(exec: LiveExecutor) { private class LiveRDD(val info: RDDInfo) extends LiveEntity { - var storageLevel: String = info.storageLevel.description + import LiveEntityHelpers._ + + var storageLevel: String = weakIntern(info.storageLevel.description) var memoryUsed = 0L var diskUsed = 0L @@ -533,6 +502,10 @@ private class LiveRDD(val info: RDDInfo) extends LiveEntity { private val distributions = new HashMap[String, LiveRDDDistribution]() + def setStorageLevel(level: String): Unit = { + this.storageLevel = weakIntern(level) + } + def partition(blockName: String): LiveRDDPartition = { partitions.getOrElseUpdate(blockName, { val part = new LiveRDDPartition(blockName) @@ -593,6 +566,9 @@ private class SchedulerPool(name: String) extends LiveEntity { private object LiveEntityHelpers { + private val stringInterner = Interners.newWeakInterner[String]() + + def newAccumulatorInfos(accums: Iterable[AccumulableInfo]): Seq[v1.AccumulableInfo] = { accums .filter { acc => @@ -604,13 +580,119 @@ private object LiveEntityHelpers { .map { acc => new v1.AccumulableInfo( acc.id, - acc.name.orNull, + acc.name.map(weakIntern).orNull, acc.update.map(_.toString()), acc.value.map(_.toString()).orNull) } .toSeq } + /** String interning to reduce the memory usage. 
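 * The interner is Guava's weak interner, so hot values such as executor IDs, hosts, task
 * statuses, locality levels and storage level descriptions share a single String instance while
 * remaining collectable once no live entity references them, e.g.:
 * {{{
 *   weakIntern("PROCESS_LOCAL") eq weakIntern("PROCESS_LOCAL")   // true: same canonical instance
 * }}}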
*/ + def weakIntern(s: String): String = { + stringInterner.intern(s) + } + + // scalastyle:off argcount + def createMetrics( + executorDeserializeTime: Long, + executorDeserializeCpuTime: Long, + executorRunTime: Long, + executorCpuTime: Long, + resultSize: Long, + jvmGcTime: Long, + resultSerializationTime: Long, + memoryBytesSpilled: Long, + diskBytesSpilled: Long, + peakExecutionMemory: Long, + inputBytesRead: Long, + inputRecordsRead: Long, + outputBytesWritten: Long, + outputRecordsWritten: Long, + shuffleRemoteBlocksFetched: Long, + shuffleLocalBlocksFetched: Long, + shuffleFetchWaitTime: Long, + shuffleRemoteBytesRead: Long, + shuffleRemoteBytesReadToDisk: Long, + shuffleLocalBytesRead: Long, + shuffleRecordsRead: Long, + shuffleBytesWritten: Long, + shuffleWriteTime: Long, + shuffleRecordsWritten: Long): v1.TaskMetrics = { + new v1.TaskMetrics( + executorDeserializeTime, + executorDeserializeCpuTime, + executorRunTime, + executorCpuTime, + resultSize, + jvmGcTime, + resultSerializationTime, + memoryBytesSpilled, + diskBytesSpilled, + peakExecutionMemory, + new v1.InputMetrics( + inputBytesRead, + inputRecordsRead), + new v1.OutputMetrics( + outputBytesWritten, + outputRecordsWritten), + new v1.ShuffleReadMetrics( + shuffleRemoteBlocksFetched, + shuffleLocalBlocksFetched, + shuffleFetchWaitTime, + shuffleRemoteBytesRead, + shuffleRemoteBytesReadToDisk, + shuffleLocalBytesRead, + shuffleRecordsRead), + new v1.ShuffleWriteMetrics( + shuffleBytesWritten, + shuffleWriteTime, + shuffleRecordsWritten)) + } + // scalastyle:on argcount + + def createMetrics(default: Long): v1.TaskMetrics = { + createMetrics(default, default, default, default, default, default, default, default, + default, default, default, default, default, default, default, default, + default, default, default, default, default, default, default, default) + } + + /** Add m2 values to m1. */ + def addMetrics(m1: v1.TaskMetrics, m2: v1.TaskMetrics): v1.TaskMetrics = addMetrics(m1, m2, 1) + + /** Subtract m2 values from m1. 
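 * Together with `addMetrics`, this is what lets `LiveTask.updateMetrics` above report a
 * per-update delta that callers can fold into stage and executor aggregates; a rough sketch
 * (the actual aggregation happens in the listener, which is not shown here):
 * {{{
 *   val delta = subtractMetrics(newMetrics, old)       // what changed since the last update
 *   stage.metrics = addMetrics(stage.metrics, delta)   // roll the change into the running totals
 * }}}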
*/ + def subtractMetrics(m1: v1.TaskMetrics, m2: v1.TaskMetrics): v1.TaskMetrics = { + addMetrics(m1, m2, -1) + } + + private def addMetrics(m1: v1.TaskMetrics, m2: v1.TaskMetrics, mult: Int): v1.TaskMetrics = { + createMetrics( + m1.executorDeserializeTime + m2.executorDeserializeTime * mult, + m1.executorDeserializeCpuTime + m2.executorDeserializeCpuTime * mult, + m1.executorRunTime + m2.executorRunTime * mult, + m1.executorCpuTime + m2.executorCpuTime * mult, + m1.resultSize + m2.resultSize * mult, + m1.jvmGcTime + m2.jvmGcTime * mult, + m1.resultSerializationTime + m2.resultSerializationTime * mult, + m1.memoryBytesSpilled + m2.memoryBytesSpilled * mult, + m1.diskBytesSpilled + m2.diskBytesSpilled * mult, + m1.peakExecutionMemory + m2.peakExecutionMemory * mult, + m1.inputMetrics.bytesRead + m2.inputMetrics.bytesRead * mult, + m1.inputMetrics.recordsRead + m2.inputMetrics.recordsRead * mult, + m1.outputMetrics.bytesWritten + m2.outputMetrics.bytesWritten * mult, + m1.outputMetrics.recordsWritten + m2.outputMetrics.recordsWritten * mult, + m1.shuffleReadMetrics.remoteBlocksFetched + m2.shuffleReadMetrics.remoteBlocksFetched * mult, + m1.shuffleReadMetrics.localBlocksFetched + m2.shuffleReadMetrics.localBlocksFetched * mult, + m1.shuffleReadMetrics.fetchWaitTime + m2.shuffleReadMetrics.fetchWaitTime * mult, + m1.shuffleReadMetrics.remoteBytesRead + m2.shuffleReadMetrics.remoteBytesRead * mult, + m1.shuffleReadMetrics.remoteBytesReadToDisk + + m2.shuffleReadMetrics.remoteBytesReadToDisk * mult, + m1.shuffleReadMetrics.localBytesRead + m2.shuffleReadMetrics.localBytesRead * mult, + m1.shuffleReadMetrics.recordsRead + m2.shuffleReadMetrics.recordsRead * mult, + m1.shuffleWriteMetrics.bytesWritten + m2.shuffleWriteMetrics.bytesWritten * mult, + m1.shuffleWriteMetrics.writeTime + m2.shuffleWriteMetrics.writeTime * mult, + m1.shuffleWriteMetrics.recordsWritten + m2.shuffleWriteMetrics.recordsWritten * mult) + } + } /** diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala index 3b879545b3d2e..96249e4bfd5fa 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/StagesResource.scala @@ -87,7 +87,8 @@ private[v1] class StagesResource extends BaseAppResource { } } - ui.store.taskSummary(stageId, stageAttemptId, quantiles) + ui.store.taskSummary(stageId, stageAttemptId, quantiles).getOrElse( + throw new NotFoundException(s"No tasks reported metrics for $stageId / $stageAttemptId yet.")) } @GET diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala index 45eaf935fb083..7d8e4de3c8efb 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/api.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/api.scala @@ -261,6 +261,9 @@ class TaskMetricDistributions private[spark]( val resultSize: IndexedSeq[Double], val jvmGcTime: IndexedSeq[Double], val resultSerializationTime: IndexedSeq[Double], + val gettingResultTime: IndexedSeq[Double], + val schedulerDelay: IndexedSeq[Double], + val peakExecutionMemory: IndexedSeq[Double], val memoryBytesSpilled: IndexedSeq[Double], val diskBytesSpilled: IndexedSeq[Double], diff --git a/core/src/main/scala/org/apache/spark/status/storeTypes.scala b/core/src/main/scala/org/apache/spark/status/storeTypes.scala index 1cfd30df49091..c9cb996a55fcc 100644 --- 
a/core/src/main/scala/org/apache/spark/status/storeTypes.scala +++ b/core/src/main/scala/org/apache/spark/status/storeTypes.scala @@ -17,9 +17,11 @@ package org.apache.spark.status -import java.lang.{Integer => JInteger, Long => JLong} +import java.lang.{Long => JLong} +import java.util.Date import com.fasterxml.jackson.annotation.JsonIgnore +import com.fasterxml.jackson.databind.annotation.JsonDeserialize import org.apache.spark.status.KVUtils._ import org.apache.spark.status.api.v1._ @@ -49,10 +51,10 @@ private[spark] class ApplicationEnvironmentInfoWrapper(val info: ApplicationEnvi private[spark] class ExecutorSummaryWrapper(val info: ExecutorSummary) { @JsonIgnore @KVIndex - private[this] val id: String = info.id + private def id: String = info.id @JsonIgnore @KVIndex("active") - private[this] val active: Boolean = info.isActive + private def active: Boolean = info.isActive @JsonIgnore @KVIndex("host") val host: String = info.hostPort.split(":")(0) @@ -69,51 +71,271 @@ private[spark] class JobDataWrapper( val skippedStages: Set[Int]) { @JsonIgnore @KVIndex - private[this] val id: Int = info.jobId + private def id: Int = info.jobId } private[spark] class StageDataWrapper( val info: StageData, - val jobIds: Set[Int]) { + val jobIds: Set[Int], + @JsonDeserialize(contentAs = classOf[JLong]) + val locality: Map[String, Long]) { @JsonIgnore @KVIndex - def id: Array[Int] = Array(info.stageId, info.attemptId) + private[this] val id: Array[Int] = Array(info.stageId, info.attemptId) @JsonIgnore @KVIndex("stageId") - def stageId: Int = info.stageId + private def stageId: Int = info.stageId + @JsonIgnore @KVIndex("active") + private def active: Boolean = info.status == StageStatus.ACTIVE + +} + +/** + * Tasks have a lot of indices that are used in a few different places. This object keeps logical + * names for these indices, mapped to short strings to save space when using a disk store. + */ +private[spark] object TaskIndexNames { + final val ACCUMULATORS = "acc" + final val ATTEMPT = "att" + final val DESER_CPU_TIME = "dct" + final val DESER_TIME = "des" + final val DISK_SPILL = "dbs" + final val DURATION = "dur" + final val ERROR = "err" + final val EXECUTOR = "exe" + final val EXEC_CPU_TIME = "ect" + final val EXEC_RUN_TIME = "ert" + final val GC_TIME = "gc" + final val GETTING_RESULT_TIME = "grt" + final val INPUT_RECORDS = "ir" + final val INPUT_SIZE = "is" + final val LAUNCH_TIME = "lt" + final val LOCALITY = "loc" + final val MEM_SPILL = "mbs" + final val OUTPUT_RECORDS = "or" + final val OUTPUT_SIZE = "os" + final val PEAK_MEM = "pem" + final val RESULT_SIZE = "rs" + final val SCHEDULER_DELAY = "dly" + final val SER_TIME = "rst" + final val SHUFFLE_LOCAL_BLOCKS = "slbl" + final val SHUFFLE_READ_RECORDS = "srr" + final val SHUFFLE_READ_TIME = "srt" + final val SHUFFLE_REMOTE_BLOCKS = "srbl" + final val SHUFFLE_REMOTE_READS = "srby" + final val SHUFFLE_REMOTE_READS_TO_DISK = "srbd" + final val SHUFFLE_TOTAL_READS = "stby" + final val SHUFFLE_TOTAL_BLOCKS = "stbl" + final val SHUFFLE_WRITE_RECORDS = "swr" + final val SHUFFLE_WRITE_SIZE = "sws" + final val SHUFFLE_WRITE_TIME = "swt" + final val STAGE = "stage" + final val STATUS = "sta" + final val TASK_INDEX = "idx" } /** - * The task information is always indexed with the stage ID, since that is how the UI and API - * consume it. That means every indexed value has the stage ID and attempt ID included, aside - * from the actual data being indexed. 
+ * Unlike other data types, the task data wrapper does not keep a reference to the API's TaskData. + * That is to save memory, since for large applications there can be a large number of these + * elements (by default up to 100,000 per stage), and every bit of wasted memory adds up. + * + * It also contains many secondary indices, which are used to sort data efficiently in the UI at the + * expense of storage space (and slower write times). */ private[spark] class TaskDataWrapper( - val info: TaskData, + // Storing this as an object actually saves memory; it's also used as the key in the in-memory + // store, so in that case you'd save the extra copy of the value here. + @KVIndexParam + val taskId: JLong, + @KVIndexParam(value = TaskIndexNames.TASK_INDEX, parent = TaskIndexNames.STAGE) + val index: Int, + @KVIndexParam(value = TaskIndexNames.ATTEMPT, parent = TaskIndexNames.STAGE) + val attempt: Int, + @KVIndexParam(value = TaskIndexNames.LAUNCH_TIME, parent = TaskIndexNames.STAGE) + val launchTime: Long, + val resultFetchStart: Long, + @KVIndexParam(value = TaskIndexNames.DURATION, parent = TaskIndexNames.STAGE) + val duration: Long, + @KVIndexParam(value = TaskIndexNames.EXECUTOR, parent = TaskIndexNames.STAGE) + val executorId: String, + val host: String, + @KVIndexParam(value = TaskIndexNames.STATUS, parent = TaskIndexNames.STAGE) + val status: String, + @KVIndexParam(value = TaskIndexNames.LOCALITY, parent = TaskIndexNames.STAGE) + val taskLocality: String, + val speculative: Boolean, + val accumulatorUpdates: Seq[AccumulableInfo], + val errorMessage: Option[String], + + // The following is an exploded view of a TaskMetrics API object. This saves 5 objects + // (= 80 bytes of Java object overhead) per instance of this wrapper. If the first value + // (executorDeserializeTime) is -1L, it means the metrics for this task have not been + // recorded. 
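// (For illustration: readers first check `hasMetrics` below, e.g.
//    if (wrapper.hasMetrics) Some(wrapper.executorRunTime) else None
//  and the derived indexed values such as `schedulerDelay` likewise return -1L in that case, so
//  under an ascending index scan, tasks without metrics sort ahead of tasks that have them.)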
+ @KVIndexParam(value = TaskIndexNames.DESER_TIME, parent = TaskIndexNames.STAGE) + val executorDeserializeTime: Long, + @KVIndexParam(value = TaskIndexNames.DESER_CPU_TIME, parent = TaskIndexNames.STAGE) + val executorDeserializeCpuTime: Long, + @KVIndexParam(value = TaskIndexNames.EXEC_RUN_TIME, parent = TaskIndexNames.STAGE) + val executorRunTime: Long, + @KVIndexParam(value = TaskIndexNames.EXEC_CPU_TIME, parent = TaskIndexNames.STAGE) + val executorCpuTime: Long, + @KVIndexParam(value = TaskIndexNames.RESULT_SIZE, parent = TaskIndexNames.STAGE) + val resultSize: Long, + @KVIndexParam(value = TaskIndexNames.GC_TIME, parent = TaskIndexNames.STAGE) + val jvmGcTime: Long, + @KVIndexParam(value = TaskIndexNames.SER_TIME, parent = TaskIndexNames.STAGE) + val resultSerializationTime: Long, + @KVIndexParam(value = TaskIndexNames.MEM_SPILL, parent = TaskIndexNames.STAGE) + val memoryBytesSpilled: Long, + @KVIndexParam(value = TaskIndexNames.DISK_SPILL, parent = TaskIndexNames.STAGE) + val diskBytesSpilled: Long, + @KVIndexParam(value = TaskIndexNames.PEAK_MEM, parent = TaskIndexNames.STAGE) + val peakExecutionMemory: Long, + @KVIndexParam(value = TaskIndexNames.INPUT_SIZE, parent = TaskIndexNames.STAGE) + val inputBytesRead: Long, + @KVIndexParam(value = TaskIndexNames.INPUT_RECORDS, parent = TaskIndexNames.STAGE) + val inputRecordsRead: Long, + @KVIndexParam(value = TaskIndexNames.OUTPUT_SIZE, parent = TaskIndexNames.STAGE) + val outputBytesWritten: Long, + @KVIndexParam(value = TaskIndexNames.OUTPUT_RECORDS, parent = TaskIndexNames.STAGE) + val outputRecordsWritten: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_REMOTE_BLOCKS, parent = TaskIndexNames.STAGE) + val shuffleRemoteBlocksFetched: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_LOCAL_BLOCKS, parent = TaskIndexNames.STAGE) + val shuffleLocalBlocksFetched: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_READ_TIME, parent = TaskIndexNames.STAGE) + val shuffleFetchWaitTime: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_REMOTE_READS, parent = TaskIndexNames.STAGE) + val shuffleRemoteBytesRead: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_REMOTE_READS_TO_DISK, + parent = TaskIndexNames.STAGE) + val shuffleRemoteBytesReadToDisk: Long, + val shuffleLocalBytesRead: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_READ_RECORDS, parent = TaskIndexNames.STAGE) + val shuffleRecordsRead: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_WRITE_SIZE, parent = TaskIndexNames.STAGE) + val shuffleBytesWritten: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_WRITE_TIME, parent = TaskIndexNames.STAGE) + val shuffleWriteTime: Long, + @KVIndexParam(value = TaskIndexNames.SHUFFLE_WRITE_RECORDS, parent = TaskIndexNames.STAGE) + val shuffleRecordsWritten: Long, + val stageId: Int, val stageAttemptId: Int) { - @JsonIgnore @KVIndex - def id: Long = info.taskId + def hasMetrics: Boolean = executorDeserializeTime >= 0 + + def toApi: TaskData = { + val metrics = if (hasMetrics) { + Some(new TaskMetrics( + executorDeserializeTime, + executorDeserializeCpuTime, + executorRunTime, + executorCpuTime, + resultSize, + jvmGcTime, + resultSerializationTime, + memoryBytesSpilled, + diskBytesSpilled, + peakExecutionMemory, + new InputMetrics( + inputBytesRead, + inputRecordsRead), + new OutputMetrics( + outputBytesWritten, + outputRecordsWritten), + new ShuffleReadMetrics( + shuffleRemoteBlocksFetched, + shuffleLocalBlocksFetched, + shuffleFetchWaitTime, + shuffleRemoteBytesRead, + shuffleRemoteBytesReadToDisk, + 
shuffleLocalBytesRead, + shuffleRecordsRead), + new ShuffleWriteMetrics( + shuffleBytesWritten, + shuffleWriteTime, + shuffleRecordsWritten))) + } else { + None + } - @JsonIgnore @KVIndex("stage") - def stage: Array[Int] = Array(stageId, stageAttemptId) + new TaskData( + taskId, + index, + attempt, + new Date(launchTime), + if (resultFetchStart > 0L) Some(new Date(resultFetchStart)) else None, + if (duration > 0L) Some(duration) else None, + executorId, + host, + status, + taskLocality, + speculative, + accumulatorUpdates, + errorMessage, + metrics) + } + + @JsonIgnore @KVIndex(TaskIndexNames.STAGE) + private def stage: Array[Int] = Array(stageId, stageAttemptId) - @JsonIgnore @KVIndex("runtime") - def runtime: Array[AnyRef] = { - val _runtime = info.taskMetrics.map(_.executorRunTime).getOrElse(-1L) - Array(stageId: JInteger, stageAttemptId: JInteger, _runtime: JLong) + @JsonIgnore @KVIndex(value = TaskIndexNames.SCHEDULER_DELAY, parent = TaskIndexNames.STAGE) + def schedulerDelay: Long = { + if (hasMetrics) { + AppStatusUtils.schedulerDelay(launchTime, resultFetchStart, duration, executorDeserializeTime, + resultSerializationTime, executorRunTime) + } else { + -1L + } } - @JsonIgnore @KVIndex("startTime") - def startTime: Array[AnyRef] = { - Array(stageId: JInteger, stageAttemptId: JInteger, info.launchTime.getTime(): JLong) + @JsonIgnore @KVIndex(value = TaskIndexNames.GETTING_RESULT_TIME, parent = TaskIndexNames.STAGE) + def gettingResultTime: Long = { + if (hasMetrics) { + AppStatusUtils.gettingResultTime(launchTime, resultFetchStart, duration) + } else { + -1L + } } - @JsonIgnore @KVIndex("active") - def active: Boolean = info.duration.isEmpty + /** + * Sorting by accumulators is a little weird, and the previous behavior would generate + * insanely long keys in the index. So this implementation just considers the first + * accumulator and its String representation. + */ + @JsonIgnore @KVIndex(value = TaskIndexNames.ACCUMULATORS, parent = TaskIndexNames.STAGE) + private def accumulators: String = { + if (accumulatorUpdates.nonEmpty) { + val acc = accumulatorUpdates.head + s"${acc.name}:${acc.value}" + } else { + "" + } + } + + @JsonIgnore @KVIndex(value = TaskIndexNames.SHUFFLE_TOTAL_READS, parent = TaskIndexNames.STAGE) + private def shuffleTotalReads: Long = { + if (hasMetrics) { + shuffleLocalBytesRead + shuffleRemoteBytesRead + } else { + -1L + } + } + + @JsonIgnore @KVIndex(value = TaskIndexNames.SHUFFLE_TOTAL_BLOCKS, parent = TaskIndexNames.STAGE) + private def shuffleTotalBlocks: Long = { + if (hasMetrics) { + shuffleLocalBlocksFetched + shuffleRemoteBlocksFetched + } else { + -1L + } + } + + @JsonIgnore @KVIndex(value = TaskIndexNames.ERROR, parent = TaskIndexNames.STAGE) + private def error: String = if (errorMessage.isDefined) errorMessage.get else "" } @@ -134,10 +356,13 @@ private[spark] class ExecutorStageSummaryWrapper( val info: ExecutorStageSummary) { @JsonIgnore @KVIndex - val id: Array[Any] = Array(stageId, stageAttemptId, executorId) + private val _id: Array[Any] = Array(stageId, stageAttemptId, executorId) @JsonIgnore @KVIndex("stage") - private[this] val stage: Array[Int] = Array(stageId, stageAttemptId) + private def stage: Array[Int] = Array(stageId, stageAttemptId) + + @JsonIgnore + def id: Array[Any] = _id } @@ -203,3 +428,53 @@ private[spark] class AppSummary( def id: String = classOf[AppSummary].getName() } + +/** + * A cached view of a specific quantile for one stage attempt's metrics. 
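 * Rows are keyed by (stageId, stageAttemptId, quantile) and also carry a "stage" index, so a
 * single cached entry can be fetched directly; a sketch of a lookup, assuming a KVStore handle
 * named `store` (the read throws if the entry has not been written yet):
 * {{{
 *   store.read(classOf[CachedQuantile], Array(stageId, stageAttemptId, "25"))   // 25th percentile
 * }}}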
+ */ +private[spark] class CachedQuantile( + val stageId: Int, + val stageAttemptId: Int, + val quantile: String, + val taskCount: Long, + + // The following fields are an exploded view of a single entry for TaskMetricDistributions. + val executorDeserializeTime: Double, + val executorDeserializeCpuTime: Double, + val executorRunTime: Double, + val executorCpuTime: Double, + val resultSize: Double, + val jvmGcTime: Double, + val resultSerializationTime: Double, + val gettingResultTime: Double, + val schedulerDelay: Double, + val peakExecutionMemory: Double, + val memoryBytesSpilled: Double, + val diskBytesSpilled: Double, + + val bytesRead: Double, + val recordsRead: Double, + + val bytesWritten: Double, + val recordsWritten: Double, + + val shuffleReadBytes: Double, + val shuffleRecordsRead: Double, + val shuffleRemoteBlocksFetched: Double, + val shuffleLocalBlocksFetched: Double, + val shuffleFetchWaitTime: Double, + val shuffleRemoteBytesRead: Double, + val shuffleRemoteBytesReadToDisk: Double, + val shuffleTotalBlocksFetched: Double, + + val shuffleWriteBytes: Double, + val shuffleWriteRecords: Double, + val shuffleWriteTime: Double) { + + @KVIndex @JsonIgnore + def id: Array[Any] = Array(stageId, stageAttemptId, quantile) + + @KVIndex("stage") @JsonIgnore + def stage: Array[Int] = Array(stageId, stageAttemptId) + +} diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala index 41d42b52430a5..95c12b1e73653 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala @@ -87,7 +87,9 @@ private[ui] class ExecutorTable(stage: StageData, store: AppStatusStore) { } private def createExecutorTable(stage: StageData) : Seq[Node] = { - stage.executorSummary.getOrElse(Map.empty).toSeq.sortBy(_._1).map { case (k, v) => + val executorSummary = store.executorSummary(stage.stageId, stage.attemptId) + + executorSummary.toSeq.sortBy(_._1).map { case (k, v) => val executor = store.asOption(store.executorSummary(k))
    - } - } - def getFormattedSizeQuantiles(data: Seq[Double]): Seq[Elem] = { - getDistributionQuantiles(data).map(d => ) + val summaryTable = metricsSummary.map { metrics => + def timeQuantiles(data: IndexedSeq[Double]): Seq[Node] = { + data.map { millis => + } + } - val deserializationTimes = validTasks.map { task => - task.taskMetrics.get.executorDeserializeTime.toDouble - } - val deserializationQuantiles = - +: getFormattedTimeQuantiles(deserializationTimes) - - val serviceTimes = validTasks.map(_.taskMetrics.get.executorRunTime.toDouble) - val serviceQuantiles = +: getFormattedTimeQuantiles(serviceTimes) - - val gcTimes = validTasks.map(_.taskMetrics.get.jvmGcTime.toDouble) - val gcQuantiles = - +: getFormattedTimeQuantiles(gcTimes) - - val serializationTimes = validTasks.map(_.taskMetrics.get.resultSerializationTime.toDouble) - val serializationQuantiles = - +: getFormattedTimeQuantiles(serializationTimes) - - val gettingResultTimes = validTasks.map(getGettingResultTime(_, currentTime).toDouble) - val gettingResultQuantiles = - +: - getFormattedTimeQuantiles(gettingResultTimes) - - val peakExecutionMemory = validTasks.map(_.taskMetrics.get.peakExecutionMemory.toDouble) - val peakExecutionMemoryQuantiles = { - +: getFormattedSizeQuantiles(peakExecutionMemory) + def sizeQuantiles(data: IndexedSeq[Double]): Seq[Node] = { + data.map { size => + } + } - // The scheduler delay includes the network delay to send the task to the worker - // machine and to send back the result (but not the time to fetch the task result, - // if it needed to be fetched from the block manager on the worker). - val schedulerDelays = validTasks.map { task => - getSchedulerDelay(task, task.taskMetrics.get, currentTime).toDouble - } - val schedulerDelayTitle = - val schedulerDelayQuantiles = schedulerDelayTitle +: - getFormattedTimeQuantiles(schedulerDelays) - def getFormattedSizeQuantilesWithRecords(data: Seq[Double], records: Seq[Double]) - : Seq[Elem] = { - val recordDist = getDistributionQuantiles(records).iterator - getDistributionQuantiles(data).map(d => - - ) + def sizeQuantilesWithRecords( + data: IndexedSeq[Double], + records: IndexedSeq[Double]) : Seq[Node] = { + data.zip(records).map { case (d, r) => + } + } - val inputSizes = validTasks.map(_.taskMetrics.get.inputMetrics.bytesRead.toDouble) - val inputRecords = validTasks.map(_.taskMetrics.get.inputMetrics.recordsRead.toDouble) - val inputQuantiles = +: - getFormattedSizeQuantilesWithRecords(inputSizes, inputRecords) + def titleCell(title: String, tooltip: String): Seq[Node] = { + + } - val outputSizes = validTasks.map(_.taskMetrics.get.outputMetrics.bytesWritten.toDouble) - val outputRecords = validTasks.map(_.taskMetrics.get.outputMetrics.recordsWritten.toDouble) - val outputQuantiles = +: - getFormattedSizeQuantilesWithRecords(outputSizes, outputRecords) + def simpleTitleCell(title: String): Seq[Node] = - val shuffleReadBlockedTimes = validTasks.map { task => - task.taskMetrics.get.shuffleReadMetrics.fetchWaitTime.toDouble - } - val shuffleReadBlockedQuantiles = - +: - getFormattedTimeQuantiles(shuffleReadBlockedTimes) - - val shuffleReadTotalSizes = validTasks.map { task => - totalBytesRead(task.taskMetrics.get.shuffleReadMetrics).toDouble - } - val shuffleReadTotalRecords = validTasks.map { task => - task.taskMetrics.get.shuffleReadMetrics.recordsRead.toDouble - } - val shuffleReadTotalQuantiles = - +: - getFormattedSizeQuantilesWithRecords(shuffleReadTotalSizes, shuffleReadTotalRecords) - - val shuffleReadRemoteSizes = validTasks.map { task => - 
task.taskMetrics.get.shuffleReadMetrics.remoteBytesRead.toDouble - } - val shuffleReadRemoteQuantiles = - +: - getFormattedSizeQuantiles(shuffleReadRemoteSizes) - - val shuffleWriteSizes = validTasks.map { task => - task.taskMetrics.get.shuffleWriteMetrics.bytesWritten.toDouble - } + val deserializationQuantiles = titleCell("Task Deserialization Time", + ToolTips.TASK_DESERIALIZATION_TIME) ++ timeQuantiles(metrics.executorDeserializeTime) - val shuffleWriteRecords = validTasks.map { task => - task.taskMetrics.get.shuffleWriteMetrics.recordsWritten.toDouble - } + val serviceQuantiles = simpleTitleCell("Duration") ++ timeQuantiles(metrics.executorRunTime) - val shuffleWriteQuantiles = +: - getFormattedSizeQuantilesWithRecords(shuffleWriteSizes, shuffleWriteRecords) + val gcQuantiles = titleCell("GC Time", ToolTips.GC_TIME) ++ timeQuantiles(metrics.jvmGcTime) - val memoryBytesSpilledSizes = validTasks.map(_.taskMetrics.get.memoryBytesSpilled.toDouble) - val memoryBytesSpilledQuantiles = +: - getFormattedSizeQuantiles(memoryBytesSpilledSizes) + val serializationQuantiles = titleCell("Result Serialization Time", + ToolTips.RESULT_SERIALIZATION_TIME) ++ timeQuantiles(metrics.resultSerializationTime) - val diskBytesSpilledSizes = validTasks.map(_.taskMetrics.get.diskBytesSpilled.toDouble) - val diskBytesSpilledQuantiles = +: - getFormattedSizeQuantiles(diskBytesSpilledSizes) + val gettingResultQuantiles = titleCell("Getting Result Time", ToolTips.GETTING_RESULT_TIME) ++ + timeQuantiles(metrics.gettingResultTime) - val listings: Seq[Seq[Node]] = Seq( - {serviceQuantiles}, - {schedulerDelayQuantiles}, - - {deserializationQuantiles} - - {gcQuantiles}, - - {serializationQuantiles} - , - {gettingResultQuantiles}, - - {peakExecutionMemoryQuantiles} - , - if (hasInput(stageData)) {inputQuantiles} else Nil, - if (hasOutput(stageData)) {outputQuantiles} else Nil, - if (hasShuffleRead(stageData)) { - - {shuffleReadBlockedQuantiles} - - {shuffleReadTotalQuantiles} - - {shuffleReadRemoteQuantiles} - - } else { - Nil - }, - if (hasShuffleWrite(stageData)) {shuffleWriteQuantiles} else Nil, - if (hasBytesSpilled(stageData)) {memoryBytesSpilledQuantiles} else Nil, - if (hasBytesSpilled(stageData)) {diskBytesSpilledQuantiles} else Nil) - - val quantileHeaders = Seq("Metric", "Min", "25th percentile", - "Median", "75th percentile", "Max") - // The summary table does not use CSS to stripe rows, which doesn't work with hidden - // rows (instead, JavaScript in table.js is used to stripe the non-hidden rows). - Some(UIUtils.listingTable( - quantileHeaders, - identity[Seq[Node]], - listings, - fixedWidth = true, - id = Some("task-summary-table"), - stripeRowsWithCss = false)) + val peakExecutionMemoryQuantiles = titleCell("Peak Execution Memory", + ToolTips.PEAK_EXECUTION_MEMORY) ++ sizeQuantiles(metrics.peakExecutionMemory) + + // The scheduler delay includes the network delay to send the task to the worker + // machine and to send back the result (but not the time to fetch the task result, + // if it needed to be fetched from the block manager on the worker). 
+ val schedulerDelayQuantiles = titleCell("Scheduler Delay", ToolTips.SCHEDULER_DELAY) ++ + timeQuantiles(metrics.schedulerDelay) + + def inputQuantiles: Seq[Node] = { + simpleTitleCell("Input Size / Records") ++ + sizeQuantilesWithRecords(metrics.inputMetrics.bytesRead, metrics.inputMetrics.recordsRead) + } + + def outputQuantiles: Seq[Node] = { + simpleTitleCell("Output Size / Records") ++ + sizeQuantilesWithRecords(metrics.outputMetrics.bytesWritten, + metrics.outputMetrics.recordsWritten) } + def shuffleReadBlockedQuantiles: Seq[Node] = { + titleCell("Shuffle Read Blocked Time", ToolTips.SHUFFLE_READ_BLOCKED_TIME) ++ + timeQuantiles(metrics.shuffleReadMetrics.fetchWaitTime) + } + + def shuffleReadTotalQuantiles: Seq[Node] = { + titleCell("Shuffle Read Size / Records", ToolTips.SHUFFLE_READ) ++ + sizeQuantilesWithRecords(metrics.shuffleReadMetrics.readBytes, + metrics.shuffleReadMetrics.readRecords) + } + + def shuffleReadRemoteQuantiles: Seq[Node] = { + titleCell("Shuffle Remote Reads", ToolTips.SHUFFLE_READ_REMOTE_SIZE) ++ + sizeQuantiles(metrics.shuffleReadMetrics.remoteBytesRead) + } + + def shuffleWriteQuantiles: Seq[Node] = { + simpleTitleCell("Shuffle Write Size / Records") ++ + sizeQuantilesWithRecords(metrics.shuffleWriteMetrics.writeBytes, + metrics.shuffleWriteMetrics.writeRecords) + } + + def memoryBytesSpilledQuantiles: Seq[Node] = { + simpleTitleCell("Shuffle spill (memory)") ++ sizeQuantiles(metrics.memoryBytesSpilled) + } + + def diskBytesSpilledQuantiles: Seq[Node] = { + simpleTitleCell("Shuffle spill (disk)") ++ sizeQuantiles(metrics.diskBytesSpilled) + } + + val listings: Seq[Seq[Node]] = Seq( + {serviceQuantiles}, + {schedulerDelayQuantiles}, + + {deserializationQuantiles} + + {gcQuantiles}, + + {serializationQuantiles} + , + {gettingResultQuantiles}, + + {peakExecutionMemoryQuantiles} + , + if (hasInput(stageData)) {inputQuantiles} else Nil, + if (hasOutput(stageData)) {outputQuantiles} else Nil, + if (hasShuffleRead(stageData)) { + + {shuffleReadBlockedQuantiles} + + {shuffleReadTotalQuantiles} + + {shuffleReadRemoteQuantiles} + + } else { + Nil + }, + if (hasShuffleWrite(stageData)) {shuffleWriteQuantiles} else Nil, + if (hasBytesSpilled(stageData)) {memoryBytesSpilledQuantiles} else Nil, + if (hasBytesSpilled(stageData)) {diskBytesSpilledQuantiles} else Nil) + + val quantileHeaders = Seq("Metric", "Min", "25th percentile", "Median", "75th percentile", + "Max") + // The summary table does not use CSS to stripe rows, which doesn't work with hidden + // rows (instead, JavaScript in table.js is used to stripe the non-hidden rows). + UIUtils.listingTable( + quantileHeaders, + identity[Seq[Node]], + listings, + fixedWidth = true, + id = Some("task-summary-table"), + stripeRowsWithCss = false) + } + val executorTable = new ExecutorTable(stageData, parent.store) val maybeAccumulableTable: Seq[Node] = - if (hasAccumulators) {
<h4>Accumulators</h4> ++ accumulableTable } else Seq()
+      if (hasAccumulators(stageData)) { <h4>Accumulators</h4>
    ++ accumulableTable } else Seq() val aggMetrics = taskIdsInPage.contains(t.taskId) }, + Option(taskTable).map(_.dataSource.tasks).getOrElse(Nil), currentTime) ++

    Summary Metrics for {numCompleted} Completed Tasks

    ++
    {summaryTable.getOrElse("No tasks have reported metrics yet.")}
    ++ @@ -593,10 +523,9 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val serializationTimeProportion = toProportion(serializationTime) val deserializationTime = metricsOpt.map(_.executorDeserializeTime).getOrElse(0L) val deserializationTimeProportion = toProportion(deserializationTime) - val gettingResultTime = getGettingResultTime(taskInfo, currentTime) + val gettingResultTime = AppStatusUtils.gettingResultTime(taskInfo) val gettingResultTimeProportion = toProportion(gettingResultTime) - val schedulerDelay = - metricsOpt.map(getSchedulerDelay(taskInfo, _, currentTime)).getOrElse(0L) + val schedulerDelay = AppStatusUtils.schedulerDelay(taskInfo) val schedulerDelayProportion = toProportion(schedulerDelay) val executorOverhead = serializationTime + deserializationTime @@ -708,7 +637,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We { if (MAX_TIMELINE_TASKS < tasks.size) { - This stage has more than the maximum number of tasks that can be shown in the + This page has more than the maximum number of tasks that can be shown in the visualization! Only the most recent {MAX_TIMELINE_TASKS} tasks (of {tasks.size} total) are shown. @@ -733,402 +662,49 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We } -private[ui] object StagePage { - private[ui] def getGettingResultTime(info: TaskData, currentTime: Long): Long = { - info.resultFetchStart match { - case Some(start) => - info.duration match { - case Some(duration) => - info.launchTime.getTime() + duration - start.getTime() - - case _ => - currentTime - start.getTime() - } - - case _ => - 0L - } - } - - private[ui] def getSchedulerDelay( - info: TaskData, - metrics: TaskMetrics, - currentTime: Long): Long = { - info.duration match { - case Some(duration) => - val executorOverhead = metrics.executorDeserializeTime + metrics.resultSerializationTime - math.max( - 0, - duration - metrics.executorRunTime - executorOverhead - - getGettingResultTime(info, currentTime)) - - case _ => - // The task is still running and the metrics like executorRunTime are not available. - 0L - } - } - -} - -private[ui] case class TaskTableRowInputData(inputSortable: Long, inputReadable: String) - -private[ui] case class TaskTableRowOutputData(outputSortable: Long, outputReadable: String) - -private[ui] case class TaskTableRowShuffleReadData( - shuffleReadBlockedTimeSortable: Long, - shuffleReadBlockedTimeReadable: String, - shuffleReadSortable: Long, - shuffleReadReadable: String, - shuffleReadRemoteSortable: Long, - shuffleReadRemoteReadable: String) - -private[ui] case class TaskTableRowShuffleWriteData( - writeTimeSortable: Long, - writeTimeReadable: String, - shuffleWriteSortable: Long, - shuffleWriteReadable: String) - -private[ui] case class TaskTableRowBytesSpilledData( - memoryBytesSpilledSortable: Long, - memoryBytesSpilledReadable: String, - diskBytesSpilledSortable: Long, - diskBytesSpilledReadable: String) - -/** - * Contains all data that needs for sorting and generating HTML. Using this one rather than - * TaskData to avoid creating duplicate contents during sorting the data. 
- */ -private[ui] class TaskTableRowData( - val index: Int, - val taskId: Long, - val attempt: Int, - val speculative: Boolean, - val status: String, - val taskLocality: String, - val executorId: String, - val host: String, - val launchTime: Long, - val duration: Long, - val formatDuration: String, - val schedulerDelay: Long, - val taskDeserializationTime: Long, - val gcTime: Long, - val serializationTime: Long, - val gettingResultTime: Long, - val peakExecutionMemoryUsed: Long, - val accumulators: Option[String], // HTML - val input: Option[TaskTableRowInputData], - val output: Option[TaskTableRowOutputData], - val shuffleRead: Option[TaskTableRowShuffleReadData], - val shuffleWrite: Option[TaskTableRowShuffleWriteData], - val bytesSpilled: Option[TaskTableRowBytesSpilledData], - val error: String, - val logs: Map[String, String]) - private[ui] class TaskDataSource( - tasks: Seq[TaskData], - hasAccumulators: Boolean, - hasInput: Boolean, - hasOutput: Boolean, - hasShuffleRead: Boolean, - hasShuffleWrite: Boolean, - hasBytesSpilled: Boolean, + stage: StageData, currentTime: Long, pageSize: Int, sortColumn: String, desc: Boolean, - store: AppStatusStore) extends PagedDataSource[TaskTableRowData](pageSize) { - import StagePage._ + store: AppStatusStore) extends PagedDataSource[TaskData](pageSize) { + import ApiHelper._ // Keep an internal cache of executor log maps so that long task lists render faster. private val executorIdToLogs = new HashMap[String, Map[String, String]]() - // Convert TaskData to TaskTableRowData which contains the final contents to show in the table - // so that we can avoid creating duplicate contents during sorting the data - private val data = tasks.map(taskRow).sorted(ordering(sortColumn, desc)) - - private var _slicedTaskIds: Set[Long] = _ + private var _tasksToShow: Seq[TaskData] = null - override def dataSize: Int = data.size + override def dataSize: Int = stage.numCompleteTasks + stage.numFailedTasks + stage.numKilledTasks - override def sliceData(from: Int, to: Int): Seq[TaskTableRowData] = { - val r = data.slice(from, to) - _slicedTaskIds = r.map(_.taskId).toSet - r - } - - def slicedTaskIds: Set[Long] = _slicedTaskIds - - private def taskRow(info: TaskData): TaskTableRowData = { - val metrics = info.taskMetrics - val duration = info.duration.getOrElse(1L) - val formatDuration = info.duration.map(d => UIUtils.formatDuration(d)).getOrElse("") - val schedulerDelay = metrics.map(getSchedulerDelay(info, _, currentTime)).getOrElse(0L) - val gcTime = metrics.map(_.jvmGcTime).getOrElse(0L) - val taskDeserializationTime = metrics.map(_.executorDeserializeTime).getOrElse(0L) - val serializationTime = metrics.map(_.resultSerializationTime).getOrElse(0L) - val gettingResultTime = getGettingResultTime(info, currentTime) - - val externalAccumulableReadable = info.accumulatorUpdates.map { acc => - StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update}") + override def sliceData(from: Int, to: Int): Seq[TaskData] = { + if (_tasksToShow == null) { + _tasksToShow = store.taskList(stage.stageId, stage.attemptId, from, to - from, + indexName(sortColumn), !desc) } - val peakExecutionMemoryUsed = metrics.map(_.peakExecutionMemory).getOrElse(0L) - - val maybeInput = metrics.map(_.inputMetrics) - val inputSortable = maybeInput.map(_.bytesRead).getOrElse(0L) - val inputReadable = maybeInput - .map(m => s"${Utils.bytesToString(m.bytesRead)}") - .getOrElse("") - val inputRecords = maybeInput.map(_.recordsRead.toString).getOrElse("") - - val maybeOutput = 
metrics.map(_.outputMetrics) - val outputSortable = maybeOutput.map(_.bytesWritten).getOrElse(0L) - val outputReadable = maybeOutput - .map(m => s"${Utils.bytesToString(m.bytesWritten)}") - .getOrElse("") - val outputRecords = maybeOutput.map(_.recordsWritten.toString).getOrElse("") - - val maybeShuffleRead = metrics.map(_.shuffleReadMetrics) - val shuffleReadBlockedTimeSortable = maybeShuffleRead.map(_.fetchWaitTime).getOrElse(0L) - val shuffleReadBlockedTimeReadable = - maybeShuffleRead.map(ms => UIUtils.formatDuration(ms.fetchWaitTime)).getOrElse("") - - val totalShuffleBytes = maybeShuffleRead.map(ApiHelper.totalBytesRead) - val shuffleReadSortable = totalShuffleBytes.getOrElse(0L) - val shuffleReadReadable = totalShuffleBytes.map(Utils.bytesToString).getOrElse("") - val shuffleReadRecords = maybeShuffleRead.map(_.recordsRead.toString).getOrElse("") - - val remoteShuffleBytes = maybeShuffleRead.map(_.remoteBytesRead) - val shuffleReadRemoteSortable = remoteShuffleBytes.getOrElse(0L) - val shuffleReadRemoteReadable = remoteShuffleBytes.map(Utils.bytesToString).getOrElse("") - - val maybeShuffleWrite = metrics.map(_.shuffleWriteMetrics) - val shuffleWriteSortable = maybeShuffleWrite.map(_.bytesWritten).getOrElse(0L) - val shuffleWriteReadable = maybeShuffleWrite - .map(m => s"${Utils.bytesToString(m.bytesWritten)}").getOrElse("") - val shuffleWriteRecords = maybeShuffleWrite - .map(_.recordsWritten.toString).getOrElse("") - - val maybeWriteTime = metrics.map(_.shuffleWriteMetrics.writeTime) - val writeTimeSortable = maybeWriteTime.getOrElse(0L) - val writeTimeReadable = maybeWriteTime.map(t => t / (1000 * 1000)).map { ms => - if (ms == 0) "" else UIUtils.formatDuration(ms) - }.getOrElse("") - - val maybeMemoryBytesSpilled = metrics.map(_.memoryBytesSpilled) - val memoryBytesSpilledSortable = maybeMemoryBytesSpilled.getOrElse(0L) - val memoryBytesSpilledReadable = - maybeMemoryBytesSpilled.map(Utils.bytesToString).getOrElse("") - - val maybeDiskBytesSpilled = metrics.map(_.diskBytesSpilled) - val diskBytesSpilledSortable = maybeDiskBytesSpilled.getOrElse(0L) - val diskBytesSpilledReadable = maybeDiskBytesSpilled.map(Utils.bytesToString).getOrElse("") - - val input = - if (hasInput) { - Some(TaskTableRowInputData(inputSortable, s"$inputReadable / $inputRecords")) - } else { - None - } - - val output = - if (hasOutput) { - Some(TaskTableRowOutputData(outputSortable, s"$outputReadable / $outputRecords")) - } else { - None - } - - val shuffleRead = - if (hasShuffleRead) { - Some(TaskTableRowShuffleReadData( - shuffleReadBlockedTimeSortable, - shuffleReadBlockedTimeReadable, - shuffleReadSortable, - s"$shuffleReadReadable / $shuffleReadRecords", - shuffleReadRemoteSortable, - shuffleReadRemoteReadable - )) - } else { - None - } - - val shuffleWrite = - if (hasShuffleWrite) { - Some(TaskTableRowShuffleWriteData( - writeTimeSortable, - writeTimeReadable, - shuffleWriteSortable, - s"$shuffleWriteReadable / $shuffleWriteRecords" - )) - } else { - None - } - - val bytesSpilled = - if (hasBytesSpilled) { - Some(TaskTableRowBytesSpilledData( - memoryBytesSpilledSortable, - memoryBytesSpilledReadable, - diskBytesSpilledSortable, - diskBytesSpilledReadable - )) - } else { - None - } - - new TaskTableRowData( - info.index, - info.taskId, - info.attempt, - info.speculative, - info.status, - info.taskLocality.toString, - info.executorId, - info.host, - info.launchTime.getTime(), - duration, - formatDuration, - schedulerDelay, - taskDeserializationTime, - gcTime, - serializationTime, - gettingResultTime, - 
peakExecutionMemoryUsed, - if (hasAccumulators) Some(externalAccumulableReadable.mkString("
    ")) else None, - input, - output, - shuffleRead, - shuffleWrite, - bytesSpilled, - info.errorMessage.getOrElse(""), - executorLogs(info.executorId)) + _tasksToShow } - private def executorLogs(id: String): Map[String, String] = { + def tasks: Seq[TaskData] = _tasksToShow + + def executorLogs(id: String): Map[String, String] = { executorIdToLogs.getOrElseUpdate(id, store.asOption(store.executorSummary(id)).map(_.executorLogs).getOrElse(Map.empty)) } - /** - * Return Ordering according to sortColumn and desc - */ - private def ordering(sortColumn: String, desc: Boolean): Ordering[TaskTableRowData] = { - val ordering: Ordering[TaskTableRowData] = sortColumn match { - case "Index" => Ordering.by(_.index) - case "ID" => Ordering.by(_.taskId) - case "Attempt" => Ordering.by(_.attempt) - case "Status" => Ordering.by(_.status) - case "Locality Level" => Ordering.by(_.taskLocality) - case "Executor ID" => Ordering.by(_.executorId) - case "Host" => Ordering.by(_.host) - case "Launch Time" => Ordering.by(_.launchTime) - case "Duration" => Ordering.by(_.duration) - case "Scheduler Delay" => Ordering.by(_.schedulerDelay) - case "Task Deserialization Time" => Ordering.by(_.taskDeserializationTime) - case "GC Time" => Ordering.by(_.gcTime) - case "Result Serialization Time" => Ordering.by(_.serializationTime) - case "Getting Result Time" => Ordering.by(_.gettingResultTime) - case "Peak Execution Memory" => Ordering.by(_.peakExecutionMemoryUsed) - case "Accumulators" => - if (hasAccumulators) { - Ordering.by(_.accumulators.get) - } else { - throw new IllegalArgumentException( - "Cannot sort by Accumulators because of no accumulators") - } - case "Input Size / Records" => - if (hasInput) { - Ordering.by(_.input.get.inputSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Input Size / Records because of no inputs") - } - case "Output Size / Records" => - if (hasOutput) { - Ordering.by(_.output.get.outputSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Output Size / Records because of no outputs") - } - // ShuffleRead - case "Shuffle Read Blocked Time" => - if (hasShuffleRead) { - Ordering.by(_.shuffleRead.get.shuffleReadBlockedTimeSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Read Blocked Time because of no shuffle reads") - } - case "Shuffle Read Size / Records" => - if (hasShuffleRead) { - Ordering.by(_.shuffleRead.get.shuffleReadSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Read Size / Records because of no shuffle reads") - } - case "Shuffle Remote Reads" => - if (hasShuffleRead) { - Ordering.by(_.shuffleRead.get.shuffleReadRemoteSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Remote Reads because of no shuffle reads") - } - // ShuffleWrite - case "Write Time" => - if (hasShuffleWrite) { - Ordering.by(_.shuffleWrite.get.writeTimeSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Write Time because of no shuffle writes") - } - case "Shuffle Write Size / Records" => - if (hasShuffleWrite) { - Ordering.by(_.shuffleWrite.get.shuffleWriteSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Write Size / Records because of no shuffle writes") - } - // BytesSpilled - case "Shuffle Spill (Memory)" => - if (hasBytesSpilled) { - Ordering.by(_.bytesSpilled.get.memoryBytesSpilledSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Spill (Memory) because of no 
spills") - } - case "Shuffle Spill (Disk)" => - if (hasBytesSpilled) { - Ordering.by(_.bytesSpilled.get.diskBytesSpilledSortable) - } else { - throw new IllegalArgumentException( - "Cannot sort by Shuffle Spill (Disk) because of no spills") - } - case "Errors" => Ordering.by(_.error) - case unknownColumn => throw new IllegalArgumentException(s"Unknown column: $unknownColumn") - } - if (desc) { - ordering.reverse - } else { - ordering - } - } - } private[ui] class TaskPagedTable( - conf: SparkConf, + stage: StageData, basePath: String, - data: Seq[TaskData], - hasAccumulators: Boolean, - hasInput: Boolean, - hasOutput: Boolean, - hasShuffleRead: Boolean, - hasShuffleWrite: Boolean, - hasBytesSpilled: Boolean, currentTime: Long, pageSize: Int, sortColumn: String, desc: Boolean, - store: AppStatusStore) extends PagedTable[TaskTableRowData] { + store: AppStatusStore) extends PagedTable[TaskData] { + + import ApiHelper._ override def tableId: String = "task-table" @@ -1142,13 +718,7 @@ private[ui] class TaskPagedTable( override def pageNumberFormField: String = "task.page" override val dataSource: TaskDataSource = new TaskDataSource( - data, - hasAccumulators, - hasInput, - hasOutput, - hasShuffleRead, - hasShuffleWrite, - hasBytesSpilled, + stage, currentTime, pageSize, sortColumn, @@ -1180,22 +750,22 @@ private[ui] class TaskPagedTable( ("Result Serialization Time", TaskDetailsClassNames.RESULT_SERIALIZATION_TIME), ("Getting Result Time", TaskDetailsClassNames.GETTING_RESULT_TIME), ("Peak Execution Memory", TaskDetailsClassNames.PEAK_EXECUTION_MEMORY)) ++ - {if (hasAccumulators) Seq(("Accumulators", "")) else Nil} ++ - {if (hasInput) Seq(("Input Size / Records", "")) else Nil} ++ - {if (hasOutput) Seq(("Output Size / Records", "")) else Nil} ++ - {if (hasShuffleRead) { + {if (hasAccumulators(stage)) Seq(("Accumulators", "")) else Nil} ++ + {if (hasInput(stage)) Seq(("Input Size / Records", "")) else Nil} ++ + {if (hasOutput(stage)) Seq(("Output Size / Records", "")) else Nil} ++ + {if (hasShuffleRead(stage)) { Seq(("Shuffle Read Blocked Time", TaskDetailsClassNames.SHUFFLE_READ_BLOCKED_TIME), ("Shuffle Read Size / Records", ""), ("Shuffle Remote Reads", TaskDetailsClassNames.SHUFFLE_READ_REMOTE_SIZE)) } else { Nil }} ++ - {if (hasShuffleWrite) { + {if (hasShuffleWrite(stage)) { Seq(("Write Time", ""), ("Shuffle Write Size / Records", "")) } else { Nil }} ++ - {if (hasBytesSpilled) { + {if (hasBytesSpilled(stage)) { Seq(("Shuffle Spill (Memory)", ""), ("Shuffle Spill (Disk)", "")) } else { Nil @@ -1237,7 +807,17 @@ private[ui] class TaskPagedTable(
    {headerRow} } - def row(task: TaskTableRowData): Seq[Node] = { + def row(task: TaskData): Seq[Node] = { + def formatDuration(value: Option[Long], hideZero: Boolean = false): String = { + value.map { v => + if (v > 0 || !hideZero) UIUtils.formatDuration(v) else "" + }.getOrElse("") + } + + def formatBytes(value: Option[Long]): String = { + Utils.bytesToString(value.getOrElse(0L)) + } + @@ -1249,62 +829,98 @@ private[ui] class TaskPagedTable(
    {task.host}
    { - task.logs.map { + dataSource.executorLogs(task.executorId).map { case (logName, logUrl) => } }
    - - + + - {if (task.accumulators.nonEmpty) { - + {if (hasAccumulators(stage)) { + accumulatorsInfo(task) }} - {if (task.input.nonEmpty) { - + {if (hasInput(stage)) { + metricInfo(task) { m => + val bytesRead = Utils.bytesToString(m.inputMetrics.bytesRead) + val records = m.inputMetrics.recordsRead + + } }} - {if (task.output.nonEmpty) { - + {if (hasOutput(stage)) { + metricInfo(task) { m => + val bytesWritten = Utils.bytesToString(m.outputMetrics.bytesWritten) + val records = m.outputMetrics.recordsWritten + + } }} - {if (task.shuffleRead.nonEmpty) { + {if (hasShuffleRead(stage)) { - + }} - {if (task.shuffleWrite.nonEmpty) { - - + {if (hasShuffleWrite(stage)) { + + }} - {if (task.bytesSpilled.nonEmpty) { - - + {if (hasBytesSpilled(stage)) { + + }} - {errorMessageCell(task.error)} + {errorMessageCell(task.errorMessage.getOrElse(""))} } + private def accumulatorsInfo(task: TaskData): Seq[Node] = { + task.accumulatorUpdates.map { acc => + Unparsed(StringEscapeUtils.escapeHtml4(s"${acc.name}: ${acc.update}")) + } + } + + private def metricInfo(task: TaskData)(fn: TaskMetrics => Seq[Node]): Seq[Node] = { + task.taskMetrics.map(fn).getOrElse(Nil) + } + private def errorMessageCell(error: String): Seq[Node] = { val isMultiline = error.indexOf('\n') >= 0 // Display the first line by default @@ -1333,6 +949,36 @@ private[ui] class TaskPagedTable( private object ApiHelper { + + private val COLUMN_TO_INDEX = Map( + "ID" -> null.asInstanceOf[String], + "Index" -> TaskIndexNames.TASK_INDEX, + "Attempt" -> TaskIndexNames.ATTEMPT, + "Status" -> TaskIndexNames.STATUS, + "Locality Level" -> TaskIndexNames.LOCALITY, + "Executor ID / Host" -> TaskIndexNames.EXECUTOR, + "Launch Time" -> TaskIndexNames.LAUNCH_TIME, + "Duration" -> TaskIndexNames.DURATION, + "Scheduler Delay" -> TaskIndexNames.SCHEDULER_DELAY, + "Task Deserialization Time" -> TaskIndexNames.DESER_TIME, + "GC Time" -> TaskIndexNames.GC_TIME, + "Result Serialization Time" -> TaskIndexNames.SER_TIME, + "Getting Result Time" -> TaskIndexNames.GETTING_RESULT_TIME, + "Peak Execution Memory" -> TaskIndexNames.PEAK_MEM, + "Accumulators" -> TaskIndexNames.ACCUMULATORS, + "Input Size / Records" -> TaskIndexNames.INPUT_SIZE, + "Output Size / Records" -> TaskIndexNames.OUTPUT_SIZE, + "Shuffle Read Blocked Time" -> TaskIndexNames.SHUFFLE_READ_TIME, + "Shuffle Read Size / Records" -> TaskIndexNames.SHUFFLE_TOTAL_READS, + "Shuffle Remote Reads" -> TaskIndexNames.SHUFFLE_REMOTE_READS, + "Write Time" -> TaskIndexNames.SHUFFLE_WRITE_TIME, + "Shuffle Write Size / Records" -> TaskIndexNames.SHUFFLE_WRITE_SIZE, + "Shuffle Spill (Memory)" -> TaskIndexNames.MEM_SPILL, + "Shuffle Spill (Disk)" -> TaskIndexNames.DISK_SPILL, + "Errors" -> TaskIndexNames.ERROR) + + def hasAccumulators(stageData: StageData): Boolean = stageData.accumulatorUpdates.size > 0 + def hasInput(stageData: StageData): Boolean = stageData.inputBytes > 0 def hasOutput(stageData: StageData): Boolean = stageData.outputBytes > 0 @@ -1349,4 +995,11 @@ private object ApiHelper { metrics.localBytesRead + metrics.remoteBytesRead } + def indexName(sortColumn: String): Option[String] = { + COLUMN_TO_INDEX.get(sortColumn) match { + case Some(v) => Option(v) + case _ => throw new IllegalArgumentException(s"Invalid sort column: $sortColumn") + } + } + } diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json index 
f8e27703c0def..5c42ac1d87f4c 100644 --- a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w__custom_quantiles_expectation.json @@ -7,6 +7,9 @@ "resultSize" : [ 2010.0, 2065.0, 2065.0 ], "jvmGcTime" : [ 0.0, 0.0, 7.0 ], "resultSerializationTime" : [ 0.0, 0.0, 2.0 ], + "gettingResultTime" : [ 0.0, 0.0, 0.0 ], + "schedulerDelay" : [ 2.0, 6.0, 53.0 ], + "peakExecutionMemory" : [ 0.0, 0.0, 0.0 ], "memoryBytesSpilled" : [ 0.0, 0.0, 0.0 ], "diskBytesSpilled" : [ 0.0, 0.0, 0.0 ], "inputMetrics" : { diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json index a28bda16a956e..e6b705989cc97 100644 --- a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_read_expectation.json @@ -7,6 +7,9 @@ "resultSize" : [ 1034.0, 1034.0, 1034.0, 1034.0, 1034.0 ], "jvmGcTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "resultSerializationTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "gettingResultTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "schedulerDelay" : [ 4.0, 4.0, 6.0, 7.0, 9.0 ], + "peakExecutionMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "memoryBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "diskBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "inputMetrics" : { diff --git a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json index ede3eaed1d1d2..788f28cf7b365 100644 --- a/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json +++ b/core/src/test/resources/HistoryServerExpectations/stage_task_summary_w_shuffle_write_expectation.json @@ -7,6 +7,9 @@ "resultSize" : [ 2010.0, 2065.0, 2065.0, 2065.0, 2065.0 ], "jvmGcTime" : [ 0.0, 0.0, 0.0, 5.0, 7.0 ], "resultSerializationTime" : [ 0.0, 0.0, 0.0, 0.0, 1.0 ], + "gettingResultTime" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], + "schedulerDelay" : [ 2.0, 4.0, 6.0, 13.0, 40.0 ], + "peakExecutionMemory" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "memoryBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "diskBytesSpilled" : [ 0.0, 0.0, 0.0, 0.0, 0.0 ], "inputMetrics" : { diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala index b8c84e24c2c3f..ca66b6b9db890 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusListenerSuite.scala @@ -213,45 +213,42 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { s1Tasks.foreach { task => check[TaskDataWrapper](task.taskId) { wrapper => - assert(wrapper.info.taskId === task.taskId) + assert(wrapper.taskId === task.taskId) assert(wrapper.stageId === stages.head.stageId) - assert(wrapper.stageAttemptId === stages.head.attemptNumber) - assert(Arrays.equals(wrapper.stage, Array(stages.head.stageId, stages.head.attemptNumber))) - - val runtime = Array[AnyRef](stages.head.stageId: JInteger, - stages.head.attemptNumber: JInteger, - -1L: JLong) - assert(Arrays.equals(wrapper.runtime, runtime)) - - assert(wrapper.info.index === task.index) - assert(wrapper.info.attempt 
=== task.attemptNumber) - assert(wrapper.info.launchTime === new Date(task.launchTime)) - assert(wrapper.info.executorId === task.executorId) - assert(wrapper.info.host === task.host) - assert(wrapper.info.status === task.status) - assert(wrapper.info.taskLocality === task.taskLocality.toString()) - assert(wrapper.info.speculative === task.speculative) + assert(wrapper.stageAttemptId === stages.head.attemptId) + assert(wrapper.index === task.index) + assert(wrapper.attempt === task.attemptNumber) + assert(wrapper.launchTime === task.launchTime) + assert(wrapper.executorId === task.executorId) + assert(wrapper.host === task.host) + assert(wrapper.status === task.status) + assert(wrapper.taskLocality === task.taskLocality.toString()) + assert(wrapper.speculative === task.speculative) } } - // Send executor metrics update. Only update one metric to avoid a lot of boilerplate code. - s1Tasks.foreach { task => - val accum = new AccumulableInfo(1L, Some(InternalAccumulator.MEMORY_BYTES_SPILLED), - Some(1L), None, true, false, None) - listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate( - task.executorId, - Seq((task.taskId, stages.head.stageId, stages.head.attemptNumber, Seq(accum))))) - } + // Send two executor metrics update. Only update one metric to avoid a lot of boilerplate code. + // The tasks are distributed among the two executors, so the executor-level metrics should + // hold half of the cummulative value of the metric being updated. + Seq(1L, 2L).foreach { value => + s1Tasks.foreach { task => + val accum = new AccumulableInfo(1L, Some(InternalAccumulator.MEMORY_BYTES_SPILLED), + Some(value), None, true, false, None) + listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate( + task.executorId, + Seq((task.taskId, stages.head.stageId, stages.head.attemptNumber, Seq(accum))))) + } - check[StageDataWrapper](key(stages.head)) { stage => - assert(stage.info.memoryBytesSpilled === s1Tasks.size) - } + check[StageDataWrapper](key(stages.head)) { stage => + assert(stage.info.memoryBytesSpilled === s1Tasks.size * value) + } - val execs = store.view(classOf[ExecutorStageSummaryWrapper]).index("stage") - .first(key(stages.head)).last(key(stages.head)).asScala.toSeq - assert(execs.size > 0) - execs.foreach { exec => - assert(exec.info.memoryBytesSpilled === s1Tasks.size / 2) + val execs = store.view(classOf[ExecutorStageSummaryWrapper]).index("stage") + .first(key(stages.head)).last(key(stages.head)).asScala.toSeq + assert(execs.size > 0) + execs.foreach { exec => + assert(exec.info.memoryBytesSpilled === s1Tasks.size * value / 2) + } } // Fail one of the tasks, re-start it. @@ -278,13 +275,13 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } check[TaskDataWrapper](s1Tasks.head.taskId) { task => - assert(task.info.status === s1Tasks.head.status) - assert(task.info.errorMessage == Some(TaskResultLost.toErrorString)) + assert(task.status === s1Tasks.head.status) + assert(task.errorMessage == Some(TaskResultLost.toErrorString)) } check[TaskDataWrapper](reattempt.taskId) { task => - assert(task.info.index === s1Tasks.head.index) - assert(task.info.attempt === reattempt.attemptNumber) + assert(task.index === s1Tasks.head.index) + assert(task.attempt === reattempt.attemptNumber) } // Kill one task, restart it. 
@@ -306,8 +303,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } check[TaskDataWrapper](killed.taskId) { task => - assert(task.info.index === killed.index) - assert(task.info.errorMessage === Some("killed")) + assert(task.index === killed.index) + assert(task.errorMessage === Some("killed")) } // Start a new attempt and finish it with TaskCommitDenied, make sure it's handled like a kill. @@ -334,8 +331,8 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { } check[TaskDataWrapper](denied.taskId) { task => - assert(task.info.index === killed.index) - assert(task.info.errorMessage === Some(denyReason.toErrorString)) + assert(task.index === killed.index) + assert(task.errorMessage === Some(denyReason.toErrorString)) } // Start a new attempt. @@ -373,10 +370,10 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { pending.foreach { task => check[TaskDataWrapper](task.taskId) { wrapper => - assert(wrapper.info.errorMessage === None) - assert(wrapper.info.taskMetrics.get.executorCpuTime === 2L) - assert(wrapper.info.taskMetrics.get.executorRunTime === 4L) - assert(wrapper.info.duration === Some(task.duration)) + assert(wrapper.errorMessage === None) + assert(wrapper.executorCpuTime === 2L) + assert(wrapper.executorRunTime === 4L) + assert(wrapper.duration === task.duration) } } @@ -894,6 +891,23 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { assert(store.count(classOf[StageDataWrapper]) === 3) assert(store.count(classOf[RDDOperationGraphWrapper]) === 3) + val dropped = stages.drop(1).head + + // Cache some quantiles by calling AppStatusStore.taskSummary(). For quantiles to be + // calculcated, we need at least one finished task. + time += 1 + val task = createTasks(1, Array("1")).head + listener.onTaskStart(SparkListenerTaskStart(dropped.stageId, dropped.attemptId, task)) + + time += 1 + task.markFinished(TaskState.FINISHED, time) + listener.onTaskEnd(SparkListenerTaskEnd(dropped.stageId, dropped.attemptId, + "taskType", Success, task, null)) + + new AppStatusStore(store) + .taskSummary(dropped.stageId, dropped.attemptId, Array(0.25d, 0.50d, 0.75d)) + assert(store.count(classOf[CachedQuantile], "stage", key(dropped)) === 3) + stages.drop(1).foreach { s => time += 1 s.completionTime = Some(time) @@ -905,6 +919,7 @@ class AppStatusListenerSuite extends SparkFunSuite with BeforeAndAfter { intercept[NoSuchElementException] { store.read(classOf[StageDataWrapper], Array(2, 0)) } + assert(store.count(classOf[CachedQuantile], "stage", key(dropped)) === 0) val attempt2 = new StageInfo(3, 1, "stage3", 4, Nil, Nil, "details3") time += 1 diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala new file mode 100644 index 0000000000000..92f90f3d96ddf --- /dev/null +++ b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.status + +import org.apache.spark.SparkFunSuite +import org.apache.spark.status.api.v1.TaskMetricDistributions +import org.apache.spark.util.Distribution +import org.apache.spark.util.kvstore._ + +class AppStatusStoreSuite extends SparkFunSuite { + + private val uiQuantiles = Array(0.0, 0.25, 0.5, 0.75, 1.0) + private val stageId = 1 + private val attemptId = 1 + + test("quantile calculation: 1 task") { + compareQuantiles(1, uiQuantiles) + } + + test("quantile calculation: few tasks") { + compareQuantiles(4, uiQuantiles) + } + + test("quantile calculation: more tasks") { + compareQuantiles(100, uiQuantiles) + } + + test("quantile calculation: lots of tasks") { + compareQuantiles(4096, uiQuantiles) + } + + test("quantile calculation: custom quantiles") { + compareQuantiles(4096, Array(0.01, 0.33, 0.5, 0.42, 0.69, 0.99)) + } + + test("quantile cache") { + val store = new InMemoryStore() + (0 until 4096).foreach { i => store.write(newTaskData(i)) } + + val appStore = new AppStatusStore(store) + + appStore.taskSummary(stageId, attemptId, Array(0.13d)) + intercept[NoSuchElementException] { + store.read(classOf[CachedQuantile], Array(stageId, attemptId, "13")) + } + + appStore.taskSummary(stageId, attemptId, Array(0.25d)) + val d1 = store.read(classOf[CachedQuantile], Array(stageId, attemptId, "25")) + + // Add a new task to force the cached quantile to be evicted, and make sure it's updated. 
+ store.write(newTaskData(4096)) + appStore.taskSummary(stageId, attemptId, Array(0.25d, 0.50d, 0.73d)) + + val d2 = store.read(classOf[CachedQuantile], Array(stageId, attemptId, "25")) + assert(d1.taskCount != d2.taskCount) + + store.read(classOf[CachedQuantile], Array(stageId, attemptId, "50")) + intercept[NoSuchElementException] { + store.read(classOf[CachedQuantile], Array(stageId, attemptId, "73")) + } + + assert(store.count(classOf[CachedQuantile]) === 2) + } + + private def compareQuantiles(count: Int, quantiles: Array[Double]): Unit = { + val store = new InMemoryStore() + val values = (0 until count).map { i => + val task = newTaskData(i) + store.write(task) + i.toDouble + }.toArray + + val summary = new AppStatusStore(store).taskSummary(stageId, attemptId, quantiles).get + val dist = new Distribution(values, 0, values.length).getQuantiles(quantiles.sorted) + + dist.zip(summary.executorRunTime).foreach { case (expected, actual) => + assert(expected === actual) + } + } + + private def newTaskData(i: Int): TaskDataWrapper = { + new TaskDataWrapper( + i, i, i, i, i, i, i.toString, i.toString, i.toString, i.toString, false, Nil, None, + i, i, i, i, i, i, i, i, i, i, + i, i, i, i, i, i, i, i, i, i, + i, i, i, i, stageId, attemptId) + } + +} diff --git a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala index 661d0d48d2f37..0aeddf730cd35 100644 --- a/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/StagePageSuite.scala @@ -28,6 +28,7 @@ import org.apache.spark._ import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ import org.apache.spark.status.AppStatusStore +import org.apache.spark.status.config._ import org.apache.spark.ui.jobs.{StagePage, StagesTab} class StagePageSuite extends SparkFunSuite with LocalSparkContext { @@ -35,15 +36,13 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { private val peakExecutionMemory = 10 test("peak execution memory should displayed") { - val conf = new SparkConf(false) - val html = renderStagePage(conf).toString().toLowerCase(Locale.ROOT) + val html = renderStagePage().toString().toLowerCase(Locale.ROOT) val targetString = "peak execution memory" assert(html.contains(targetString)) } test("SPARK-10543: peak execution memory should be per-task rather than cumulative") { - val conf = new SparkConf(false) - val html = renderStagePage(conf).toString().toLowerCase(Locale.ROOT) + val html = renderStagePage().toString().toLowerCase(Locale.ROOT) // verify min/25/50/75/max show task value not cumulative values assert(html.contains(s"" * 5)) } @@ -52,7 +51,8 @@ class StagePageSuite extends SparkFunSuite with LocalSparkContext { * Render a stage page started with the given conf and return the HTML. * This also runs a dummy stage to populate the page with useful content. 
*/ - private def renderStagePage(conf: SparkConf): Seq[Node] = { + private def renderStagePage(): Seq[Node] = { + val conf = new SparkConf(false).set(LIVE_ENTITY_UPDATE_PERIOD, 0L) val statusStore = AppStatusStore.createLiveStore(conf) val listener = statusStore.listener.get diff --git a/scalastyle-config.xml b/scalastyle-config.xml index 7bdd3fac773a3..e2fa5754afaee 100644 --- a/scalastyle-config.xml +++ b/scalastyle-config.xml @@ -93,7 +93,7 @@ This file is divided into 3 sections: - + From 0552c36e02434c60dad82024334d291f6008b822 Mon Sep 17 00:00:00 2001 From: wuyi5 Date: Thu, 11 Jan 2018 22:17:15 +0900 Subject: [PATCH 311/356] [SPARK-22967][TESTS] Fix VersionsSuite's unit tests by changing Windows paths into URI paths ## What changes were proposed in this pull request? Two unit tests fail due to Windows-format paths: 1.test(s"$version: read avro file containing decimal") ``` org.apache.hadoop.hive.ql.metadata.HiveException: MetaException(message:java.lang.IllegalArgumentException: Can not create a Path from an empty string); ``` 2.test(s"$version: SPARK-17920: Insert into/overwrite avro table") ``` Unable to infer the schema. The schema specification is required to create the table `default`.`tab2`.; org.apache.spark.sql.AnalysisException: Unable to infer the schema. The schema specification is required to create the table `default`.`tab2`.; ``` This PR fixes these two unit tests by changing the Windows paths into URI paths. ## How was this patch tested? Existing tests. Please review http://spark.apache.org/contributing.html before opening a pull request. Author: wuyi5 Closes #20199 from Ngone51/SPARK-22967. --- .../org/apache/spark/sql/hive/client/VersionsSuite.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index ff90e9dda5f7c..e64389e56b5a1 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -811,7 +811,7 @@ class VersionsSuite extends SparkFunSuite with Logging { test(s"$version: read avro file containing decimal") { val url = Thread.currentThread().getContextClassLoader.getResource("avroDecimal") - val location = new File(url.getFile) + val location = new File(url.getFile).toURI.toString val tableName = "tab1" val avroSchema = @@ -851,6 +851,8 @@ class VersionsSuite extends SparkFunSuite with Logging { } test(s"$version: SPARK-17920: Insert into/overwrite avro table") { + // skipped because it's failed in the condition on Windows + assume(!(Utils.isWindows && version == "0.12")) withTempDir { dir => val avroSchema = """ @@ -875,10 +877,10 @@ class VersionsSuite extends SparkFunSuite with Logging { val writer = new PrintWriter(schemaFile) writer.write(avroSchema) writer.close() - val schemaPath = schemaFile.getCanonicalPath + val schemaPath = schemaFile.toURI.toString val url = Thread.currentThread().getContextClassLoader.getResource("avroDecimal") - val srcLocation = new File(url.getFile).getCanonicalPath + val srcLocation = new File(url.getFile).toURI.toString val destTableName = "tab1" val srcTableName = "tab2" From 76892bcf2c08efd7e9c5b16d377e623d82fe695e Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 11 Jan 2018 21:32:36 +0800 Subject: [PATCH 312/356] [SPARK-23000][TEST-HADOOP2.6] Fix Flaky test suite DataSourceWithHiveMetastoreCatalogSuite ## What changes were proposed in this pull
request? The Spark 2.3 branch still failed due to the flaky test suite `DataSourceWithHiveMetastoreCatalogSuite `. https://amplab.cs.berkeley.edu/jenkins/job/spark-branch-2.3-test-sbt-hadoop-2.6/ Although https://github.com/apache/spark/pull/20207 is unable to reproduce it in Spark 2.3, it sounds like the current DB of Spark's Catalog is changed based on the following stacktrace. Thus, we just need to reset it. ``` [info] DataSourceWithHiveMetastoreCatalogSuite: 02:40:39.486 ERROR org.apache.hadoop.hive.ql.parse.CalcitePlanner: org.apache.hadoop.hive.ql.parse.SemanticException: Line 1:14 Table not found 't' at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.getMetaData(SemanticAnalyzer.java:1594) at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.getMetaData(SemanticAnalyzer.java:1545) at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genResolvedParseTree(SemanticAnalyzer.java:10077) at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10128) at org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:209) at org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:227) at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:424) at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:308) at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1122) at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1170) at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1059) at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1049) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$runHive$1.apply(HiveClientImpl.scala:694) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$runHive$1.apply(HiveClientImpl.scala:683) at org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:272) at org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:210) at org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:209) at org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:255) at org.apache.spark.sql.hive.client.HiveClientImpl.runHive(HiveClientImpl.scala:683) at org.apache.spark.sql.hive.client.HiveClientImpl.runSqlHive(HiveClientImpl.scala:673) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$9$$anonfun$apply$1$$anonfun$apply$mcV$sp$3.apply$mcV$sp(HiveMetastoreCatalogSuite.scala:185) at org.apache.spark.sql.test.SQLTestUtilsBase$class.withTable(SQLTestUtils.scala:273) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite.withTable(HiveMetastoreCatalogSuite.scala:139) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$9$$anonfun$apply$1.apply$mcV$sp(HiveMetastoreCatalogSuite.scala:163) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$9$$anonfun$apply$1.apply(HiveMetastoreCatalogSuite.scala:163) at org.apache.spark.sql.hive.DataSourceWithHiveMetastoreCatalogSuite$$anonfun$9$$anonfun$apply$1.apply(HiveMetastoreCatalogSuite.scala:163) at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85) at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104) at org.scalatest.Transformer.apply(Transformer.scala:22) at org.scalatest.Transformer.apply(Transformer.scala:20) at org.scalatest.FunSuiteLike$$anon$1.apply(FunSuiteLike.scala:186) at org.apache.spark.SparkFunSuite.withFixture(SparkFunSuite.scala:68) at 
org.scalatest.FunSuiteLike$class.invokeWithFixture$1(FunSuiteLike.scala:183) at org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:196) at org.scalatest.FunSuiteLike$$anonfun$runTest$1.apply(FunSuiteLike.scala:196) at org.scalatest.SuperEngine.runTestImpl(Engine.scala:289) at org.scalatest.FunSuiteLike$class.runTest(FunSuiteLike.scala:196) at org.scalatest.FunSuite.runTest(FunSuite.scala:1560) at org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:229) at org.scalatest.FunSuiteLike$$anonfun$runTests$1.apply(FunSuiteLike.scala:229) at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:396) at org.scalatest.SuperEngine$$anonfun$traverseSubNodes$1$1.apply(Engine.scala:384) at scala.collection.immutable.List.foreach(List.scala:381) at org.scalatest.SuperEngine.traverseSubNodes$1(Engine.scala:384) at org.scalatest.SuperEngine.org$scalatest$SuperEngine$$runTestsInBranch(Engine.scala:379) at org.scalatest.SuperEngine.runTestsImpl(Engine.scala:461) at org.scalatest.FunSuiteLike$class.runTests(FunSuiteLike.scala:229) at org.scalatest.FunSuite.runTests(FunSuite.scala:1560) at org.scalatest.Suite$class.run(Suite.scala:1147) at org.scalatest.FunSuite.org$scalatest$FunSuiteLike$$super$run(FunSuite.scala:1560) at org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:233) at org.scalatest.FunSuiteLike$$anonfun$run$1.apply(FunSuiteLike.scala:233) at org.scalatest.SuperEngine.runImpl(Engine.scala:521) at org.scalatest.FunSuiteLike$class.run(FunSuiteLike.scala:233) at org.apache.spark.SparkFunSuite.org$scalatest$BeforeAndAfterAll$$super$run(SparkFunSuite.scala:31) at org.scalatest.BeforeAndAfterAll$class.liftedTree1$1(BeforeAndAfterAll.scala:213) at org.scalatest.BeforeAndAfterAll$class.run(BeforeAndAfterAll.scala:210) at org.apache.spark.SparkFunSuite.run(SparkFunSuite.scala:31) at org.scalatest.tools.Framework.org$scalatest$tools$Framework$$runSuite(Framework.scala:314) at org.scalatest.tools.Framework$ScalaTestTask.execute(Framework.scala:480) at sbt.ForkMain$Run$2.call(ForkMain.java:296) at sbt.ForkMain$Run$2.call(ForkMain.java:286) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) ``` ## How was this patch tested? N/A Author: gatorsmile Closes #20218 from gatorsmile/testFixAgain. --- .../org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index cf4ce83124d88..ba9b944e4a055 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -148,6 +148,7 @@ class DataSourceWithHiveMetastoreCatalogSuite override def beforeAll(): Unit = { super.beforeAll() + sparkSession.sessionState.catalog.reset() sparkSession.metadataHive.reset() } From b46e58b74c82dac37b7b92284ea3714919c5a886 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Thu, 11 Jan 2018 22:33:42 +0900 Subject: [PATCH 313/356] [SPARK-19732][FOLLOW-UP] Document behavior changes made in na.fill and fillna ## What changes were proposed in this pull request? 
https://github.com/apache/spark/pull/18164 introduces the behavior changes. We need to document it. ## How was this patch tested? N/A Author: gatorsmile Closes #20234 from gatorsmile/docBehaviorChange. --- docs/sql-programming-guide.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 72f79d6909ecc..258c769ff593b 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1788,12 +1788,10 @@ options. Note that, for DecimalType(38,0)*, the table above intentionally does not cover all other combinations of scales and precisions because currently we only infer decimal type like `BigInteger`/`BigInt`. For example, 1.1 is inferred as double type. - In PySpark, now we need Pandas 0.19.2 or upper if you want to use Pandas related functionalities, such as `toPandas`, `createDataFrame` from Pandas DataFrame, etc. - In PySpark, the behavior of timestamp values for Pandas related functionalities was changed to respect session timezone. If you want to use the old behavior, you need to set a configuration `spark.sql.execution.pandas.respectSessionTimeZone` to `False`. See [SPARK-22395](https://issues.apache.org/jira/browse/SPARK-22395) for details. - - - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer to broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Broadcast Hint](#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). - - - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. - - - Since Spark 2.3, when all inputs are binary, SQL `elt()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.eltOutputAsString` to `true`. + - In PySpark, `na.fill()` or `fillna` also accepts boolean and replaces nulls with booleans. In prior Spark versions, PySpark just ignores it and returns the original Dataset/DataFrame. + - Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, we prefer to broadcasting the table that is explicitly specified in a broadcast hint. For details, see the section [Broadcast Hint](#broadcast-hint-for-sql-queries) and [SPARK-22489](https://issues.apache.org/jira/browse/SPARK-22489). + - Since Spark 2.3, when all inputs are binary, `functions.concat()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.concatBinaryAsString` to `true`. + - Since Spark 2.3, when all inputs are binary, SQL `elt()` returns an output as binary. Otherwise, it returns as a string. Until Spark 2.3, it always returns as a string despite of input types. To keep the old behavior, set `spark.sql.function.eltOutputAsString` to `true`. ## Upgrading From Spark SQL 2.1 to 2.2 From 6d230dccf65300651f989392159d84bfaf08f18f Mon Sep 17 00:00:00 2001 From: FanDonglai Date: Thu, 11 Jan 2018 09:06:40 -0600 Subject: [PATCH 314/356] Update PageRank.scala ## What changes were proposed in this pull request? 
Hi, according to the code below, "if (id == src) (0.0, Double.NegativeInfinity) else (0.0, 0.0)", I think the comment is wrong. ## How was this patch tested? Please review http://spark.apache.org/contributing.html before opening a pull request. Author: FanDonglai Closes #20220 from ddna1021/master. --- .../src/main/scala/org/apache/spark/graphx/lib/PageRank.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala index fd7b7f7c1c487..ebd65e8320e5c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala @@ -303,7 +303,7 @@ object PageRank extends Logging { val src: VertexId = srcId.getOrElse(-1L) // Initialize the pagerankGraph with each edge attribute - // having weight 1/outDegree and each vertex with attribute 1.0. + // having weight 1/outDegree and each vertex with attribute 0. val pagerankGraph: Graph[(Double, Double), Double] = graph // Associate the degree with each vertex .outerJoinVertices(graph.outDegrees) { From 0b2eefb674151a0af64806728b38d9410da552ec Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 11 Jan 2018 10:37:35 -0800 Subject: [PATCH 315/356] [SPARK-22994][K8S] Use a single image for all Spark containers. This change allows a user to submit a Spark application on Kubernetes while providing only a single image, instead of one image for each type of container. The image's entry point now takes an extra argument that identifies the process that is being started. The configuration still allows the user to provide different images for each container type if they so desire. On top of that, the entry point was simplified a bit to share more code; mainly, the same env variable is used to propagate the user-defined classpath to the different containers. Aside from being modified to match the new behavior, the 'build-push-docker-images.sh' script was renamed to 'docker-image-tool.sh' to more closely match its purpose; the old name was a little awkward and would now also not be entirely correct, since there is a single image. It was also moved to 'bin' since it's not necessarily an admin tool. Docs have been updated to match the new behavior. Tested locally with minikube. Author: Marcelo Vanzin Closes #20192 from vanzin/SPARK-22994.
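To make the dispatching behavior concrete, here is a minimal sketch of the kind of single entry point described above. It is an illustration pieced together from the commands in the per-container Dockerfiles removed below, not the actual entrypoint.sh added by this patch (which also handles Java options, mounted files and other details):

```bash
#!/bin/bash
# Illustrative sketch only -- not the entrypoint.sh shipped with this change.
# A single-image entry point picks the process to run from its first argument,
# and the same SPARK_CLASSPATH variable carries the user classpath everywhere.

SPARK_CLASSPATH="${SPARK_CLASSPATH:+$SPARK_CLASSPATH:}${SPARK_HOME}/jars/*"

case "$1" in
  driver)
    exec "${JAVA_HOME}/bin/java" -cp "$SPARK_CLASSPATH" \
      -Xms"$SPARK_DRIVER_MEMORY" -Xmx"$SPARK_DRIVER_MEMORY" \
      -Dspark.driver.bindAddress="$SPARK_DRIVER_BIND_ADDRESS" \
      $SPARK_DRIVER_CLASS $SPARK_DRIVER_ARGS
    ;;
  executor)
    exec "${JAVA_HOME}/bin/java" -cp "$SPARK_CLASSPATH" \
      -Xms"$SPARK_EXECUTOR_MEMORY" -Xmx"$SPARK_EXECUTOR_MEMORY" \
      org.apache.spark.executor.CoarseGrainedExecutorBackend \
      --driver-url "$SPARK_DRIVER_URL" --executor-id "$SPARK_EXECUTOR_ID" \
      --cores "$SPARK_EXECUTOR_CORES" --app-id "$SPARK_APPLICATION_ID" \
      --hostname "$SPARK_EXECUTOR_POD_IP"
    ;;
  init)
    # Remaining argument is the init-container properties file path.
    shift
    exec "${SPARK_HOME}/bin/spark-class" \
      org.apache.spark.deploy.k8s.SparkPodInitContainer "$@"
    ;;
  *)
    echo "Unknown command: $1" >&2
    exit 1
    ;;
esac
```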
--- .../docker-image-tool.sh | 68 ++++++------- docs/running-on-kubernetes.md | 58 +++++------ .../org/apache/spark/deploy/k8s/Config.scala | 17 ++-- .../apache/spark/deploy/k8s/Constants.scala | 3 +- .../deploy/k8s/InitContainerBootstrap.scala | 1 + .../steps/BasicDriverConfigurationStep.scala | 3 +- .../cluster/k8s/ExecutorPodFactory.scala | 3 +- .../DriverConfigOrchestratorSuite.scala | 12 +-- .../BasicDriverConfigurationStepSuite.scala | 4 +- ...InitContainerConfigOrchestratorSuite.scala | 4 +- .../cluster/k8s/ExecutorPodFactorySuite.scala | 4 +- .../src/main/dockerfiles/driver/Dockerfile | 35 ------- .../src/main/dockerfiles/executor/Dockerfile | 35 ------- .../dockerfiles/init-container/Dockerfile | 24 ----- .../main/dockerfiles/spark-base/entrypoint.sh | 37 ------- .../{spark-base => spark}/Dockerfile | 10 +- .../src/main/dockerfiles/spark/entrypoint.sh | 97 +++++++++++++++++++ 17 files changed, 189 insertions(+), 226 deletions(-) rename sbin/build-push-docker-images.sh => bin/docker-image-tool.sh (63%) delete mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile delete mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile delete mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile delete mode 100755 resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh rename resource-managers/kubernetes/docker/src/main/dockerfiles/{spark-base => spark}/Dockerfile (87%) create mode 100755 resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh diff --git a/sbin/build-push-docker-images.sh b/bin/docker-image-tool.sh similarity index 63% rename from sbin/build-push-docker-images.sh rename to bin/docker-image-tool.sh index b9532597419a5..071406336d1b1 100755 --- a/sbin/build-push-docker-images.sh +++ b/bin/docker-image-tool.sh @@ -24,29 +24,11 @@ function error { exit 1 } -# Detect whether this is a git clone or a Spark distribution and adjust paths -# accordingly. if [ -z "${SPARK_HOME}" ]; then SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" fi . "${SPARK_HOME}/bin/load-spark-env.sh" -if [ -f "$SPARK_HOME/RELEASE" ]; then - IMG_PATH="kubernetes/dockerfiles" - SPARK_JARS="jars" -else - IMG_PATH="resource-managers/kubernetes/docker/src/main/dockerfiles" - SPARK_JARS="assembly/target/scala-$SPARK_SCALA_VERSION/jars" -fi - -if [ ! -d "$IMG_PATH" ]; then - error "Cannot find docker images. This script must be run from a runnable distribution of Apache Spark." -fi - -declare -A path=( [spark-driver]="$IMG_PATH/driver/Dockerfile" \ - [spark-executor]="$IMG_PATH/executor/Dockerfile" \ - [spark-init]="$IMG_PATH/init-container/Dockerfile" ) - function image_ref { local image="$1" local add_repo="${2:-1}" @@ -60,35 +42,49 @@ function image_ref { } function build { - docker build \ - --build-arg "spark_jars=$SPARK_JARS" \ - --build-arg "img_path=$IMG_PATH" \ - -t spark-base \ - -f "$IMG_PATH/spark-base/Dockerfile" . - for image in "${!path[@]}"; do - docker build -t "$(image_ref $image)" -f ${path[$image]} . - done + local BUILD_ARGS + local IMG_PATH + + if [ ! -f "$SPARK_HOME/RELEASE" ]; then + # Set image build arguments accordingly if this is a source repo and not a distribution archive. 
+ IMG_PATH=resource-managers/kubernetes/docker/src/main/dockerfiles + BUILD_ARGS=( + --build-arg + img_path=$IMG_PATH + --build-arg + spark_jars=assembly/target/scala-$SPARK_SCALA_VERSION/jars + ) + else + # Not passed as an argument to docker, but used to validate the Spark directory. + IMG_PATH="kubernetes/dockerfiles" + fi + + if [ ! -d "$IMG_PATH" ]; then + error "Cannot find docker image. This script must be run from a runnable distribution of Apache Spark." + fi + + docker build "${BUILD_ARGS[@]}" \ + -t $(image_ref spark) \ + -f "$IMG_PATH/spark/Dockerfile" . } function push { - for image in "${!path[@]}"; do - docker push "$(image_ref $image)" - done + docker push "$(image_ref spark)" } function usage { cat < -t my-tag build - ./sbin/build-push-docker-images.sh -r -t my-tag push - -Docker files are under the `kubernetes/dockerfiles/` directory and can be customized further before -building using the supplied script, or manually. + ./bin/docker-image-tool.sh -r -t my-tag build + ./bin/docker-image-tool.sh -r -t my-tag push ## Cluster Mode @@ -79,8 +76,7 @@ $ bin/spark-submit \ --name spark-pi \ --class org.apache.spark.examples.SparkPi \ --conf spark.executor.instances=5 \ - --conf spark.kubernetes.driver.container.image= \ - --conf spark.kubernetes.executor.container.image= \ + --conf spark.kubernetes.container.image= \ local:///path/to/examples.jar ``` @@ -126,13 +122,7 @@ Those dependencies can be added to the classpath by referencing them with `local ### Using Remote Dependencies When there are application dependencies hosted in remote locations like HDFS or HTTP servers, the driver and executor pods need a Kubernetes [init-container](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/) for downloading -the dependencies so the driver and executor containers can use them locally. This requires users to specify the container -image for the init-container using the configuration property `spark.kubernetes.initContainer.image`. For example, users -simply add the following option to the `spark-submit` command to specify the init-container image: - -``` ---conf spark.kubernetes.initContainer.image= -``` +the dependencies so the driver and executor containers can use them locally. The init-container handles remote dependencies specified in `spark.jars` (or the `--jars` option of `spark-submit`) and `spark.files` (or the `--files` option of `spark-submit`). It also handles remotely hosted main application resources, e.g., @@ -147,9 +137,7 @@ $ bin/spark-submit \ --jars https://path/to/dependency1.jar,https://path/to/dependency2.jar --files hdfs://host:port/path/to/file1,hdfs://host:port/path/to/file2 --conf spark.executor.instances=5 \ - --conf spark.kubernetes.driver.container.image= \ - --conf spark.kubernetes.executor.container.image= \ - --conf spark.kubernetes.initContainer.image= + --conf spark.kubernetes.container.image= \ https://path/to/examples.jar ``` @@ -322,21 +310,27 @@ specific to Spark on Kubernetes. - + + + + + + - + @@ -643,9 +637,9 @@ specific to Spark on Kubernetes. 
- + diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index e5d79d9a9d9da..471196ac0e3f6 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -29,17 +29,23 @@ private[spark] object Config extends Logging { .stringConf .createWithDefault("default") + val CONTAINER_IMAGE = + ConfigBuilder("spark.kubernetes.container.image") + .doc("Container image to use for Spark containers. Individual container types " + + "(e.g. driver or executor) can also be configured to use different images if desired, " + + "by setting the container type-specific image name.") + .stringConf + .createOptional + val DRIVER_CONTAINER_IMAGE = ConfigBuilder("spark.kubernetes.driver.container.image") .doc("Container image to use for the driver.") - .stringConf - .createOptional + .fallbackConf(CONTAINER_IMAGE) val EXECUTOR_CONTAINER_IMAGE = ConfigBuilder("spark.kubernetes.executor.container.image") .doc("Container image to use for the executors.") - .stringConf - .createOptional + .fallbackConf(CONTAINER_IMAGE) val CONTAINER_IMAGE_PULL_POLICY = ConfigBuilder("spark.kubernetes.container.image.pullPolicy") @@ -148,8 +154,7 @@ private[spark] object Config extends Logging { val INIT_CONTAINER_IMAGE = ConfigBuilder("spark.kubernetes.initContainer.image") .doc("Image for the driver and executor's init-container for downloading dependencies.") - .stringConf - .createOptional + .fallbackConf(CONTAINER_IMAGE) val INIT_CONTAINER_MOUNT_TIMEOUT = ConfigBuilder("spark.kubernetes.mountDependencies.timeout") diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala index 111cb2a3b75e5..9411956996843 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala @@ -60,10 +60,9 @@ private[spark] object Constants { val ENV_APPLICATION_ID = "SPARK_APPLICATION_ID" val ENV_EXECUTOR_ID = "SPARK_EXECUTOR_ID" val ENV_EXECUTOR_POD_IP = "SPARK_EXECUTOR_POD_IP" - val ENV_EXECUTOR_EXTRA_CLASSPATH = "SPARK_EXECUTOR_EXTRA_CLASSPATH" val ENV_MOUNTED_CLASSPATH = "SPARK_MOUNTED_CLASSPATH" val ENV_JAVA_OPT_PREFIX = "SPARK_JAVA_OPT_" - val ENV_SUBMIT_EXTRA_CLASSPATH = "SPARK_SUBMIT_EXTRA_CLASSPATH" + val ENV_CLASSPATH = "SPARK_CLASSPATH" val ENV_DRIVER_MAIN_CLASS = "SPARK_DRIVER_CLASS" val ENV_DRIVER_ARGS = "SPARK_DRIVER_ARGS" val ENV_DRIVER_JAVA_OPTS = "SPARK_DRIVER_JAVA_OPTS" diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala index dfeccf9e2bd1c..f6a57dfe00171 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala @@ -77,6 +77,7 @@ private[spark] class InitContainerBootstrap( .withMountPath(INIT_CONTAINER_PROPERTIES_FILE_DIR) .endVolumeMount() .addToVolumeMounts(sharedVolumeMounts: _*) + .addToArgs("init") 
.addToArgs(INIT_CONTAINER_PROPERTIES_FILE_PATH) .build() diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala index eca46b84c6066..164e2e5594778 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala @@ -66,7 +66,7 @@ private[spark] class BasicDriverConfigurationStep( override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { val driverExtraClasspathEnv = driverExtraClasspath.map { classPath => new EnvVarBuilder() - .withName(ENV_SUBMIT_EXTRA_CLASSPATH) + .withName(ENV_CLASSPATH) .withValue(classPath) .build() } @@ -133,6 +133,7 @@ private[spark] class BasicDriverConfigurationStep( .addToLimits("memory", driverMemoryLimitQuantity) .addToLimits(maybeCpuLimitQuantity.toMap.asJava) .endResources() + .addToArgs("driver") .build() val baseDriverPod = new PodBuilder(driverSpec.driverPod) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index bcacb3934d36a..141bd2827e7c5 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -128,7 +128,7 @@ private[spark] class ExecutorPodFactory( .build() val executorExtraClasspathEnv = executorExtraClasspath.map { cp => new EnvVarBuilder() - .withName(ENV_EXECUTOR_EXTRA_CLASSPATH) + .withName(ENV_CLASSPATH) .withValue(cp) .build() } @@ -181,6 +181,7 @@ private[spark] class ExecutorPodFactory( .endResources() .addAllToEnv(executorEnv.asJava) .withPorts(requiredPorts.asJava) + .addToArgs("executor") .build() val executorPod = new PodBuilder() diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala index f193b1f4d3664..65274c6f50e01 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala @@ -34,8 +34,7 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { private val SECRET_MOUNT_PATH = "/etc/secrets/driver" test("Base submission steps with a main app resource.") { - val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + val sparkConf = new SparkConf(false).set(CONTAINER_IMAGE, DRIVER_IMAGE) val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") val orchestrator = new DriverConfigOrchestrator( APP_ID, @@ -55,8 +54,7 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { } test("Base submission steps without a main app resource.") { - val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + val sparkConf = new SparkConf(false).set(CONTAINER_IMAGE, 
DRIVER_IMAGE) val orchestrator = new DriverConfigOrchestrator( APP_ID, LAUNCH_TIME, @@ -75,8 +73,8 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { test("Submission steps with an init-container.") { val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) - .set(INIT_CONTAINER_IMAGE, IC_IMAGE) + .set(CONTAINER_IMAGE, DRIVER_IMAGE) + .set(INIT_CONTAINER_IMAGE.key, IC_IMAGE) .set("spark.jars", "hdfs://localhost:9000/var/apps/jars/jar1.jar") val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") val orchestrator = new DriverConfigOrchestrator( @@ -98,7 +96,7 @@ class DriverConfigOrchestratorSuite extends SparkFunSuite { test("Submission steps with driver secrets to mount") { val sparkConf = new SparkConf(false) - .set(DRIVER_CONTAINER_IMAGE, DRIVER_IMAGE) + .set(CONTAINER_IMAGE, DRIVER_IMAGE) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_FOO", SECRET_MOUNT_PATH) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_BAR", SECRET_MOUNT_PATH) val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala index 8ee629ac8ddc1..b136f2c02ffba 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala @@ -47,7 +47,7 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { .set(KUBERNETES_DRIVER_LIMIT_CORES, "4") .set(org.apache.spark.internal.config.DRIVER_MEMORY.key, "256M") .set(org.apache.spark.internal.config.DRIVER_MEMORY_OVERHEAD, 200L) - .set(DRIVER_CONTAINER_IMAGE, "spark-driver:latest") + .set(CONTAINER_IMAGE, "spark-driver:latest") .set(s"$KUBERNETES_DRIVER_ANNOTATION_PREFIX$CUSTOM_ANNOTATION_KEY", CUSTOM_ANNOTATION_VALUE) .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY1", "customDriverEnv1") .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY2", "customDriverEnv2") @@ -79,7 +79,7 @@ class BasicDriverConfigurationStepSuite extends SparkFunSuite { .asScala .map(env => (env.getName, env.getValue)) .toMap - assert(envs(ENV_SUBMIT_EXTRA_CLASSPATH) === "/opt/spark/spark-examples.jar") + assert(envs(ENV_CLASSPATH) === "/opt/spark/spark-examples.jar") assert(envs(ENV_DRIVER_MEMORY) === "256M") assert(envs(ENV_DRIVER_MAIN_CLASS) === MAIN_CLASS) assert(envs(ENV_DRIVER_ARGS) === "arg1 arg2 \"arg 3\"") diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala index 20f2e5bc15df3..09b42e4484d86 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala @@ -40,7 +40,7 @@ class InitContainerConfigOrchestratorSuite extends SparkFunSuite { test("including basic configuration step") { val sparkConf = new SparkConf(true) - 
.set(INIT_CONTAINER_IMAGE, DOCKER_IMAGE) + .set(CONTAINER_IMAGE, DOCKER_IMAGE) .set(s"$KUBERNETES_DRIVER_LABEL_PREFIX$CUSTOM_LABEL_KEY", CUSTOM_LABEL_VALUE) val orchestrator = new InitContainerConfigOrchestrator( @@ -59,7 +59,7 @@ class InitContainerConfigOrchestratorSuite extends SparkFunSuite { test("including step to mount user-specified secrets") { val sparkConf = new SparkConf(false) - .set(INIT_CONTAINER_IMAGE, DOCKER_IMAGE) + .set(CONTAINER_IMAGE, DOCKER_IMAGE) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_FOO", SECRET_MOUNT_PATH) .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_BAR", SECRET_MOUNT_PATH) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala index 7cfbe54c95390..a3c615be031d2 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -54,7 +54,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef baseConf = new SparkConf() .set(KUBERNETES_DRIVER_POD_NAME, driverPodName) .set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, executorPrefix) - .set(EXECUTOR_CONTAINER_IMAGE, executorImage) + .set(CONTAINER_IMAGE, executorImage) } test("basic executor pod has reasonable defaults") { @@ -107,7 +107,7 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef checkEnv(executor, Map("SPARK_JAVA_OPT_0" -> "foo=bar", - "SPARK_EXECUTOR_EXTRA_CLASSPATH" -> "bar=baz", + ENV_CLASSPATH -> "bar=baz", "qux" -> "quux")) checkOwnerReferences(executor, driverPodUid) } diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile deleted file mode 100644 index 45fbcd9cd0deb..0000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/driver/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FROM spark-base - -# Before building the docker image, first build and make a Spark distribution following -# the instructions in http://spark.apache.org/docs/latest/building-spark.html. -# If this docker file is being used in the context of building your images from a Spark -# distribution, the docker build command should be invoked from the top level directory -# of the Spark distribution. E.g.: -# docker build -t spark-driver:latest -f kubernetes/dockerfiles/driver/Dockerfile . 
- -COPY examples /opt/spark/examples - -CMD SPARK_CLASSPATH="${SPARK_HOME}/jars/*" && \ - env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt && \ - readarray -t SPARK_DRIVER_JAVA_OPTS < /tmp/java_opts.txt && \ - if ! [ -z ${SPARK_MOUNTED_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_MOUNTED_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_SUBMIT_EXTRA_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_SUBMIT_EXTRA_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_MOUNTED_FILES_DIR+x} ]; then cp -R "$SPARK_MOUNTED_FILES_DIR/." .; fi && \ - ${JAVA_HOME}/bin/java "${SPARK_DRIVER_JAVA_OPTS[@]}" -cp "$SPARK_CLASSPATH" -Xms$SPARK_DRIVER_MEMORY -Xmx$SPARK_DRIVER_MEMORY -Dspark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS $SPARK_DRIVER_CLASS $SPARK_DRIVER_ARGS diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile deleted file mode 100644 index 0f806cf7e148e..0000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/executor/Dockerfile +++ /dev/null @@ -1,35 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FROM spark-base - -# Before building the docker image, first build and make a Spark distribution following -# the instructions in http://spark.apache.org/docs/latest/building-spark.html. -# If this docker file is being used in the context of building your images from a Spark -# distribution, the docker build command should be invoked from the top level directory -# of the Spark distribution. E.g.: -# docker build -t spark-executor:latest -f kubernetes/dockerfiles/executor/Dockerfile . - -COPY examples /opt/spark/examples - -CMD SPARK_CLASSPATH="${SPARK_HOME}/jars/*" && \ - env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt && \ - readarray -t SPARK_EXECUTOR_JAVA_OPTS < /tmp/java_opts.txt && \ - if ! [ -z ${SPARK_MOUNTED_CLASSPATH}+x} ]; then SPARK_CLASSPATH="$SPARK_MOUNTED_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_EXECUTOR_EXTRA_CLASSPATH+x} ]; then SPARK_CLASSPATH="$SPARK_EXECUTOR_EXTRA_CLASSPATH:$SPARK_CLASSPATH"; fi && \ - if ! [ -z ${SPARK_MOUNTED_FILES_DIR+x} ]; then cp -R "$SPARK_MOUNTED_FILES_DIR/." 
.; fi && \ - ${JAVA_HOME}/bin/java "${SPARK_EXECUTOR_JAVA_OPTS[@]}" -Xms$SPARK_EXECUTOR_MEMORY -Xmx$SPARK_EXECUTOR_MEMORY -cp "$SPARK_CLASSPATH" org.apache.spark.executor.CoarseGrainedExecutorBackend --driver-url $SPARK_DRIVER_URL --executor-id $SPARK_EXECUTOR_ID --cores $SPARK_EXECUTOR_CORES --app-id $SPARK_APPLICATION_ID --hostname $SPARK_EXECUTOR_POD_IP diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile deleted file mode 100644 index 047056ab2633b..0000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/init-container/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FROM spark-base - -# If this docker file is being used in the context of building your images from a Spark distribution, the docker build -# command should be invoked from the top level directory of the Spark distribution. E.g.: -# docker build -t spark-init:latest -f kubernetes/dockerfiles/init-container/Dockerfile . - -ENTRYPOINT [ "/opt/entrypoint.sh", "/opt/spark/bin/spark-class", "org.apache.spark.deploy.k8s.SparkPodInitContainer" ] diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh deleted file mode 100755 index 82559889f4beb..0000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/entrypoint.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -# echo commands to the terminal output -set -ex - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -uidentry=$(getent passwd $myuid) - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:anonymous uid:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -# Execute the container CMD under tini for better hygiene -/sbin/tini -s -- "$@" diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile similarity index 87% rename from resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile rename to resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile index da1d6b9e161cc..491b7cf692478 100644 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark-base/Dockerfile +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile @@ -17,15 +17,15 @@ FROM openjdk:8-alpine -ARG spark_jars -ARG img_path +ARG spark_jars=jars +ARG img_path=kubernetes/dockerfiles # Before building the docker image, first build and make a Spark distribution following # the instructions in http://spark.apache.org/docs/latest/building-spark.html. # If this docker file is being used in the context of building your images from a Spark # distribution, the docker build command should be invoked from the top level directory # of the Spark distribution. E.g.: -# docker build -t spark-base:latest -f kubernetes/dockerfiles/spark-base/Dockerfile . +# docker build -t spark:latest -f kubernetes/dockerfiles/spark/Dockerfile . RUN set -ex && \ apk upgrade --no-cache && \ @@ -41,7 +41,9 @@ COPY ${spark_jars} /opt/spark/jars COPY bin /opt/spark/bin COPY sbin /opt/spark/sbin COPY conf /opt/spark/conf -COPY ${img_path}/spark-base/entrypoint.sh /opt/ +COPY ${img_path}/spark/entrypoint.sh /opt/ +COPY examples /opt/spark/examples +COPY data /opt/spark/data ENV SPARK_HOME /opt/spark diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh new file mode 100755 index 0000000000000..0c28c75857871 --- /dev/null +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# echo commands to the terminal output +set -ex + +# Check whether there is a passwd entry for the container UID +myuid=$(id -u) +mygid=$(id -g) +uidentry=$(getent passwd $myuid) + +# If there is no passwd entry for the container UID, attempt to create one +if [ -z "$uidentry" ] ; then + if [ -w /etc/passwd ] ; then + echo "$myuid:x:$myuid:$mygid:anonymous uid:$SPARK_HOME:/bin/false" >> /etc/passwd + else + echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" + fi +fi + +SPARK_K8S_CMD="$1" +if [ -z "$SPARK_K8S_CMD" ]; then + echo "No command to execute has been provided." 1>&2 + exit 1 +fi +shift 1 + +SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" +env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt +readarray -t SPARK_DRIVER_JAVA_OPTS < /tmp/java_opts.txt +if [ -n "$SPARK_MOUNTED_CLASSPATH" ]; then + SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_MOUNTED_CLASSPATH" +fi +if [ -n "$SPARK_MOUNTED_FILES_DIR" ]; then + cp -R "$SPARK_MOUNTED_FILES_DIR/." . +fi + +case "$SPARK_K8S_CMD" in + driver) + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_DRIVER_JAVA_OPTS[@]}" + -cp "$SPARK_CLASSPATH" + -Xms$SPARK_DRIVER_MEMORY + -Xmx$SPARK_DRIVER_MEMORY + -Dspark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS + $SPARK_DRIVER_CLASS + $SPARK_DRIVER_ARGS + ) + ;; + + executor) + CMD=( + ${JAVA_HOME}/bin/java + "${SPARK_EXECUTOR_JAVA_OPTS[@]}" + -Xms$SPARK_EXECUTOR_MEMORY + -Xmx$SPARK_EXECUTOR_MEMORY + -cp "$SPARK_CLASSPATH" + org.apache.spark.executor.CoarseGrainedExecutorBackend + --driver-url $SPARK_DRIVER_URL + --executor-id $SPARK_EXECUTOR_ID + --cores $SPARK_EXECUTOR_CORES + --app-id $SPARK_APPLICATION_ID + --hostname $SPARK_EXECUTOR_POD_IP + ) + ;; + + init) + CMD=( + "$SPARK_HOME/bin/spark-class" + "org.apache.spark.deploy.k8s.SparkPodInitContainer" + "$@" + ) + ;; + + *) + echo "Unknown command: $SPARK_K8S_CMD" 1>&2 + exit 1 +esac + +# Execute the container CMD under tini for better hygiene +exec /sbin/tini -s -- "${CMD[@]}" From 6f7aaed805070d29dcba32e04ca7a1f581fa54b9 Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Thu, 11 Jan 2018 10:52:12 -0800 Subject: [PATCH 316/356] [SPARK-22908] Add kafka source and sink for continuous processing. ## What changes were proposed in this pull request? Add kafka source and sink for continuous processing. This involves two small changes to the execution engine: * Bring data reader close() into the normal data reader thread to avoid thread safety issues. * Fix up the semantics of the RECONFIGURING StreamExecution state. State updates are now atomic, and we don't have to deal with swallowing an exception. ## How was this patch tested? new unit tests Author: Jose Torres Closes #20096 from jose-torres/continuous-kafka. 
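For readers skimming this patch, the sketch below (not part of the patch itself) shows how the new continuous Kafka source and sink are driven from user code. It is assembled from the usage exercised in the added test suites (KafkaContinuousSourceSuite / KafkaContinuousSinkSuite); the broker address `host:9092`, the topic names `in` and `out`, and the checkpoint path are hypothetical placeholders, and the 1000 ms continuous trigger interval is only an illustrative value.

```scala
// Minimal sketch, assuming a reachable Kafka broker at host:9092 and existing topics "in"/"out".
// Option names and the Trigger.Continuous usage mirror the test code added in this patch.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

val spark = SparkSession.builder().appName("continuous-kafka-sketch").getOrCreate()

// Continuous Kafka source: same format("kafka") and options as the micro-batch source.
val input = spark
  .readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host:9092")   // placeholder broker address
  .option("subscribe", "in")                        // placeholder input topic
  .option("startingOffsets", "earliest")
  .load()

// Continuous Kafka sink: the value column must be string or binary, so cast it explicitly.
val query = input
  .selectExpr("CAST(key AS STRING) AS key", "CAST(value AS STRING) AS value")
  .writeStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "host:9092")
  .option("topic", "out")                                   // placeholder output topic
  .option("checkpointLocation", "/tmp/checkpoints/ck")      // placeholder checkpoint path
  .trigger(Trigger.Continuous(1000))                        // commit an epoch roughly every 1000 ms
  .start()
```

As the patch's DataStreamReader/DataStreamWriter changes suggest, the main user-visible switch into continuous mode is the `Trigger.Continuous(...)` call; the `format("kafka")` source/sink names and their options are unchanged from micro-batch streaming.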
--- .../sql/kafka010/KafkaContinuousReader.scala | 232 +++++++++ .../sql/kafka010/KafkaContinuousWriter.scala | 119 +++++ .../sql/kafka010/KafkaOffsetReader.scala | 21 +- .../spark/sql/kafka010/KafkaSource.scala | 17 +- .../sql/kafka010/KafkaSourceOffset.scala | 7 +- .../sql/kafka010/KafkaSourceProvider.scala | 105 +++- .../spark/sql/kafka010/KafkaWriteTask.scala | 71 ++- .../spark/sql/kafka010/KafkaWriter.scala | 5 +- .../kafka010/KafkaContinuousSinkSuite.scala | 474 ++++++++++++++++++ .../kafka010/KafkaContinuousSourceSuite.scala | 96 ++++ .../sql/kafka010/KafkaContinuousTest.scala | 64 +++ .../spark/sql/kafka010/KafkaSourceSuite.scala | 470 +++++++++-------- .../apache/spark/sql/DataFrameReader.scala | 32 +- .../apache/spark/sql/DataFrameWriter.scala | 25 +- .../datasources/v2/WriteToDataSourceV2.scala | 8 +- .../execution/streaming/StreamExecution.scala | 15 +- .../ContinuousDataSourceRDDIter.scala | 3 +- .../continuous/ContinuousExecution.scala | 67 ++- .../continuous/EpochCoordinator.scala | 21 +- .../sql/streaming/DataStreamWriter.scala | 26 +- .../spark/sql/streaming/StreamTest.scala | 36 +- 21 files changed, 1531 insertions(+), 383 deletions(-) create mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala create mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala create mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala new file mode 100644 index 0000000000000..928379544758c --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.kafka010 + +import java.{util => ju} + +import org.apache.kafka.clients.consumer.ConsumerRecord +import org.apache.kafka.common.TopicPartition + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.expressions.UnsafeRow +import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.kafka010.KafkaSource.{INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE, INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE} +import org.apache.spark.sql.sources.v2.reader._ +import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} +import org.apache.spark.sql.types.StructType +import org.apache.spark.unsafe.types.UTF8String + +/** + * A [[ContinuousReader]] for data from kafka. + * + * @param offsetReader a reader used to get kafka offsets. Note that the actual data will be + * read by per-task consumers generated later. + * @param kafkaParams String params for per-task Kafka consumers. + * @param sourceOptions The [[org.apache.spark.sql.sources.v2.DataSourceV2Options]] params which + * are not Kafka consumer params. + * @param metadataPath Path to a directory this reader can use for writing metadata. + * @param initialOffsets The Kafka offsets to start reading data at. + * @param failOnDataLoss Flag indicating whether reading should fail in data loss + * scenarios, where some offsets after the specified initial ones can't be + * properly read. + */ +class KafkaContinuousReader( + offsetReader: KafkaOffsetReader, + kafkaParams: ju.Map[String, Object], + sourceOptions: Map[String, String], + metadataPath: String, + initialOffsets: KafkaOffsetRangeLimit, + failOnDataLoss: Boolean) + extends ContinuousReader with SupportsScanUnsafeRow with Logging { + + private lazy val session = SparkSession.getActiveSession.get + private lazy val sc = session.sparkContext + + // Initialized when creating read tasks. If this diverges from the partitions at the latest + // offsets, we need to reconfigure. + // Exposed outside this object only for unit tests. 
+ private[sql] var knownPartitions: Set[TopicPartition] = _ + + override def readSchema: StructType = KafkaOffsetReader.kafkaSchema + + private var offset: Offset = _ + override def setOffset(start: ju.Optional[Offset]): Unit = { + offset = start.orElse { + val offsets = initialOffsets match { + case EarliestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchEarliestOffsets()) + case LatestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchLatestOffsets()) + case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) + } + logInfo(s"Initial offsets: $offsets") + offsets + } + } + + override def getStartOffset(): Offset = offset + + override def deserializeOffset(json: String): Offset = { + KafkaSourceOffset(JsonUtils.partitionOffsets(json)) + } + + override def createUnsafeRowReadTasks(): ju.List[ReadTask[UnsafeRow]] = { + import scala.collection.JavaConverters._ + + val oldStartPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(offset) + + val currentPartitionSet = offsetReader.fetchEarliestOffsets().keySet + val newPartitions = currentPartitionSet.diff(oldStartPartitionOffsets.keySet) + val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) + + val deletedPartitions = oldStartPartitionOffsets.keySet.diff(currentPartitionSet) + if (deletedPartitions.nonEmpty) { + reportDataLoss(s"Some partitions were deleted: $deletedPartitions") + } + + val startOffsets = newPartitionOffsets ++ + oldStartPartitionOffsets.filterKeys(!deletedPartitions.contains(_)) + knownPartitions = startOffsets.keySet + + startOffsets.toSeq.map { + case (topicPartition, start) => + KafkaContinuousReadTask( + topicPartition, start, kafkaParams, failOnDataLoss) + .asInstanceOf[ReadTask[UnsafeRow]] + }.asJava + } + + /** Stop this source and free any resources it has allocated. */ + def stop(): Unit = synchronized { + offsetReader.close() + } + + override def commit(end: Offset): Unit = {} + + override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { + val mergedMap = offsets.map { + case KafkaSourcePartitionOffset(p, o) => Map(p -> o) + }.reduce(_ ++ _) + KafkaSourceOffset(mergedMap) + } + + override def needsReconfiguration(): Boolean = { + knownPartitions != null && offsetReader.fetchLatestOffsets().keySet != knownPartitions + } + + override def toString(): String = s"KafkaSource[$offsetReader]" + + /** + * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. + * Otherwise, just log a warning. + */ + private def reportDataLoss(message: String): Unit = { + if (failOnDataLoss) { + throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") + } else { + logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") + } + } +} + +/** + * A read task for continuous Kafka processing. This will be serialized and transformed into a + * full reader on executors. + * + * @param topicPartition The (topic, partition) pair this task is responsible for. + * @param startOffset The offset to start reading from within the partition. + * @param kafkaParams Kafka consumer params to use. + * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets + * are skipped. 
+ */ +case class KafkaContinuousReadTask( + topicPartition: TopicPartition, + startOffset: Long, + kafkaParams: ju.Map[String, Object], + failOnDataLoss: Boolean) extends ReadTask[UnsafeRow] { + override def createDataReader(): KafkaContinuousDataReader = { + new KafkaContinuousDataReader(topicPartition, startOffset, kafkaParams, failOnDataLoss) + } +} + +/** + * A per-task data reader for continuous Kafka processing. + * + * @param topicPartition The (topic, partition) pair this data reader is responsible for. + * @param startOffset The offset to start reading from within the partition. + * @param kafkaParams Kafka consumer params to use. + * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets + * are skipped. + */ +class KafkaContinuousDataReader( + topicPartition: TopicPartition, + startOffset: Long, + kafkaParams: ju.Map[String, Object], + failOnDataLoss: Boolean) extends ContinuousDataReader[UnsafeRow] { + private val topic = topicPartition.topic + private val kafkaPartition = topicPartition.partition + private val consumer = CachedKafkaConsumer.createUncached(topic, kafkaPartition, kafkaParams) + + private val sharedRow = new UnsafeRow(7) + private val bufferHolder = new BufferHolder(sharedRow) + private val rowWriter = new UnsafeRowWriter(bufferHolder, 7) + + private var nextKafkaOffset = startOffset + private var currentRecord: ConsumerRecord[Array[Byte], Array[Byte]] = _ + + override def next(): Boolean = { + var r: ConsumerRecord[Array[Byte], Array[Byte]] = null + while (r == null) { + r = consumer.get( + nextKafkaOffset, + untilOffset = Long.MaxValue, + pollTimeoutMs = Long.MaxValue, + failOnDataLoss) + } + nextKafkaOffset = r.offset + 1 + currentRecord = r + true + } + + override def get(): UnsafeRow = { + bufferHolder.reset() + + if (currentRecord.key == null) { + rowWriter.setNullAt(0) + } else { + rowWriter.write(0, currentRecord.key) + } + rowWriter.write(1, currentRecord.value) + rowWriter.write(2, UTF8String.fromString(currentRecord.topic)) + rowWriter.write(3, currentRecord.partition) + rowWriter.write(4, currentRecord.offset) + rowWriter.write(5, + DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(currentRecord.timestamp))) + rowWriter.write(6, currentRecord.timestampType.id) + sharedRow.setTotalSize(bufferHolder.totalSize) + sharedRow + } + + override def getOffset(): KafkaSourcePartitionOffset = { + KafkaSourcePartitionOffset(topicPartition, nextKafkaOffset) + } + + override def close(): Unit = { + consumer.close() + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala new file mode 100644 index 0000000000000..9843f469c5b25 --- /dev/null +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import org.apache.kafka.clients.producer.{Callback, ProducerRecord, RecordMetadata} +import scala.collection.JavaConverters._ + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, UnsafeProjection} +import org.apache.spark.sql.kafka010.KafkaSourceProvider.{kafkaParamsForProducer, TOPIC_OPTION_KEY} +import org.apache.spark.sql.kafka010.KafkaWriter.validateQuery +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter +import org.apache.spark.sql.sources.v2.writer._ +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.types.{BinaryType, StringType, StructType} + +/** + * Dummy commit message. The DataSourceV2 framework requires a commit message implementation but we + * don't need to really send one. + */ +case object KafkaWriterCommitMessage extends WriterCommitMessage + +/** + * A [[ContinuousWriter]] for Kafka writing. Responsible for generating the writer factory. + * @param topic The topic this writer is responsible for. If None, topic will be inferred from + * a `topic` field in the incoming data. + * @param producerParams Parameters for Kafka producers in each task. + * @param schema The schema of the input data. + */ +class KafkaContinuousWriter( + topic: Option[String], producerParams: Map[String, String], schema: StructType) + extends ContinuousWriter with SupportsWriteInternalRow { + + validateQuery(schema.toAttributes, producerParams.toMap[String, Object].asJava, topic) + + override def createInternalRowWriterFactory(): KafkaContinuousWriterFactory = + KafkaContinuousWriterFactory(topic, producerParams, schema) + + override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} + override def abort(messages: Array[WriterCommitMessage]): Unit = {} +} + +/** + * A [[DataWriterFactory]] for Kafka writing. Will be serialized and sent to executors to generate + * the per-task data writers. + * @param topic The topic that should be written to. If None, topic will be inferred from + * a `topic` field in the incoming data. + * @param producerParams Parameters for Kafka producers in each task. + * @param schema The schema of the input data. + */ +case class KafkaContinuousWriterFactory( + topic: Option[String], producerParams: Map[String, String], schema: StructType) + extends DataWriterFactory[InternalRow] { + + override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = { + new KafkaContinuousDataWriter(topic, producerParams, schema.toAttributes) + } +} + +/** + * A [[DataWriter]] for Kafka writing. One data writer will be created in each partition to + * process incoming rows. + * + * @param targetTopic The topic that this data writer is targeting. If None, topic will be inferred + * from a `topic` field in the incoming data. + * @param producerParams Parameters to use for the Kafka producer. + * @param inputSchema The attributes in the input data. 
+ */ +class KafkaContinuousDataWriter( + targetTopic: Option[String], producerParams: Map[String, String], inputSchema: Seq[Attribute]) + extends KafkaRowWriter(inputSchema, targetTopic) with DataWriter[InternalRow] { + import scala.collection.JavaConverters._ + + private lazy val producer = CachedKafkaProducer.getOrCreate( + new java.util.HashMap[String, Object](producerParams.asJava)) + + def write(row: InternalRow): Unit = { + checkForErrors() + sendRow(row, producer) + } + + def commit(): WriterCommitMessage = { + // Send is asynchronous, but we can't commit until all rows are actually in Kafka. + // This requires flushing and then checking that no callbacks produced errors. + // We also check for errors before to fail as soon as possible - the check is cheap. + checkForErrors() + producer.flush() + checkForErrors() + KafkaWriterCommitMessage + } + + def abort(): Unit = {} + + def close(): Unit = { + checkForErrors() + if (producer != null) { + producer.flush() + checkForErrors() + CachedKafkaProducer.close(new java.util.HashMap[String, Object](producerParams.asJava)) + } + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala index 3e65949a6fd1b..551641cfdbca8 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala @@ -117,10 +117,14 @@ private[kafka010] class KafkaOffsetReader( * Resolves the specific offsets based on Kafka seek positions. * This method resolves offset value -1 to the latest and -2 to the * earliest Kafka seek position. + * + * @param partitionOffsets the specific offsets to resolve + * @param reportDataLoss callback to either report or log data loss depending on setting */ def fetchSpecificOffsets( - partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = - runUninterruptibly { + partitionOffsets: Map[TopicPartition, Long], + reportDataLoss: String => Unit): KafkaSourceOffset = { + val fetched = runUninterruptibly { withRetriesWithoutInterrupt { // Poll to get the latest assigned partitions consumer.poll(0) @@ -145,6 +149,19 @@ private[kafka010] class KafkaOffsetReader( } } + partitionOffsets.foreach { + case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && + off != KafkaOffsetRangeLimit.EARLIEST => + if (fetched(tp) != off) { + reportDataLoss( + s"startingOffsets for $tp was $off but consumer reset to ${fetched(tp)}") + } + case _ => + // no real way to check that beginning or end is reasonable + } + KafkaSourceOffset(fetched) + } + /** * Fetch the earliest offsets for the topic partitions that are indicated * in the [[ConsumerStrategy]]. 
diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index e9cff04ba5f2e..27da76068a66f 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -130,7 +130,7 @@ private[kafka010] class KafkaSource( val offsets = startingOffsets match { case EarliestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchLatestOffsets()) - case SpecificOffsetRangeLimit(p) => fetchAndVerify(p) + case SpecificOffsetRangeLimit(p) => kafkaReader.fetchSpecificOffsets(p, reportDataLoss) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") @@ -138,21 +138,6 @@ private[kafka010] class KafkaSource( }.partitionToOffsets } - private def fetchAndVerify(specificOffsets: Map[TopicPartition, Long]) = { - val result = kafkaReader.fetchSpecificOffsets(specificOffsets) - specificOffsets.foreach { - case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && - off != KafkaOffsetRangeLimit.EARLIEST => - if (result(tp) != off) { - reportDataLoss( - s"startingOffsets for $tp was $off but consumer reset to ${result(tp)}") - } - case _ => - // no real way to check that beginning or end is reasonable - } - KafkaSourceOffset(result) - } - private var currentPartitionOffsets: Option[Map[TopicPartition, Long]] = None override def schema: StructType = KafkaOffsetReader.kafkaSchema diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala index b5da415b3097e..c82154cfbad7f 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala @@ -20,17 +20,22 @@ package org.apache.spark.sql.kafka010 import org.apache.kafka.common.TopicPartition import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} +import org.apache.spark.sql.sources.v2.streaming.reader.{Offset => OffsetV2, PartitionOffset} /** * An [[Offset]] for the [[KafkaSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
*/ private[kafka010] -case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends Offset { +case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends OffsetV2 { override val json = JsonUtils.partitionOffsets(partitionToOffsets) } +private[kafka010] +case class KafkaSourcePartitionOffset(topicPartition: TopicPartition, partitionOffset: Long) + extends PartitionOffset + /** Companion object of the [[KafkaSourceOffset]] */ private[kafka010] object KafkaSourceOffset { diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 3cb4d8cad12cc..3914370a96595 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} -import java.util.{Locale, UUID} +import java.util.{Locale, Optional, UUID} import scala.collection.JavaConverters._ @@ -27,9 +27,12 @@ import org.apache.kafka.clients.producer.ProducerConfig import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer} import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} -import org.apache.spark.sql.execution.streaming.{Sink, Source} +import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SparkSession, SQLContext} +import org.apache.spark.sql.execution.streaming.{Offset, Sink, Source} import org.apache.spark.sql.sources._ +import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} +import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} +import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType @@ -43,6 +46,8 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister with StreamSinkProvider with RelationProvider with CreatableRelationProvider + with ContinuousWriteSupport + with ContinuousReadSupport with Logging { import KafkaSourceProvider._ @@ -101,6 +106,43 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister failOnDataLoss(caseInsensitiveParams)) } + override def createContinuousReader( + schema: Optional[StructType], + metadataPath: String, + options: DataSourceV2Options): KafkaContinuousReader = { + val parameters = options.asMap().asScala.toMap + validateStreamOptions(parameters) + // Each running query should use its own group id. Otherwise, the query may be only assigned + // partial data since Kafka will assign partitions to multiple consumers having the same group + // id. Hence, we should generate a unique id for each query. 
+ val uniqueGroupId = s"spark-kafka-source-${UUID.randomUUID}-${metadataPath.hashCode}" + + val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } + val specifiedKafkaParams = + parameters + .keySet + .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) + .map { k => k.drop(6).toString -> parameters(k) } + .toMap + + val startingStreamOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(caseInsensitiveParams, + STARTING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) + + val kafkaOffsetReader = new KafkaOffsetReader( + strategy(caseInsensitiveParams), + kafkaParamsForDriver(specifiedKafkaParams), + parameters, + driverGroupIdPrefix = s"$uniqueGroupId-driver") + + new KafkaContinuousReader( + kafkaOffsetReader, + kafkaParamsForExecutors(specifiedKafkaParams, uniqueGroupId), + parameters, + metadataPath, + startingStreamOffsets, + failOnDataLoss(caseInsensitiveParams)) + } + /** * Returns a new base relation with the given parameters. * @@ -181,26 +223,22 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister } } - private def kafkaParamsForProducer(parameters: Map[String, String]): Map[String, String] = { - val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } - if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " - + "are serialized with ByteArraySerializer.") - } + override def createContinuousWriter( + queryId: String, + schema: StructType, + mode: OutputMode, + options: DataSourceV2Options): Optional[ContinuousWriter] = { + import scala.collection.JavaConverters._ - if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) - { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " - + "value are serialized with ByteArraySerializer.") - } - parameters - .keySet - .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) - .map { k => k.drop(6).toString -> parameters(k) } - .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, - ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) + val spark = SparkSession.getActiveSession.get + val topic = Option(options.get(TOPIC_OPTION_KEY).orElse(null)).map(_.trim) + // We convert the options argument from V2 -> Java map -> scala mutable -> scala immutable. 
+ val producerParams = kafkaParamsForProducer(options.asMap.asScala.toMap) + + KafkaWriter.validateQuery( + schema.toAttributes, new java.util.HashMap[String, Object](producerParams.asJava), topic) + + Optional.of(new KafkaContinuousWriter(topic, producerParams, schema)) } private def strategy(caseInsensitiveParams: Map[String, String]) = @@ -450,4 +488,27 @@ private[kafka010] object KafkaSourceProvider extends Logging { def build(): ju.Map[String, Object] = map } + + private[kafka010] def kafkaParamsForProducer( + parameters: Map[String, String]): Map[String, String] = { + val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " + + "are serialized with ByteArraySerializer.") + } + + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) + { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " + + "value are serialized with ByteArraySerializer.") + } + parameters + .keySet + .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) + .map { k => k.drop(6).toString -> parameters(k) } + .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, + ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) + } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala index 6fd333e2f43ba..baa60febf661d 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala @@ -33,10 +33,8 @@ import org.apache.spark.sql.types.{BinaryType, StringType} private[kafka010] class KafkaWriteTask( producerConfiguration: ju.Map[String, Object], inputSchema: Seq[Attribute], - topic: Option[String]) { + topic: Option[String]) extends KafkaRowWriter(inputSchema, topic) { // used to synchronize with Kafka callbacks - @volatile private var failedWrite: Exception = null - private val projection = createProjection private var producer: KafkaProducer[Array[Byte], Array[Byte]] = _ /** @@ -46,23 +44,7 @@ private[kafka010] class KafkaWriteTask( producer = CachedKafkaProducer.getOrCreate(producerConfiguration) while (iterator.hasNext && failedWrite == null) { val currentRow = iterator.next() - val projectedRow = projection(currentRow) - val topic = projectedRow.getUTF8String(0) - val key = projectedRow.getBinary(1) - val value = projectedRow.getBinary(2) - if (topic == null) { - throw new NullPointerException(s"null topic present in the data. 
Use the " + - s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") - } - val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) - val callback = new Callback() { - override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { - if (failedWrite == null && e != null) { - failedWrite = e - } - } - } - producer.send(record, callback) + sendRow(currentRow, producer) } } @@ -74,8 +56,49 @@ private[kafka010] class KafkaWriteTask( producer = null } } +} + +private[kafka010] abstract class KafkaRowWriter( + inputSchema: Seq[Attribute], topic: Option[String]) { + + // used to synchronize with Kafka callbacks + @volatile protected var failedWrite: Exception = _ + protected val projection = createProjection + + private val callback = new Callback() { + override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { + if (failedWrite == null && e != null) { + failedWrite = e + } + } + } - private def createProjection: UnsafeProjection = { + /** + * Send the specified row to the producer, with a callback that will save any exception + * to failedWrite. Note that send is asynchronous; subclasses must flush() their producer before + * assuming the row is in Kafka. + */ + protected def sendRow( + row: InternalRow, producer: KafkaProducer[Array[Byte], Array[Byte]]): Unit = { + val projectedRow = projection(row) + val topic = projectedRow.getUTF8String(0) + val key = projectedRow.getBinary(1) + val value = projectedRow.getBinary(2) + if (topic == null) { + throw new NullPointerException(s"null topic present in the data. Use the " + + s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") + } + val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) + producer.send(record, callback) + } + + protected def checkForErrors(): Unit = { + if (failedWrite != null) { + throw failedWrite + } + } + + private def createProjection = { val topicExpression = topic.map(Literal(_)).orElse { inputSchema.find(_.name == KafkaWriter.TOPIC_ATTRIBUTE_NAME) }.getOrElse { @@ -112,11 +135,5 @@ private[kafka010] class KafkaWriteTask( Seq(topicExpression, Cast(keyExpression, BinaryType), Cast(valueExpression, BinaryType)), inputSchema) } - - private def checkForErrors(): Unit = { - if (failedWrite != null) { - throw failedWrite - } - } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala index 5e9ae35b3f008..15cd44812cb0c 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala @@ -43,10 +43,9 @@ private[kafka010] object KafkaWriter extends Logging { override def toString: String = "KafkaWriter" def validateQuery( - queryExecution: QueryExecution, + schema: Seq[Attribute], kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { - val schema = queryExecution.analyzed.output schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse( if (topic.isEmpty) { throw new AnalysisException(s"topic option required when no " + @@ -84,7 +83,7 @@ private[kafka010] object KafkaWriter extends Logging { kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { val schema = queryExecution.analyzed.output - validateQuery(queryExecution, kafkaParameters, topic) + validateQuery(schema, 
kafkaParameters, topic) queryExecution.toRdd.foreachPartition { iter => val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala new file mode 100644 index 0000000000000..dfc97b1c38bb5 --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala @@ -0,0 +1,474 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.util.Locale +import java.util.concurrent.atomic.AtomicInteger + +import org.apache.kafka.clients.producer.ProducerConfig +import org.apache.kafka.common.serialization.ByteArraySerializer +import org.scalatest.time.SpanSugar._ +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, SpecificInternalRow, UnsafeProjection} +import org.apache.spark.sql.execution.streaming.MemoryStream +import org.apache.spark.sql.streaming._ +import org.apache.spark.sql.types.{BinaryType, DataType} +import org.apache.spark.util.Utils + +/** + * This is a temporary port of KafkaSinkSuite, since we do not yet have a V2 memory stream. + * Once we have one, this will be changed to a specialization of KafkaSinkSuite and we won't have + * to duplicate all the code. 
+ */ +class KafkaContinuousSinkSuite extends KafkaContinuousTest { + import testImplicits._ + + override val streamingTimeout = 30.seconds + + override def beforeAll(): Unit = { + super.beforeAll() + testUtils = new KafkaTestUtils( + withBrokerProps = Map("auto.create.topics.enable" -> "false")) + testUtils.setup() + } + + override def afterAll(): Unit = { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + } + super.afterAll() + } + + test("streaming - write to kafka with topic field") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = None, + withOutputMode = Some(OutputMode.Append))( + withSelectExpr = s"'$topic' as topic", "value") + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") + .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } finally { + writer.stop() + } + } + + test("streaming - write w/o topic field, with topic option") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = Some(topic), + withOutputMode = Some(OutputMode.Append()))() + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") + .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } finally { + writer.stop() + } + } + + test("streaming - topic field and topic option") { + /* The purpose of this test is to ensure that the topic option + * overrides the topic field. We begin by writing some data that + * includes a topic field and value (e.g., 'foo') along with a topic + * option. 
Then when we read from the topic specified in the option + * we should see the data i.e., the data was written to the topic + * option, and not to the topic in the data e.g., foo + */ + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + val topic = newTopic() + testUtils.createTopic(topic) + + val writer = createKafkaWriter( + input.toDF(), + withTopic = Some(topic), + withOutputMode = Some(OutputMode.Append()))( + withSelectExpr = "'foo' as topic", "CAST(value as STRING) value") + + val reader = createKafkaReader(topic) + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .selectExpr("CAST(key AS INT)", "CAST(value AS INT)") + .as[(Int, Int)] + .map(_._2) + + try { + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) + testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) + failAfter(streamingTimeout) { + writer.processAllAvailable() + } + checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) + } finally { + writer.stop() + } + } + + test("null topic attribute") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + testUtils.createTopic(topic) + + /* No topic field or topic option */ + var writer: StreamingQuery = null + var ex: Exception = null + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = "CAST(null as STRING) as topic", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getCause.getCause.getMessage + .toLowerCase(Locale.ROOT) + .contains("null topic present in the data.")) + } + + test("streaming - write data with bad schema") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + testUtils.createTopic(topic) + + /* No topic field or topic option */ + var writer: StreamingQuery = null + var ex: Exception = null + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = "value as key", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getMessage + .toLowerCase(Locale.ROOT) + .contains("topic option required when no 'topic' attribute is present")) + + try { + /* No value field */ + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "value as key" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "required attribute 'value' 
not found")) + } + + test("streaming - write data with valid schema but wrong types") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + .selectExpr("CAST(value as STRING) value") + val topic = newTopic() + testUtils.createTopic(topic) + + var writer: StreamingQuery = null + var ex: Exception = null + try { + /* topic field wrong type */ + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"CAST('1' as INT) as topic", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("topic type must be a string")) + + try { + /* value field wrong type */ + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "value attribute type must be a string or binarytype")) + + try { + ex = intercept[StreamingQueryException] { + /* key field wrong type */ + writer = createKafkaWriter(input.toDF())( + withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as key", "value" + ) + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + writer.processAllAvailable() + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "key attribute type must be a string or binarytype")) + } + + test("streaming - write to non-existing topic") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + val topic = newTopic() + + var writer: StreamingQuery = null + var ex: Exception = null + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter(input.toDF(), withTopic = Some(topic))() + testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) + eventually(timeout(streamingTimeout)) { + assert(writer.exception.isDefined) + } + throw writer.exception.get + } + } finally { + writer.stop() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("job aborted")) + } + + test("streaming - exception on config serializer") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 1) + testUtils.sendMessages(inputTopic, Array("0")) + + val input = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", inputTopic) + .load() + var writer: StreamingQuery = null + var ex: Exception = null + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter( + input.toDF(), + withOptions = Map("kafka.key.serializer" -> "foo"))() + writer.processAllAvailable() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "kafka option 'key.serializer' is not supported")) + } finally { + writer.stop() + } + + try { + ex = intercept[StreamingQueryException] { + writer = createKafkaWriter( 
+ input.toDF(), + withOptions = Map("kafka.value.serializer" -> "foo"))() + writer.processAllAvailable() + } + assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( + "kafka option 'value.serializer' is not supported")) + } finally { + writer.stop() + } + } + + test("generic - write big data with small producer buffer") { + /* This test ensures that we understand the semantics of Kafka when + * is comes to blocking on a call to send when the send buffer is full. + * This test will configure the smallest possible producer buffer and + * indicate that we should block when it is full. Thus, no exception should + * be thrown in the case of a full buffer. + */ + val topic = newTopic() + testUtils.createTopic(topic, 1) + val options = new java.util.HashMap[String, String] + options.put("bootstrap.servers", testUtils.brokerAddress) + options.put("buffer.memory", "16384") // min buffer size + options.put("block.on.buffer.full", "true") + options.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) + options.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) + val inputSchema = Seq(AttributeReference("value", BinaryType)()) + val data = new Array[Byte](15000) // large value + val writeTask = new KafkaContinuousDataWriter(Some(topic), options.asScala.toMap, inputSchema) + try { + val fieldTypes: Array[DataType] = Array(BinaryType) + val converter = UnsafeProjection.create(fieldTypes) + val row = new SpecificInternalRow(fieldTypes) + row.update(0, data) + val iter = Seq.fill(1000)(converter.apply(row)).iterator + iter.foreach(writeTask.write(_)) + writeTask.commit() + } finally { + writeTask.close() + } + } + + private def createKafkaReader(topic: String): DataFrame = { + spark.read + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("startingOffsets", "earliest") + .option("endingOffsets", "latest") + .option("subscribe", topic) + .load() + } + + private def createKafkaWriter( + input: DataFrame, + withTopic: Option[String] = None, + withOutputMode: Option[OutputMode] = None, + withOptions: Map[String, String] = Map[String, String]()) + (withSelectExpr: String*): StreamingQuery = { + var stream: DataStreamWriter[Row] = null + val checkpointDir = Utils.createTempDir() + var df = input.toDF() + if (withSelectExpr.length > 0) { + df = df.selectExpr(withSelectExpr: _*) + } + stream = df.writeStream + .format("kafka") + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + // We need to reduce blocking time to efficiently test non-existent partition behavior. + .option("kafka.max.block.ms", "1000") + .trigger(Trigger.Continuous(1000)) + .queryName("kafkaStream") + withTopic.foreach(stream.option("topic", _)) + withOutputMode.foreach(stream.outputMode(_)) + withOptions.foreach(opt => stream.option(opt._1, opt._2)) + stream.start() + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala new file mode 100644 index 0000000000000..b3dade414f625 --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import java.util.Properties +import java.util.concurrent.atomic.AtomicInteger + +import org.scalatest.time.SpanSugar._ +import scala.collection.mutable +import scala.util.Random + +import org.apache.spark.SparkContext +import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.streaming.{StreamTest, Trigger} +import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} + +// Run tests in KafkaSourceSuiteBase in continuous execution mode. +class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest + +class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest { + import testImplicits._ + + override val brokerProps = Map("auto.create.topics.enable" -> "false") + + test("subscribing topic by pattern with topic deletions") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-seems" + val topic2 = topicPrefix + "-bad" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Execute { query => + testUtils.deleteTopic(topic) + testUtils.createTopic(topic2, partitions = 5) + eventually(timeout(streamingTimeout)) { + assert( + query.lastExecution.logical.collectFirst { + case DataSourceV2Relation(_, r: KafkaContinuousReader) => r + }.exists { r => + // Ensure the new topic is present and the old topic is gone. 
+ r.knownPartitions.exists(_.topic == topic2) + }, + s"query never reconfigured to new topic $topic2") + } + }, + AddKafkaData(Set(topic2), 4, 5, 6), + CheckAnswer(2, 3, 4, 5, 6, 7) + ) + } +} + +class KafkaContinuousSourceStressForDontFailOnDataLossSuite + extends KafkaSourceStressForDontFailOnDataLossSuite { + override protected def startStream(ds: Dataset[Int]) = { + ds.writeStream + .format("memory") + .queryName("memory") + .start() + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala new file mode 100644 index 0000000000000..e713e6695d2bd --- /dev/null +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import org.apache.spark.SparkContext +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.streaming.Trigger +import org.apache.spark.sql.test.TestSparkSession + +// Trait to configure StreamTest for kafka continuous execution tests. +trait KafkaContinuousTest extends KafkaSourceTest { + override val defaultTrigger = Trigger.Continuous(1000) + override val defaultUseV2Sink = true + + // We need more than the default local[2] to be able to schedule all partitions simultaneously. + override protected def createSparkSession = new TestSparkSession( + new SparkContext( + "local[10]", + "continuous-stream-test-sql-context", + sparkConf.set("spark.sql.testkey", "true"))) + + // In addition to setting the partitions in Kafka, we have to wait until the query has + // reconfigured to the new count so the test framework can hook in properly. 
+ override protected def setTopicPartitions( + topic: String, newCount: Int, query: StreamExecution) = { + testUtils.addPartitions(topic, newCount) + eventually(timeout(streamingTimeout)) { + assert( + query.lastExecution.logical.collectFirst { + case DataSourceV2Relation(_, r: KafkaContinuousReader) => r + }.exists(_.knownPartitions.size == newCount), + s"query never reconfigured to $newCount partitions") + } + } + + test("ensure continuous stream is being used") { + val query = spark.readStream + .format("rate") + .option("numPartitions", "1") + .option("rowsPerSecond", "1") + .load() + + testStream(query)( + Execute(q => assert(q.isInstanceOf[ContinuousExecution])) + ) + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index 2034b9be07f24..d66908f86ccc7 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -34,11 +34,14 @@ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkContext -import org.apache.spark.sql.ForeachWriter +import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2Exec} import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution +import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryWriter import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.kafka010.KafkaSourceProvider._ -import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} +import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest, Trigger} import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} import org.apache.spark.util.Utils @@ -49,9 +52,11 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override val streamingTimeout = 30.seconds + protected val brokerProps = Map[String, Object]() + override def beforeAll(): Unit = { super.beforeAll() - testUtils = new KafkaTestUtils + testUtils = new KafkaTestUtils(brokerProps) testUtils.setup() } @@ -59,18 +64,25 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { if (testUtils != null) { testUtils.teardown() testUtils = null - super.afterAll() } + super.afterAll() } protected def makeSureGetOffsetCalled = AssertOnQuery { q => // Because KafkaSource's initialPartitionOffsets is set lazily, we need to make sure - // its "getOffset" is called before pushing any data. Otherwise, because of the race contion, + // its "getOffset" is called before pushing any data. Otherwise, because of the race condition, // we don't know which data should be fetched when `startingOffsets` is latest. - q.processAllAvailable() + q match { + case c: ContinuousExecution => c.awaitEpoch(0) + case m: MicroBatchExecution => m.processAllAvailable() + } true } + protected def setTopicPartitions(topic: String, newCount: Int, query: StreamExecution) : Unit = { + testUtils.addPartitions(topic, newCount) + } + /** * Add data to Kafka. 
* @@ -82,7 +94,7 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { message: String = "", topicAction: (String, Option[Int]) => Unit = (_, _) => {}) extends AddData { - override def addData(query: Option[StreamExecution]): (Source, Offset) = { + override def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) = { if (query.get.isActive) { // Make sure no Spark job is running when deleting a topic query.get.processAllAvailable() @@ -97,16 +109,18 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { topicAction(existingTopicPartitions._1, Some(existingTopicPartitions._2)) } - // Read all topics again in case some topics are delete. - val allTopics = testUtils.getAllTopicsAndPartitionSize().toMap.keys require( query.nonEmpty, "Cannot add data when there is no query for finding the active kafka source") val sources = query.get.logicalPlan.collect { - case StreamingExecutionRelation(source, _) if source.isInstanceOf[KafkaSource] => - source.asInstanceOf[KafkaSource] - } + case StreamingExecutionRelation(source: KafkaSource, _) => source + } ++ (query.get.lastExecution match { + case null => Seq() + case e => e.logical.collect { + case DataSourceV2Relation(_, reader: KafkaContinuousReader) => reader + } + }) if (sources.isEmpty) { throw new Exception( "Could not find Kafka source in the StreamExecution logical plan to add data to") @@ -137,14 +151,158 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override def toString: String = s"AddKafkaData(topics = $topics, data = $data, message = $message)" } -} + private val topicId = new AtomicInteger(0) + protected def newTopic(): String = s"topic-${topicId.getAndIncrement()}" +} -class KafkaSourceSuite extends KafkaSourceTest { +class KafkaMicroBatchSourceSuite extends KafkaSourceSuiteBase { import testImplicits._ - private val topicId = new AtomicInteger(0) + test("(de)serialization of initial offsets") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", topic) + + testStream(reader.load)( + makeSureGetOffsetCalled, + StopStream, + StartStream(), + StopStream) + } + + test("maxOffsetsPerTrigger") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 3) + testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) + testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) + testUtils.sendMessages(topic, Array("1"), Some(2)) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 10) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) + + val clock = new StreamManualClock + + val waitUntilBatchProcessed = AssertOnQuery { q => + eventually(Timeout(streamingTimeout)) { + if (!q.exception.isDefined) { + assert(clock.isStreamWaitingAt(clock.getTimeMillis())) + } + } + if (q.exception.isDefined) { + throw q.exception.get + } + true + } + + testStream(mapped)( + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // 1 from smallest, 1 from middle, 8 from biggest + CheckAnswer(1, 10, 100, 
101, 102, 103, 104, 105, 106, 107), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 + ), + StopStream, + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 + ), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, + 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 + ) + ) + } + + test("input row metrics") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val kafka = spark + .readStream + .format("kafka") + .option("subscribe", topic) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + + val mapped = kafka.map(kv => kv._2.toInt + 1) + testStream(mapped)( + StartStream(trigger = ProcessingTime(1)), + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 3 + } + ) + } + + test("subscribing topic by pattern with topic deletions") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-seems" + val topic2 = topicPrefix + "-bad" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Assert { + testUtils.deleteTopic(topic) + testUtils.createTopic(topic2, partitions = 5) + true + }, + AddKafkaData(Set(topic2), 4, 5, 6), + CheckAnswer(2, 3, 4, 5, 6, 7) + ) + } testWithUninterruptibleThread( "deserialization of initial offset with Spark 2.1.0") { @@ -237,86 +395,51 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - test("(de)serialization of initial offsets") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 64) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", topic) - - testStream(reader.load)( - makeSureGetOffsetCalled, - StopStream, - StartStream(), - StopStream) - } - - test("maxOffsetsPerTrigger") { + test("KafkaSource with watermark") { + val now = System.currentTimeMillis() val topic = newTopic() - testUtils.createTopic(topic, partitions = 3) - testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) - 
testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) - testUtils.sendMessages(topic, Array("1"), Some(2)) + testUtils.createTopic(newTopic(), partitions = 1) + testUtils.sendMessages(topic, Array(1).map(_.toString)) - val reader = spark + val kafka = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") - .option("maxOffsetsPerTrigger", 10) + .option("startingOffsets", s"earliest") .option("subscribe", topic) - .option("startingOffsets", "earliest") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) - - val clock = new StreamManualClock + .load() - val waitUntilBatchProcessed = AssertOnQuery { q => - eventually(Timeout(streamingTimeout)) { - if (!q.exception.isDefined) { - assert(clock.isStreamWaitingAt(clock.getTimeMillis())) - } - } - if (q.exception.isDefined) { - throw q.exception.get - } - true - } + val windowedAggregation = kafka + .withWatermark("timestamp", "10 seconds") + .groupBy(window($"timestamp", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start") as 'window, $"count") - testStream(mapped)( - StartStream(ProcessingTime(100), clock), - waitUntilBatchProcessed, - // 1 from smallest, 1 from middle, 8 from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), - AdvanceManualClock(100), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 - ), - StopStream, - StartStream(ProcessingTime(100), clock), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 - ), - AdvanceManualClock(100), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, - 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 - ) - ) + val query = windowedAggregation + .writeStream + .format("memory") + .outputMode("complete") + .queryName("kafkaWatermark") + .start() + query.processAllAvailable() + val rows = spark.table("kafkaWatermark").collect() + assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + val row = rows(0) + // We cannot check the exact window start time as it depands on the time that messages were + // inserted by the producer. So here we just use a low bound to make sure the internal + // conversion works. 
+ assert( + row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, + s"Unexpected results: $row") + assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") + query.stop() } +} + +class KafkaSourceSuiteBase extends KafkaSourceTest { + + import testImplicits._ test("cannot stop Kafka stream") { val topic = newTopic() @@ -328,7 +451,7 @@ class KafkaSourceSuite extends KafkaSourceTest { .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"topic-.*") + .option("subscribePattern", s"$topic.*") val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") @@ -422,65 +545,6 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - test("subscribing topic by pattern with topic deletions") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-seems" - val topic2 = topicPrefix + "-bad" - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"$topicPrefix-.*") - .option("failOnDataLoss", "false") - - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped = kafka.map(kv => kv._2.toInt + 1) - - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - Assert { - testUtils.deleteTopic(topic) - testUtils.createTopic(topic2, partitions = 5) - true - }, - AddKafkaData(Set(topic2), 4, 5, 6), - CheckAnswer(2, 3, 4, 5, 6, 7) - ) - } - - test("starting offset is latest by default") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("0")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", topic) - - val kafka = reader.load() - .selectExpr("CAST(value AS STRING)") - .as[String] - val mapped = kafka.map(_.toInt) - - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(1, 2, 3) // should not have 0 - ) - } - test("bad source options") { def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { val ex = intercept[IllegalArgumentException] { @@ -540,34 +604,6 @@ class KafkaSourceSuite extends KafkaSourceTest { testUnsupportedConfig("kafka.auto.offset.reset", "latest") } - test("input row metrics") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val kafka = spark - .readStream - .format("kafka") - .option("subscribe", topic) - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - - val mapped = kafka.map(kv => kv._2.toInt + 1) - testStream(mapped)( - StartStream(trigger = ProcessingTime(1)), - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 3 - } - ) - } - test("delete a topic when a Spark job is running") { 
KafkaSourceSuite.collectedData.clear() @@ -629,8 +665,6 @@ class KafkaSourceSuite extends KafkaSourceTest { } } - private def newTopic(): String = s"topic-${topicId.getAndIncrement()}" - private def assignString(topic: String, partitions: Iterable[Int]): String = { JsonUtils.partitions(partitions.map(p => new TopicPartition(topic, p))) } @@ -676,6 +710,10 @@ class KafkaSourceSuite extends KafkaSourceTest { testStream(mapped)( makeSureGetOffsetCalled, + Execute { q => + // wait to reach the last offset in every partition + q.awaitOffset(0, KafkaSourceOffset(partitionOffsets.mapValues(_ => 3L))) + }, CheckAnswer(-20, -21, -22, 0, 1, 2, 11, 12, 22), StopStream, StartStream(), @@ -706,6 +744,7 @@ class KafkaSourceSuite extends KafkaSourceTest { .format("memory") .outputMode("append") .queryName("kafkaColumnTypes") + .trigger(defaultTrigger) .start() query.processAllAvailable() val rows = spark.table("kafkaColumnTypes").collect() @@ -723,47 +762,6 @@ class KafkaSourceSuite extends KafkaSourceTest { query.stop() } - test("KafkaSource with watermark") { - val now = System.currentTimeMillis() - val topic = newTopic() - testUtils.createTopic(newTopic(), partitions = 1) - testUtils.sendMessages(topic, Array(1).map(_.toString)) - - val kafka = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("startingOffsets", s"earliest") - .option("subscribe", topic) - .load() - - val windowedAggregation = kafka - .withWatermark("timestamp", "10 seconds") - .groupBy(window($"timestamp", "5 seconds") as 'window) - .agg(count("*") as 'count) - .select($"window".getField("start") as 'window, $"count") - - val query = windowedAggregation - .writeStream - .format("memory") - .outputMode("complete") - .queryName("kafkaWatermark") - .start() - query.processAllAvailable() - val rows = spark.table("kafkaWatermark").collect() - assert(rows.length === 1, s"Unexpected results: ${rows.toList}") - val row = rows(0) - // We cannot check the exact window start time as it depands on the time that messages were - // inserted by the producer. So here we just use a low bound to make sure the internal - // conversion works. 
- assert( - row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, - s"Unexpected results: $row") - assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") - query.stop() - } - private def testFromLatestOffsets( topic: String, addPartitions: Boolean, @@ -800,9 +798,7 @@ class KafkaSourceSuite extends KafkaSourceTest { AddKafkaData(Set(topic), 7, 8), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) { - testUtils.addPartitions(topic, 10) - } + if (addPartitions) setTopicPartitions(topic, 10, query) true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -843,9 +839,7 @@ class KafkaSourceSuite extends KafkaSourceTest { StartStream(), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) { - testUtils.addPartitions(topic, 10) - } + if (addPartitions) setTopicPartitions(topic, 10, query) true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -977,20 +971,8 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared } } - test("stress test for failOnDataLoss=false") { - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", "failOnDataLoss.*") - .option("startingOffsets", "earliest") - .option("failOnDataLoss", "false") - .option("fetchOffset.retryIntervalMs", "3000") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { + protected def startStream(ds: Dataset[Int]) = { + ds.writeStream.foreach(new ForeachWriter[Int] { override def open(partitionId: Long, version: Long): Boolean = { true @@ -1004,6 +986,22 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared override def close(errorOrNull: Throwable): Unit = { } }).start() + } + + test("stress test for failOnDataLoss=false") { + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", "failOnDataLoss.*") + .option("startingOffsets", "earliest") + .option("failOnDataLoss", "false") + .option("fetchOffset.retryIntervalMs", "3000") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val query = startStream(kafka.map(kv => kv._2.toInt)) val testTime = 1.minutes val startTime = System.currentTimeMillis() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index e8d683a578f35..b714a46b5f786 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -191,6 +191,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { ds = ds.asInstanceOf[DataSourceV2], conf = sparkSession.sessionState.conf)).asJava) + // Streaming also uses the data source V2 API. So it may be that the data source implements + // v2, but has no v2 implementation for batch reads. In that case, we fall back to loading + // the dataframe as a v1 source. 
val reader = (ds, userSpecifiedSchema) match { case (ds: ReadSupportWithSchema, Some(schema)) => ds.createReader(schema, options) @@ -208,23 +211,30 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } reader - case _ => - throw new AnalysisException(s"$cls does not support data reading.") + case _ => null // fall back to v1 } - Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) + if (reader == null) { + loadV1Source(paths: _*) + } else { + Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) + } } else { - // Code path for data source v1. - sparkSession.baseRelationToDataFrame( - DataSource.apply( - sparkSession, - paths = paths, - userSpecifiedSchema = userSpecifiedSchema, - className = source, - options = extraOptions.toMap).resolveRelation()) + loadV1Source(paths: _*) } } + private def loadV1Source(paths: String*) = { + // Code path for data source v1. + sparkSession.baseRelationToDataFrame( + DataSource.apply( + sparkSession, + paths = paths, + userSpecifiedSchema = userSpecifiedSchema, + className = source, + options = extraOptions.toMap).resolveRelation()) + } + /** * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table and connection properties. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 3304f368e1050..97f12ff625c42 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -255,17 +255,24 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } } - case _ => throw new AnalysisException(s"$cls does not support data writing.") + // Streaming also uses the data source V2 API. So it may be that the data source implements + // v2, but has no v2 implementation for batch writes. In that case, we fall back to saving + // as though it's a V1 source. + case _ => saveToV1Source() } } else { - // Code path for data source v1. - runCommand(df.sparkSession, "save") { - DataSource( - sparkSession = df.sparkSession, - className = source, - partitionColumns = partitioningColumns.getOrElse(Nil), - options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) - } + saveToV1Source() + } + } + + private def saveToV1Source(): Unit = { + // Code path for data source v1. 
+ runCommand(df.sparkSession, "save") { + DataSource( + sparkSession = df.sparkSession, + className = source, + partitionColumns = partitioningColumns.getOrElse(Nil), + options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index f0bdf84bb7a84..a4a857f2d4d9b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -81,9 +81,11 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) (index, message: WriterCommitMessage) => messages(index) = message ) - logInfo(s"Data source writer $writer is committing.") - writer.commit(messages) - logInfo(s"Data source writer $writer committed.") + if (!writer.isInstanceOf[ContinuousWriter]) { + logInfo(s"Data source writer $writer is committing.") + writer.commit(messages) + logInfo(s"Data source writer $writer committed.") + } } catch { case _: InterruptedException if writer.isInstanceOf[ContinuousWriter] => // Interruption is how continuous queries are ended, so accept and ignore the exception. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index 24a8b000df0c1..cf27e1a70650a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -142,7 +142,8 @@ abstract class StreamExecution( override val id: UUID = UUID.fromString(streamMetadata.id) - override val runId: UUID = UUID.randomUUID + override def runId: UUID = currentRunId + protected var currentRunId = UUID.randomUUID /** * Pretty identified string of printing in logs. Format is @@ -418,11 +419,17 @@ abstract class StreamExecution( * Blocks the current thread until processing for data from the given `source` has reached at * least the given `Offset`. This method is intended for use primarily when writing tests. */ - private[sql] def awaitOffset(source: BaseStreamingSource, newOffset: Offset): Unit = { + private[sql] def awaitOffset(sourceIndex: Int, newOffset: Offset): Unit = { assertAwaitThread() def notDone = { val localCommittedOffsets = committedOffsets - !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset + if (sources == null) { + // sources might not be initialized yet + false + } else { + val source = sources(sourceIndex) + !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset + } } while (notDone) { @@ -436,7 +443,7 @@ abstract class StreamExecution( awaitProgressLock.unlock() } } - logDebug(s"Unblocked at $newOffset for $source") + logDebug(s"Unblocked at $newOffset for ${sources(sourceIndex)}") } /** A flag to indicate that a batch has completed with no new data available. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index d79e4bd65f563..e700aa4f9aea7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -77,7 +77,6 @@ class ContinuousDataSourceRDD( dataReaderThread.start() context.addTaskCompletionListener(_ => { - reader.close() dataReaderThread.interrupt() epochPollExecutor.shutdown() }) @@ -201,6 +200,8 @@ class DataReaderThread( failedFlag.set(true) // Don't rethrow the exception in this thread. It's not needed, and the default Spark // exception handler will kill the executor. + } finally { + reader.close() } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 9657b5e26d770..667410ef9f1c6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -17,7 +17,9 @@ package org.apache.spark.sql.execution.streaming.continuous +import java.util.UUID import java.util.concurrent.TimeUnit +import java.util.function.UnaryOperator import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} @@ -52,7 +54,7 @@ class ContinuousExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, trigger, triggerClock, outputMode, deleteCheckpointOnStop) { - @volatile protected var continuousSources: Seq[ContinuousReader] = Seq.empty + @volatile protected var continuousSources: Seq[ContinuousReader] = _ override protected def sources: Seq[BaseStreamingSource] = continuousSources override lazy val logicalPlan: LogicalPlan = { @@ -78,15 +80,17 @@ class ContinuousExecution( } override protected def runActivatedStream(sparkSessionForStream: SparkSession): Unit = { - do { - try { - runContinuous(sparkSessionForStream) - } catch { - case _: InterruptedException if state.get().equals(RECONFIGURING) => - // swallow exception and run again - state.set(ACTIVE) + val stateUpdate = new UnaryOperator[State] { + override def apply(s: State) = s match { + // If we ended the query to reconfigure, reset the state to active. + case RECONFIGURING => ACTIVE + case _ => s } - } while (state.get() == ACTIVE) + } + + do { + runContinuous(sparkSessionForStream) + } while (state.updateAndGet(stateUpdate) == ACTIVE) } /** @@ -120,12 +124,16 @@ class ContinuousExecution( } committedOffsets = nextOffsets.toStreamProgress(sources) - // Forcibly align commit and offset logs by slicing off any spurious offset logs from - // a previous run. We can't allow commits to an epoch that a previous run reached but - // this run has not. - offsetLog.purgeAfter(latestEpochId) + // Get to an epoch ID that has definitely never been sent to a sink before. Since sink + // commit happens between offset log write and commit log write, this means an epoch ID + // which is not in the offset log. + val (latestOffsetEpoch, _) = offsetLog.getLatest().getOrElse { + throw new IllegalStateException( + s"Offset log had no latest element. 
This shouldn't be possible because nextOffsets is" + + s"an element.") + } + currentBatchId = latestOffsetEpoch + 1 - currentBatchId = latestEpochId + 1 logDebug(s"Resuming at epoch $currentBatchId with committed offsets $committedOffsets") nextOffsets case None => @@ -141,6 +149,7 @@ class ContinuousExecution( * @param sparkSessionForQuery Isolated [[SparkSession]] to run the continuous query with. */ private def runContinuous(sparkSessionForQuery: SparkSession): Unit = { + currentRunId = UUID.randomUUID // A list of attributes that will need to be updated. val replacements = new ArrayBuffer[(Attribute, Attribute)] // Translate from continuous relation to the underlying data source. @@ -225,13 +234,11 @@ class ContinuousExecution( triggerExecutor.execute(() => { startTrigger() - if (reader.needsReconfiguration()) { - state.set(RECONFIGURING) + if (reader.needsReconfiguration() && state.compareAndSet(ACTIVE, RECONFIGURING)) { stopSources() if (queryExecutionThread.isAlive) { sparkSession.sparkContext.cancelJobGroup(runId.toString) queryExecutionThread.interrupt() - // No need to join - this thread is about to end anyway. } false } else if (isActive) { @@ -259,6 +266,7 @@ class ContinuousExecution( sparkSessionForQuery, lastExecution)(lastExecution.toRdd) } } finally { + epochEndpoint.askSync[Unit](StopContinuousExecutionWrites) SparkEnv.get.rpcEnv.stop(epochEndpoint) epochUpdateThread.interrupt() @@ -273,17 +281,22 @@ class ContinuousExecution( epoch: Long, reader: ContinuousReader, partitionOffsets: Seq[PartitionOffset]): Unit = { assert(continuousSources.length == 1, "only one continuous source supported currently") - if (partitionOffsets.contains(null)) { - // If any offset is null, that means the corresponding partition hasn't seen any data yet, so - // there's nothing meaningful to add to the offset log. - } val globalOffset = reader.mergeOffsets(partitionOffsets.toArray) - synchronized { - if (queryExecutionThread.isAlive) { - offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) - } else { - return - } + val oldOffset = synchronized { + offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) + offsetLog.get(epoch - 1) + } + + // If offset hasn't changed since last epoch, there's been no new data. + if (oldOffset.contains(OffsetSeq.fill(globalOffset))) { + noNewData = true + } + + awaitProgressLock.lock() + try { + awaitProgressLockCondition.signalAll() + } finally { + awaitProgressLock.unlock() } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index 98017c3ac6a33..40dcbecade814 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -39,6 +39,15 @@ private[continuous] sealed trait EpochCoordinatorMessage extends Serializable */ private[sql] case object IncrementAndGetEpoch extends EpochCoordinatorMessage +/** + * The RpcEndpoint stop() will wait to clear out the message queue before terminating the + * object. This can lead to a race condition where the query restarts at epoch n, a new + * EpochCoordinator starts at epoch n, and then the old epoch coordinator commits epoch n + 1. + * The framework doesn't provide a handle to wait on the message queue, so we use a synchronous + * message to stop any writes to the ContinuousExecution object. 
+ */ +private[sql] case object StopContinuousExecutionWrites extends EpochCoordinatorMessage + // Init messages /** * Set the reader and writer partition counts. Tasks may not be started until the coordinator @@ -116,6 +125,8 @@ private[continuous] class EpochCoordinator( override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { + private var queryWritesStopped: Boolean = false + private var numReaderPartitions: Int = _ private var numWriterPartitions: Int = _ @@ -147,12 +158,16 @@ private[continuous] class EpochCoordinator( partitionCommits.remove(k) } for (k <- partitionOffsets.keys.filter { case (e, _) => e < epoch }) { - partitionCommits.remove(k) + partitionOffsets.remove(k) } } } override def receive: PartialFunction[Any, Unit] = { + // If we just drop these messages, we won't do any writes to the query. The lame duck tasks + // won't shed errors or anything. + case _ if queryWritesStopped => () + case CommitPartitionEpoch(partitionId, epoch, message) => logDebug(s"Got commit from partition $partitionId at epoch $epoch: $message") if (!partitionCommits.isDefinedAt((epoch, partitionId))) { @@ -188,5 +203,9 @@ private[continuous] class EpochCoordinator( case SetWriterPartitions(numPartitions) => numWriterPartitions = numPartitions context.reply(()) + + case StopContinuousExecutionWrites => + queryWritesStopped = true + context.reply(()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index db588ae282f38..b5b4a05ab4973 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} +import org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. 
file systems, @@ -279,18 +280,29 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { useTempCheckpointLocation = true, trigger = trigger) } else { - val dataSource = - DataSource( - df.sparkSession, - className = source, - options = extraOptions.toMap, - partitionColumns = normalizedParCols.getOrElse(Nil)) + val sink = trigger match { + case _: ContinuousTrigger => + val ds = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) + ds.newInstance() match { + case w: ContinuousWriteSupport => w + case _ => throw new AnalysisException( + s"Data source $source does not support continuous writing") + } + case _ => + val ds = DataSource( + df.sparkSession, + className = source, + options = extraOptions.toMap, + partitionColumns = normalizedParCols.getOrElse(Nil)) + ds.createSink(outputMode) + } + df.sparkSession.sessionState.streamingQueryManager.startQuery( extraOptions.get("queryName"), extraOptions.get("checkpointLocation"), df, extraOptions.toMap, - dataSource.createSink(outputMode), + sink, outputMode, useTempCheckpointLocation = source == "console", recoverFromCheckpointLocation = true, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index d46461fa9bf6d..0762895fdc620 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -38,8 +38,9 @@ import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row} import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ +import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, EpochCoordinatorRef, IncrementAndGetEpoch} +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger, EpochCoordinatorRef, IncrementAndGetEpoch} import org.apache.spark.sql.execution.streaming.sources.MemorySinkV2 import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.streaming.StreamingQueryListener._ @@ -80,6 +81,9 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be StateStore.stop() // stop the state store maintenance thread and unload store providers } + protected val defaultTrigger = Trigger.ProcessingTime(0) + protected val defaultUseV2Sink = false + /** How long to wait for an active stream to catch up when checking a result. */ val streamingTimeout = 10.seconds @@ -189,7 +193,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be /** Starts the stream, resuming if data has already been processed. It must not be running. 
*/ case class StartStream( - trigger: Trigger = Trigger.ProcessingTime(0), + trigger: Trigger = defaultTrigger, triggerClock: Clock = new SystemClock, additionalConfs: Map[String, String] = Map.empty, checkpointLocation: String = null) @@ -276,7 +280,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def testStream( _stream: Dataset[_], outputMode: OutputMode = OutputMode.Append, - useV2Sink: Boolean = false)(actions: StreamAction*): Unit = synchronized { + useV2Sink: Boolean = defaultUseV2Sink)(actions: StreamAction*): Unit = synchronized { import org.apache.spark.sql.streaming.util.StreamManualClock // `synchronized` is added to prevent the user from calling multiple `testStream`s concurrently @@ -403,18 +407,11 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def fetchStreamAnswer(currentStream: StreamExecution, lastOnly: Boolean) = { verify(currentStream != null, "stream not running") - // Get the map of source index to the current source objects - val indexToSource = currentStream - .logicalPlan - .collect { case StreamingExecutionRelation(s, _) => s } - .zipWithIndex - .map(_.swap) - .toMap // Block until all data added has been processed for all the source awaiting.foreach { case (sourceIndex, offset) => failAfter(streamingTimeout) { - currentStream.awaitOffset(indexToSource(sourceIndex), offset) + currentStream.awaitOffset(sourceIndex, offset) } } @@ -473,6 +470,12 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be // after starting the query. try { currentStream.awaitInitialization(streamingTimeout.toMillis) + currentStream match { + case s: ContinuousExecution => eventually("IncrementalExecution was not created") { + s.lastExecution.executedPlan // will fail if lastExecution is null + } + case _ => + } } catch { case _: StreamingQueryException => // Ignore the exception. `StopStream` or `ExpectFailure` will catch it as well. @@ -600,7 +603,10 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def findSourceIndex(plan: LogicalPlan): Option[Int] = { plan - .collect { case StreamingExecutionRelation(s, _) => s } + .collect { + case StreamingExecutionRelation(s, _) => s + case DataSourceV2Relation(_, r) => r + } .zipWithIndex .find(_._1 == source) .map(_._2) @@ -613,9 +619,13 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be findSourceIndex(query.logicalPlan) }.orElse { findSourceIndex(stream.logicalPlan) + }.orElse { + queryToUse.flatMap { q => + findSourceIndex(q.lastExecution.logical) + } }.getOrElse { throw new IllegalArgumentException( - "Could find index of the source to which data was added") + "Could not find index of the source to which data was added") } // Store the expected offset of added data to wait for it later From 186bf8fb2e9ff8a80f3f6bcb5f2a0327fa79a1c9 Mon Sep 17 00:00:00 2001 From: Bago Amirbekian Date: Thu, 11 Jan 2018 13:57:15 -0800 Subject: [PATCH 317/356] [SPARK-23046][ML][SPARKR] Have RFormula include VectorSizeHint in pipeline ## What changes were proposed in this pull request? Including VectorSizeHint in RFormula pipelines will allow them to be applied to streaming dataframes. ## How was this patch tested? Unit tests. Author: Bago Amirbekian Closes #20238 from MrBago/rFormulaVectorSize.
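For example, this is the kind of usage the change enables, sketched roughly (the column names and paths below are illustrative only and are not part of this patch; an active SparkSession `spark` is assumed):

```scala
// Hypothetical usage sketch, not code from this patch.
import org.apache.spark.ml.feature.RFormula

// Static training data with a numeric column "a" and a vector column "b".
val training = spark.read.parquet("/tmp/training")
val model = new RFormula().setFormula("label ~ a + b").fit(training)

// Because the fitted pipeline now carries a VectorSizeHint for the vector column,
// the same model can be applied to a streaming DataFrame, where vector sizes are
// not known up front.
val stream = spark.readStream.schema(training.schema).parquet("/tmp/incoming")
val scored = model.transform(stream)
```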
--- R/pkg/R/mllib_utils.R | 1 + .../apache/spark/ml/feature/RFormula.scala | 18 +++++++-- .../spark/ml/feature/RFormulaSuite.scala | 37 ++++++++++++++++--- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/R/pkg/R/mllib_utils.R b/R/pkg/R/mllib_utils.R index a53c92c2c4815..23dda42c325be 100644 --- a/R/pkg/R/mllib_utils.R +++ b/R/pkg/R/mllib_utils.R @@ -130,3 +130,4 @@ read.ml <- function(path) { stop("Unsupported model: ", jobj) } } + diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala index 7da3339f8b487..f384ffbf578bc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala @@ -25,7 +25,7 @@ import org.apache.hadoop.fs.Path import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.ml.attribute.AttributeGroup -import org.apache.spark.ml.linalg.VectorUDT +import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasHandleInvalid, HasLabelCol} import org.apache.spark.ml.util._ @@ -210,8 +210,8 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) // First we index each string column referenced by the input terms. val indexed: Map[String, String] = resolvedFormula.terms.flatten.distinct.map { term => - dataset.schema(term) match { - case column if column.dataType == StringType => + dataset.schema(term).dataType match { + case _: StringType => val indexCol = tmpColumn("stridx") encoderStages += new StringIndexer() .setInputCol(term) @@ -220,6 +220,18 @@ class RFormula @Since("1.5.0") (@Since("1.5.0") override val uid: String) .setHandleInvalid($(handleInvalid)) prefixesToRewrite(indexCol + "_") = term + "_" (term, indexCol) + case _: VectorUDT => + val group = AttributeGroup.fromStructField(dataset.schema(term)) + val size = if (group.size < 0) { + dataset.select(term).first().getAs[Vector](0).size + } else { + group.size + } + encoderStages += new VectorSizeHint(uid) + .setHandleInvalid("optimistic") + .setInputCol(term) + .setSize(size) + (term, term) case _ => (term, term) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala index 5d09c90ec6dfa..f3f4b5a3d0233 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/RFormulaSuite.scala @@ -17,15 +17,15 @@ package org.apache.spark.ml.feature -import org.apache.spark.{SparkException, SparkFunSuite} +import org.apache.spark.SparkException import org.apache.spark.ml.attribute._ -import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite -import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} -import org.apache.spark.mllib.util.MLlibTestSparkContext +import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} +import org.apache.spark.sql.{DataFrame, Encoder, Row} import org.apache.spark.sql.types.DoubleType -class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { +class RFormulaSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ 
@@ -548,4 +548,31 @@ class RFormulaSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul assert(result3.collect() === expected3.collect()) assert(result4.collect() === expected4.collect()) } + + test("Use Vectors as inputs to formula.") { + val original = Seq( + (1, 4, Vectors.dense(0.0, 0.0, 4.0)), + (2, 4, Vectors.dense(1.0, 0.0, 4.0)), + (3, 5, Vectors.dense(1.0, 0.0, 5.0)), + (4, 5, Vectors.dense(0.0, 1.0, 5.0)) + ).toDF("id", "a", "b") + val formula = new RFormula().setFormula("id ~ a + b") + val (first +: rest) = Seq("id", "a", "b", "features", "label") + testTransformer[(Int, Int, Vector)](original, formula.fit(original), first, rest: _*) { + case Row(id: Int, a: Int, b: Vector, features: Vector, label: Double) => + assert(label === id) + assert(features.toArray === a +: b.toArray) + } + + val group = new AttributeGroup("b", 3) + val vectorColWithMetadata = original("b").as("b", group.toMetadata()) + val dfWithMetadata = original.withColumn("b", vectorColWithMetadata) + val model = formula.fit(dfWithMetadata) + // model should work even when applied to dataframe without metadata. + testTransformer[(Int, Int, Vector)](original, model, first, rest: _*) { + case Row(id: Int, a: Int, b: Vector, features: Vector, label: Double) => + assert(label === id) + assert(features.toArray === a +: b.toArray) + } + } } From b5042d75c2faa5f15bc1e160d75f06dfdd6eea37 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Thu, 11 Jan 2018 16:20:30 -0800 Subject: [PATCH 318/356] [SPARK-23008][ML] OnehotEncoderEstimator python API ## What changes were proposed in this pull request? OnehotEncoderEstimator python API. ## How was this patch tested? doctest Author: WeichenXu Closes #20209 from WeichenXu123/ohe_py. --- python/pyspark/ml/feature.py | 113 ++++++++++++++++++ .../ml/param/_shared_params_code_gen.py | 1 + python/pyspark/ml/param/shared.py | 23 ++++ 3 files changed, 137 insertions(+) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 13bf95cce40be..b963e45dd7cff 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -45,6 +45,7 @@ 'NGram', 'Normalizer', 'OneHotEncoder', + 'OneHotEncoderEstimator', 'OneHotEncoderModel', 'PCA', 'PCAModel', 'PolynomialExpansion', 'QuantileDiscretizer', @@ -1641,6 +1642,118 @@ def getDropLast(self): return self.getOrDefault(self.dropLast) +@inherit_doc +class OneHotEncoderEstimator(JavaEstimator, HasInputCols, HasOutputCols, HasHandleInvalid, + JavaMLReadable, JavaMLWritable): + """ + A one-hot encoder that maps a column of category indices to a column of binary vectors, with + at most a single one-value per row that indicates the input category index. + For example with 5 categories, an input value of 2.0 would map to an output vector of + `[0.0, 0.0, 1.0, 0.0]`. + The last category is not included by default (configurable via `dropLast`), + because it makes the vector entries sum up to one, and hence linearly dependent. + So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`. + + Note: This is different from scikit-learn's OneHotEncoder, which keeps all categories. + The output vectors are sparse. + + When `handleInvalid` is configured to 'keep', an extra "category" indicating invalid values is + added as last category. So when `dropLast` is true, invalid values are encoded as all-zeros + vector. + + Note: When encoding multi-column by using `inputCols` and `outputCols` params, input/output + cols come in pairs, specified by the order in the arrays, and each pair is treated + independently. 
+ + See `StringIndexer` for converting categorical values into category indices + + >>> from pyspark.ml.linalg import Vectors + >>> df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"]) + >>> ohe = OneHotEncoderEstimator(inputCols=["input"], outputCols=["output"]) + >>> model = ohe.fit(df) + >>> model.transform(df).head().output + SparseVector(2, {0: 1.0}) + >>> ohePath = temp_path + "/oheEstimator" + >>> ohe.save(ohePath) + >>> loadedOHE = OneHotEncoderEstimator.load(ohePath) + >>> loadedOHE.getInputCols() == ohe.getInputCols() + True + >>> modelPath = temp_path + "/ohe-model" + >>> model.save(modelPath) + >>> loadedModel = OneHotEncoderModel.load(modelPath) + >>> loadedModel.categorySizes == model.categorySizes + True + + .. versionadded:: 2.3.0 + """ + + handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle invalid data during " + + "transform(). Options are 'keep' (invalid data presented as an extra " + + "categorical feature) or error (throw an error). Note that this Param " + + "is only used during transform; during fitting, invalid data will " + + "result in an error.", + typeConverter=TypeConverters.toString) + + dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last category", + typeConverter=TypeConverters.toBoolean) + + @keyword_only + def __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True): + """ + __init__(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True) + """ + super(OneHotEncoderEstimator, self).__init__() + self._java_obj = self._new_java_obj( + "org.apache.spark.ml.feature.OneHotEncoderEstimator", self.uid) + self._setDefault(handleInvalid="error", dropLast=True) + kwargs = self._input_kwargs + self.setParams(**kwargs) + + @keyword_only + @since("2.3.0") + def setParams(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True): + """ + setParams(self, inputCols=None, outputCols=None, handleInvalid="error", dropLast=True) + Sets params for this OneHotEncoderEstimator. + """ + kwargs = self._input_kwargs + return self._set(**kwargs) + + @since("2.3.0") + def setDropLast(self, value): + """ + Sets the value of :py:attr:`dropLast`. + """ + return self._set(dropLast=value) + + @since("2.3.0") + def getDropLast(self): + """ + Gets the value of dropLast or its default value. + """ + return self.getOrDefault(self.dropLast) + + def _create_model(self, java_model): + return OneHotEncoderModel(java_model) + + +class OneHotEncoderModel(JavaModel, JavaMLReadable, JavaMLWritable): + """ + Model fitted by :py:class:`OneHotEncoderEstimator`. + + .. versionadded:: 2.3.0 + """ + + @property + @since("2.3.0") + def categorySizes(self): + """ + Original number of categories for each feature being encoded. + The array contains one value for each input column, in order. 
+ """ + return self._call_java("categorySizes") + + @inherit_doc class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 1d0f60acc6983..db951d81de1e7 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -119,6 +119,7 @@ def get$Name(self): ("inputCol", "input column name.", None, "TypeConverters.toString"), ("inputCols", "input column names.", None, "TypeConverters.toListString"), ("outputCol", "output column name.", "self.uid + '__output'", "TypeConverters.toString"), + ("outputCols", "output column names.", None, "TypeConverters.toListString"), ("numFeatures", "number of features.", None, "TypeConverters.toInt"), ("checkpointInterval", "set checkpoint interval (>= 1) or disable checkpoint (-1). " + "E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: " + diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 813f7a59f3fd1..474c38764e5a1 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -256,6 +256,29 @@ def getOutputCol(self): return self.getOrDefault(self.outputCol) +class HasOutputCols(Params): + """ + Mixin for param outputCols: output column names. + """ + + outputCols = Param(Params._dummy(), "outputCols", "output column names.", typeConverter=TypeConverters.toListString) + + def __init__(self): + super(HasOutputCols, self).__init__() + + def setOutputCols(self, value): + """ + Sets the value of :py:attr:`outputCols`. + """ + return self._set(outputCols=value) + + def getOutputCols(self): + """ + Gets the value of outputCols or its default value. + """ + return self.getOrDefault(self.outputCols) + + class HasNumFeatures(Params): """ Mixin for param numFeatures: number of features. From cbe7c6fbf9dc2fc422b93b3644c40d449a869eea Mon Sep 17 00:00:00 2001 From: ho3rexqj Date: Fri, 12 Jan 2018 15:27:00 +0800 Subject: [PATCH 319/356] [SPARK-22986][CORE] Use a cache to avoid instantiating multiple instances of broadcast variable values When resources happen to be constrained on an executor the first time a broadcast variable is instantiated it is persisted to disk by the BlockManager. Consequently, every subsequent call to TorrentBroadcast::readBroadcastBlock from other instances of that broadcast variable spawns another instance of the underlying value. That is, broadcast variables are spawned once per executor **unless** memory is constrained, in which case every instance of a broadcast variable is provided with a unique copy of the underlying value. This patch fixes the above by explicitly caching the underlying values using weak references in a ReferenceMap. Author: ho3rexqj Closes #20183 from ho3rexqj/fix/cache-broadcast-values. 
--- .../spark/broadcast/BroadcastManager.scala | 6 ++ .../spark/broadcast/TorrentBroadcast.scala | 72 +++++++++++-------- .../spark/broadcast/BroadcastSuite.scala | 34 +++++++++ 3 files changed, 83 insertions(+), 29 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala index e88988fe03b2e..8d7a4a353a792 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/BroadcastManager.scala @@ -21,6 +21,8 @@ import java.util.concurrent.atomic.AtomicLong import scala.reflect.ClassTag +import org.apache.commons.collections.map.{AbstractReferenceMap, ReferenceMap} + import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.internal.Logging @@ -52,6 +54,10 @@ private[spark] class BroadcastManager( private val nextBroadcastId = new AtomicLong(0) + private[broadcast] val cachedValues = { + new ReferenceMap(AbstractReferenceMap.HARD, AbstractReferenceMap.WEAK) + } + def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean): Broadcast[T] = { broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement()) } diff --git a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala index 7aecd3c9668ea..e125095cf4777 100644 --- a/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala +++ b/core/src/main/scala/org/apache/spark/broadcast/TorrentBroadcast.scala @@ -206,36 +206,50 @@ private[spark] class TorrentBroadcast[T: ClassTag](obj: T, id: Long) private def readBroadcastBlock(): T = Utils.tryOrIOException { TorrentBroadcast.synchronized { - setConf(SparkEnv.get.conf) - val blockManager = SparkEnv.get.blockManager - blockManager.getLocalValues(broadcastId) match { - case Some(blockResult) => - if (blockResult.data.hasNext) { - val x = blockResult.data.next().asInstanceOf[T] - releaseLock(broadcastId) - x - } else { - throw new SparkException(s"Failed to get locally stored broadcast data: $broadcastId") - } - case None => - logInfo("Started reading broadcast variable " + id) - val startTimeMs = System.currentTimeMillis() - val blocks = readBlocks() - logInfo("Reading broadcast variable " + id + " took" + Utils.getUsedTimeMs(startTimeMs)) - - try { - val obj = TorrentBroadcast.unBlockifyObject[T]( - blocks.map(_.toInputStream()), SparkEnv.get.serializer, compressionCodec) - // Store the merged copy in BlockManager so other tasks on this executor don't - // need to re-fetch it. 
- val storageLevel = StorageLevel.MEMORY_AND_DISK - if (!blockManager.putSingle(broadcastId, obj, storageLevel, tellMaster = false)) { - throw new SparkException(s"Failed to store $broadcastId in BlockManager") + val broadcastCache = SparkEnv.get.broadcastManager.cachedValues + + Option(broadcastCache.get(broadcastId)).map(_.asInstanceOf[T]).getOrElse { + setConf(SparkEnv.get.conf) + val blockManager = SparkEnv.get.blockManager + blockManager.getLocalValues(broadcastId) match { + case Some(blockResult) => + if (blockResult.data.hasNext) { + val x = blockResult.data.next().asInstanceOf[T] + releaseLock(broadcastId) + + if (x != null) { + broadcastCache.put(broadcastId, x) + } + + x + } else { + throw new SparkException(s"Failed to get locally stored broadcast data: $broadcastId") } - obj - } finally { - blocks.foreach(_.dispose()) - } + case None => + logInfo("Started reading broadcast variable " + id) + val startTimeMs = System.currentTimeMillis() + val blocks = readBlocks() + logInfo("Reading broadcast variable " + id + " took" + Utils.getUsedTimeMs(startTimeMs)) + + try { + val obj = TorrentBroadcast.unBlockifyObject[T]( + blocks.map(_.toInputStream()), SparkEnv.get.serializer, compressionCodec) + // Store the merged copy in BlockManager so other tasks on this executor don't + // need to re-fetch it. + val storageLevel = StorageLevel.MEMORY_AND_DISK + if (!blockManager.putSingle(broadcastId, obj, storageLevel, tellMaster = false)) { + throw new SparkException(s"Failed to store $broadcastId in BlockManager") + } + + if (obj != null) { + broadcastCache.put(broadcastId, obj) + } + + obj + } finally { + blocks.foreach(_.dispose()) + } + } } } } diff --git a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala index 159629825c677..9ad2e9a5e74ac 100644 --- a/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala +++ b/core/src/test/scala/org/apache/spark/broadcast/BroadcastSuite.scala @@ -153,6 +153,40 @@ class BroadcastSuite extends SparkFunSuite with LocalSparkContext with Encryptio assert(broadcast.value.sum === 10) } + test("One broadcast value instance per executor") { + val conf = new SparkConf() + .setMaster("local[4]") + .setAppName("test") + + sc = new SparkContext(conf) + val list = List[Int](1, 2, 3, 4) + val broadcast = sc.broadcast(list) + val instances = sc.parallelize(1 to 10) + .map(x => System.identityHashCode(broadcast.value)) + .collect() + .toSet + + assert(instances.size === 1) + } + + test("One broadcast value instance per executor when memory is constrained") { + val conf = new SparkConf() + .setMaster("local[4]") + .setAppName("test") + .set("spark.memory.useLegacyMode", "true") + .set("spark.storage.memoryFraction", "0.0") + + sc = new SparkContext(conf) + val list = List[Int](1, 2, 3, 4) + val broadcast = sc.broadcast(list) + val instances = sc.parallelize(1 to 10) + .map(x => System.identityHashCode(broadcast.value)) + .collect() + .toSet + + assert(instances.size === 1) + } + /** * Verify the persistence of state associated with a TorrentBroadcast in a local-cluster. * From a7d98d53ceaf69cabaecc6c9113f17438c4e61f6 Mon Sep 17 00:00:00 2001 From: WeichenXu Date: Fri, 12 Jan 2018 11:27:02 +0200 Subject: [PATCH 320/356] [SPARK-23008][ML][FOLLOW-UP] mark OneHotEncoder python API deprecated ## What changes were proposed in this pull request? mark OneHotEncoder python API deprecated ## How was this patch tested? 
N/A Author: WeichenXu Closes #20241 from WeichenXu123/mark_ohe_deprecated. --- python/pyspark/ml/feature.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index b963e45dd7cff..eb79b193103e2 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1578,6 +1578,9 @@ class OneHotEncoder(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, .. note:: This is different from scikit-learn's OneHotEncoder, which keeps all categories. The output vectors are sparse. + .. note:: Deprecated in 2.3.0. :py:class:`OneHotEncoderEstimator` will be renamed to + :py:class:`OneHotEncoder` and this :py:class:`OneHotEncoder` will be removed in 3.0.0. + .. seealso:: :py:class:`StringIndexer` for converting categorical values into From 505086806997b4331d4a8c2fc5e08345d869a23c Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 12 Jan 2018 18:04:44 +0800 Subject: [PATCH 321/356] [SPARK-23025][SQL] Support Null type in scala reflection ## What changes were proposed in this pull request? Add support for `Null` type in the `schemaFor` method for Scala reflection. ## How was this patch tested? Added UT Author: Marco Gaido Closes #20219 from mgaido91/SPARK-23025. --- .../org/apache/spark/sql/catalyst/ScalaReflection.scala | 4 ++++ .../apache/spark/sql/catalyst/ScalaReflectionSuite.scala | 9 +++++++++ .../test/scala/org/apache/spark/sql/DatasetSuite.scala | 5 +++++ 3 files changed, 18 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 65040f1af4b04..9a4bf0075a178 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -63,6 +63,7 @@ object ScalaReflection extends ScalaReflection { private def dataTypeFor(tpe: `Type`): DataType = cleanUpReflectionObjects { tpe.dealias match { + case t if t <:< definitions.NullTpe => NullType case t if t <:< definitions.IntTpe => IntegerType case t if t <:< definitions.LongTpe => LongType case t if t <:< definitions.DoubleTpe => DoubleType @@ -712,6 +713,9 @@ object ScalaReflection extends ScalaReflection { /** Returns a catalyst DataType and its nullability for the given Scala Type using reflection. 
*/ def schemaFor(tpe: `Type`): Schema = cleanUpReflectionObjects { tpe.dealias match { + // this must be the first case, since all objects in scala are instances of Null, therefore + // Null type would wrongly match the first of them, which is Option as of now + case t if t <:< definitions.NullTpe => Schema(NullType, nullable = true) case t if t.typeSymbol.annotations.exists(_.tree.tpe =:= typeOf[SQLUserDefinedType]) => val udt = getClassFromType(t).getAnnotation(classOf[SQLUserDefinedType]).udt().newInstance() Schema(udt, nullable = true) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala index 23e866cdf4917..8c3db48a01f12 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ScalaReflectionSuite.scala @@ -356,4 +356,13 @@ class ScalaReflectionSuite extends SparkFunSuite { assert(deserializerFor[Int].isInstanceOf[AssertNotNull]) assert(!deserializerFor[String].isInstanceOf[AssertNotNull]) } + + test("SPARK-23025: schemaFor should support Null type") { + val schema = schemaFor[(Int, Null)] + assert(schema === Schema( + StructType(Seq( + StructField("_1", IntegerType, nullable = false), + StructField("_2", NullType, nullable = true))), + nullable = true)) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index d535896723bd5..54893c184642b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1441,6 +1441,11 @@ class DatasetSuite extends QueryTest with SharedSQLContext { assert(e.getCause.isInstanceOf[NullPointerException]) } } + + test("SPARK-23025: Add support for null type in scala reflection") { + val data = Seq(("a", null)) + checkDataset(data.toDS(), data: _*) + } } case class SingleData(id: Int) From f5300fbbe370af3741560f67bfb5ae6f0b0f7bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matthias=20Beaup=C3=A8re?= Date: Fri, 12 Jan 2018 08:29:46 -0600 Subject: [PATCH 322/356] Update rdd-programming-guide.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What changes were proposed in this pull request? Small typing correction - double word ## How was this patch tested? Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Matthias Beaupère Closes #20212 from matthiasbe/patch-1. --- docs/rdd-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/rdd-programming-guide.md b/docs/rdd-programming-guide.md index 29af159510e46..2e29aef7f21a2 100644 --- a/docs/rdd-programming-guide.md +++ b/docs/rdd-programming-guide.md @@ -91,7 +91,7 @@ so C libraries like NumPy can be used. It also works with PyPy 2.3+. Python 2.6 support was removed in Spark 2.2.0. 
-Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including including it in your setup.py as: +Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as: {% highlight python %} install_requires=[ From 651f76153f5e9b185aaf593161d40cabe7994fea Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sat, 13 Jan 2018 00:37:59 +0800 Subject: [PATCH 323/356] [SPARK-23028] Bump master branch version to 2.4.0-SNAPSHOT ## What changes were proposed in this pull request? This patch bumps the master branch version to `2.4.0-SNAPSHOT`. ## How was this patch tested? N/A Author: gatorsmile Closes #20222 from gatorsmile/bump24. --- R/pkg/DESCRIPTION | 2 +- assembly/pom.xml | 2 +- common/kvstore/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml | 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- dev/run-tests-jenkins.py | 4 ++-- docs/_config.yml | 4 ++-- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10-sql/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml | 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- hadoop-cloud/pom.xml | 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- project/MimaExcludes.scala | 5 +++++ python/pyspark/version.py | 2 +- repl/pom.xml | 2 +- resource-managers/kubernetes/core/pom.xml | 2 +- resource-managers/mesos/pom.xml | 2 +- resource-managers/yarn/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- 43 files changed, 49 insertions(+), 44 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 6d46c31906260..855eb5bf77f16 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 2.3.0 +Version: 2.4.0 Title: R Frontend for Apache Spark Description: Provides an R Frontend for Apache Spark. 
Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), diff --git a/assembly/pom.xml b/assembly/pom.xml index b3b4239771bc3..a207dae5a74ff 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index cf93d41cd77cf..8c148359c3029 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 18cbdadd224ab..8ca7733507f1b 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 9968480ab7658..05335df61a664 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index ec2db6e5bb88c..564e6583c909e 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 2d59c71cc3757..2f04abe8c7e88 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/common/tags/pom.xml b/common/tags/pom.xml index f7e586ee777e1..ba127408e1c59 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/common/unsafe/pom.xml b/common/unsafe/pom.xml index a3772a2620088..1527854730394 100644 --- a/common/unsafe/pom.xml +++ b/common/unsafe/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/core/pom.xml b/core/pom.xml index 0a5bd958fc9c5..9258a856028a0 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py index 914eb93622d51..3960a0de62530 100755 --- a/dev/run-tests-jenkins.py +++ b/dev/run-tests-jenkins.py @@ -181,8 +181,8 @@ def main(): short_commit_hash = ghprb_actual_commit[0:7] # format: http://linux.die.net/man/1/timeout - # must be less than the timeout configured on Jenkins (currently 300m) - tests_timeout = "250m" + # must be less than the timeout configured on Jenkins (currently 350m) + tests_timeout = "300m" # Array to capture all test names to run on the pull request. These tests are represented # by their file equivalents in the dev/tests/ directory. diff --git a/docs/_config.yml b/docs/_config.yml index dcc211204d766..095fadb93fe5d 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,8 +14,8 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. 
-SPARK_VERSION: 2.3.0-SNAPSHOT -SPARK_VERSION_SHORT: 2.3.0 +SPARK_VERSION: 2.4.0-SNAPSHOT +SPARK_VERSION_SHORT: 2.4.0 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.8" MESOS_VERSION: 1.0.0 diff --git a/examples/pom.xml b/examples/pom.xml index 1791dbaad775e..868110b8e35ef 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/external/docker-integration-tests/pom.xml b/external/docker-integration-tests/pom.xml index 485b562dce990..431339d412194 100644 --- a/external/docker-integration-tests/pom.xml +++ b/external/docker-integration-tests/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 71016bc645ca7..7cd1ec4c9c09a 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 12630840e79dc..f810aa80e8780 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 87a09642405a7..498e88f665eb5 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-assembly/pom.xml b/external/kafka-0-10-assembly/pom.xml index d6f97316b326a..a742b8d6dbddb 100644 --- a/external/kafka-0-10-assembly/pom.xml +++ b/external/kafka-0-10-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10-sql/pom.xml b/external/kafka-0-10-sql/pom.xml index 0c9f0aa765a39..16bbc6db641ca 100644 --- a/external/kafka-0-10-sql/pom.xml +++ b/external/kafka-0-10-sql/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-10/pom.xml b/external/kafka-0-10/pom.xml index 6eb7ba5f0092d..3b124b2a69d50 100644 --- a/external/kafka-0-10/pom.xml +++ b/external/kafka-0-10/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8-assembly/pom.xml b/external/kafka-0-8-assembly/pom.xml index 786349474389b..41bc8b3e3ee1f 100644 --- a/external/kafka-0-8-assembly/pom.xml +++ b/external/kafka-0-8-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/kafka-0-8/pom.xml b/external/kafka-0-8/pom.xml index 849c8b465f99e..6d1c4789f382d 100644 --- a/external/kafka-0-8/pom.xml +++ b/external/kafka-0-8/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl-assembly/pom.xml b/external/kinesis-asl-assembly/pom.xml index 48783d65826aa..37c7d1e604ec5 100644 --- a/external/kinesis-asl-assembly/pom.xml +++ b/external/kinesis-asl-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/kinesis-asl/pom.xml b/external/kinesis-asl/pom.xml index 40a751a652fa9..4915893965595 100644 --- 
a/external/kinesis-asl/pom.xml +++ b/external/kinesis-asl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/external/spark-ganglia-lgpl/pom.xml b/external/spark-ganglia-lgpl/pom.xml index 36d555066b181..027157e53d511 100644 --- a/external/spark-ganglia-lgpl/pom.xml +++ b/external/spark-ganglia-lgpl/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/graphx/pom.xml b/graphx/pom.xml index cb30e4a4af4bc..fbe77fcb958d5 100644 --- a/graphx/pom.xml +++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/hadoop-cloud/pom.xml b/hadoop-cloud/pom.xml index aa36dd4774d86..8e424b1c50236 100644 --- a/hadoop-cloud/pom.xml +++ b/hadoop-cloud/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/launcher/pom.xml b/launcher/pom.xml index e9b46c4cf0ffa..912eb6b6d2a08 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index 043d13609fd26..53286fe93478d 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/mllib/pom.xml b/mllib/pom.xml index a906c9e02cd4c..f07d7f24fd312 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index 1b37164376460..d14594aa4ccb0 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT pom Spark Project Parent POM http://spark.apache.org/ diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 3b452f35c5ec1..32eb31f495979 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -34,6 +34,10 @@ import com.typesafe.tools.mima.core.ProblemFilters._ */ object MimaExcludes { + // Exclude rules for 2.4.x + lazy val v24excludes = v23excludes ++ Seq( + ) + // Exclude rules for 2.3.x lazy val v23excludes = v22excludes ++ Seq( // [SPARK-22897] Expose stageAttemptId in TaskContext @@ -1082,6 +1086,7 @@ object MimaExcludes { } def excludes(version: String) = version match { + case v if v.startsWith("2.4") => v24excludes case v if v.startsWith("2.3") => v23excludes case v if v.startsWith("2.2") => v22excludes case v if v.startsWith("2.1") => v21excludes diff --git a/python/pyspark/version.py b/python/pyspark/version.py index 12dd53b9d2902..b9c2c4ced71d5 100644 --- a/python/pyspark/version.py +++ b/python/pyspark/version.py @@ -16,4 +16,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "2.3.0.dev0" +__version__ = "2.4.0.dev0" diff --git a/repl/pom.xml b/repl/pom.xml index 1cb0098d0eca3..6f4a863c48bc7 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml index 7d35aea8a4142..a62f271273465 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/mesos/pom.xml b/resource-managers/mesos/pom.xml index 70d0c1750b14e..3995d0afeb5f4 100644 --- a/resource-managers/mesos/pom.xml +++ b/resource-managers/mesos/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/resource-managers/yarn/pom.xml b/resource-managers/yarn/pom.xml index 43a7ce95bd3de..37e25ceecb883 100644 --- a/resource-managers/yarn/pom.xml +++ b/resource-managers/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 9e2ced30407d4..839b929abd3cb 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 93010c606cf45..744daa6079779 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/sql/hive-thriftserver/pom.xml b/sql/hive-thriftserver/pom.xml index 3135a8a275dae..9f247f9224c75 100644 --- a/sql/hive-thriftserver/pom.xml +++ b/sql/hive-thriftserver/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/sql/hive/pom.xml b/sql/hive/pom.xml index 66fad85ea0263..c55ba32fa458c 100644 --- a/sql/hive/pom.xml +++ b/sql/hive/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/streaming/pom.xml b/streaming/pom.xml index fea882ad11230..4497e53b65984 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml diff --git a/tools/pom.xml b/tools/pom.xml index 37427e8da62d8..242219e29f50f 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../pom.xml From 7bd14cfd40500a0b6462cda647bdbb686a430328 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 12 Jan 2018 10:18:42 -0800 Subject: [PATCH 324/356] [MINOR][BUILD] Fix Java linter errors ## What changes were proposed in this pull request? This PR cleans up the java-lint errors (for v2.3.0-rc1 tag). Hopefully, this will be the final one. ``` $ dev/lint-java Using `mvn` from path: /usr/local/bin/mvn Checkstyle checks failed at following occurrences: [ERROR] src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java:[85] (sizes) LineLength: Line is longer than 100 characters (found 101). [ERROR] src/main/java/org/apache/spark/launcher/InProcessAppHandle.java:[20,8] (imports) UnusedImports: Unused import - java.io.IOException. 
[ERROR] src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java:[41,9] (modifier) ModifierOrder: 'private' modifier out of order with the JLS suggestions. [ERROR] src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java:[464] (sizes) LineLength: Line is longer than 100 characters (found 102). ``` ## How was this patch tested? Manual. ``` $ dev/lint-java Using `mvn` from path: /usr/local/bin/mvn Checkstyle checks passed. ``` Author: Dongjoon Hyun Closes #20242 from dongjoon-hyun/fix_lint_java_2.3_rc1. --- .../org/apache/spark/unsafe/memory/HeapMemoryAllocator.java | 3 ++- .../java/org/apache/spark/launcher/InProcessAppHandle.java | 1 - .../spark/sql/execution/datasources/orc/OrcColumnVector.java | 2 +- .../java/test/org/apache/spark/sql/JavaDataFrameSuite.java | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java index 3acfe3696cb1e..a9603c1aba051 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/memory/HeapMemoryAllocator.java @@ -82,7 +82,8 @@ public void free(MemoryBlock memory) { "page has already been freed"; assert ((memory.pageNumber == MemoryBlock.NO_PAGE_NUMBER) || (memory.pageNumber == MemoryBlock.FREED_IN_TMM_PAGE_NUMBER)) : - "TMM-allocated pages must first be freed via TMM.freePage(), not directly in allocator free()"; + "TMM-allocated pages must first be freed via TMM.freePage(), not directly in allocator " + + "free()"; final long size = memory.size(); if (MemoryAllocator.MEMORY_DEBUG_FILL_ENABLED) { diff --git a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java index 0d6a73a3da3ed..acd64c962604f 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java +++ b/launcher/src/main/java/org/apache/spark/launcher/InProcessAppHandle.java @@ -17,7 +17,6 @@ package org.apache.spark.launcher; -import java.io.IOException; import java.lang.reflect.Method; import java.util.concurrent.atomic.AtomicLong; import java.util.logging.Level; diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java index f94c55d860304..b6e792274da11 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/orc/OrcColumnVector.java @@ -38,7 +38,7 @@ public class OrcColumnVector extends org.apache.spark.sql.vectorized.ColumnVecto private BytesColumnVector bytesData; private DecimalColumnVector decimalData; private TimestampColumnVector timestampData; - final private boolean isTimestamp; + private final boolean isTimestamp; private int batchSize; diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index 4f8a31f185724..69a2904f5f3fe 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -461,7 +461,8 @@ public void testCircularReferenceBean() { public void testUDF() { UserDefinedFunction foo = udf((Integer 
i, String s) -> i.toString() + s, DataTypes.StringType); Dataset df = spark.table("testData").select(foo.apply(col("key"), col("value"))); - String[] result = df.collectAsList().stream().map(row -> row.getString(0)).toArray(String[]::new); + String[] result = df.collectAsList().stream().map(row -> row.getString(0)) + .toArray(String[]::new); String[] expected = spark.table("testData").collectAsList().stream() .map(row -> row.get(0).toString() + row.getString(1)).toArray(String[]::new); Assert.assertArrayEquals(expected, result); From 54277398afbde92a38ba2802f4a7a3e5910533de Mon Sep 17 00:00:00 2001 From: Marco Gaido Date: Fri, 12 Jan 2018 11:25:37 -0800 Subject: [PATCH 325/356] [SPARK-22975][SS] MetricsReporter should not throw exception when there was no progress reported ## What changes were proposed in this pull request? `MetricsReporter ` assumes that there has been some progress for the query, ie. `lastProgress` is not null. If this is not true, as it might happen in particular conditions, a `NullPointerException` can be thrown. The PR checks whether there is a `lastProgress` and if this is not true, it returns a default value for the metrics. ## How was this patch tested? added UT Author: Marco Gaido Closes #20189 from mgaido91/SPARK-22975. --- .../execution/streaming/MetricsReporter.scala | 21 ++++++++--------- .../sql/streaming/StreamingQuerySuite.scala | 23 +++++++++++++++++++ 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala index b84e6ce64c611..66b11ecddf233 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MetricsReporter.scala @@ -17,15 +17,11 @@ package org.apache.spark.sql.execution.streaming -import java.{util => ju} - -import scala.collection.mutable - import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.spark.internal.Logging import org.apache.spark.metrics.source.{Source => CodahaleSource} -import org.apache.spark.util.Clock +import org.apache.spark.sql.streaming.StreamingQueryProgress /** * Serves metrics from a [[org.apache.spark.sql.streaming.StreamingQuery]] to @@ -39,14 +35,17 @@ class MetricsReporter( // Metric names should not have . 
in them, so that all the metrics of a query are identified // together in Ganglia as a single metric group - registerGauge("inputRate-total", () => stream.lastProgress.inputRowsPerSecond) - registerGauge("processingRate-total", () => stream.lastProgress.processedRowsPerSecond) - registerGauge("latency", () => stream.lastProgress.durationMs.get("triggerExecution").longValue()) - - private def registerGauge[T](name: String, f: () => T)(implicit num: Numeric[T]): Unit = { + registerGauge("inputRate-total", _.inputRowsPerSecond, 0.0) + registerGauge("processingRate-total", _.processedRowsPerSecond, 0.0) + registerGauge("latency", _.durationMs.get("triggerExecution").longValue(), 0L) + + private def registerGauge[T]( + name: String, + f: StreamingQueryProgress => T, + default: T): Unit = { synchronized { metricRegistry.register(name, new Gauge[T] { - override def getValue: T = f() + override def getValue: T = Option(stream.lastProgress).map(f).getOrElse(default) }) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala index 2fa4595dab376..76201c63a2701 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingQuerySuite.scala @@ -424,6 +424,29 @@ class StreamingQuerySuite extends StreamTest with BeforeAndAfter with Logging wi } } + test("SPARK-22975: MetricsReporter defaults when there was no progress reported") { + withSQLConf("spark.sql.streaming.metricsEnabled" -> "true") { + BlockingSource.latch = new CountDownLatch(1) + withTempDir { tempDir => + val sq = spark.readStream + .format("org.apache.spark.sql.streaming.util.BlockingSource") + .load() + .writeStream + .format("org.apache.spark.sql.streaming.util.BlockingSource") + .option("checkpointLocation", tempDir.toString) + .start() + .asInstanceOf[StreamingQueryWrapper] + .streamingQuery + + val gauges = sq.streamMetrics.metricRegistry.getGauges + assert(gauges.get("latency").getValue.asInstanceOf[Long] == 0) + assert(gauges.get("processingRate-total").getValue.asInstanceOf[Double] == 0.0) + assert(gauges.get("inputRate-total").getValue.asInstanceOf[Double] == 0.0) + sq.stop() + } + } + } + test("input row calculation with mixed batch and streaming sources") { val streamingTriggerDF = spark.createDataset(1 to 10).toDF val streamingInputDF = createSingleTriggerStreamingDF(streamingTriggerDF).toDF("value") From 66dd9cbd1fdd29bfd6156cb27b35573571feed0c Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Fri, 12 Jan 2018 19:15:24 +0000 Subject: [PATCH 326/356] Resolve conflicts keeping our k8s code --- .travis.yml | 5 - .../org/apache/spark/deploy/SparkSubmit.scala | 36 +- .../spark/internal/config/package.scala | 8 - dev/deps/spark-deps-hadoop-2.6 | 190 ------ dev/deps/spark-deps-hadoop-2.7 | 191 ------ dev/make-distribution.sh | 4 - dev/sparktestsupport/modules.py | 7 - docs/running-on-kubernetes.md | 569 ------------------ .../spark/sql/kafka010/KafkaSourceSuite.scala | 18 - pom.xml | 22 +- project/SparkBuild.scala | 6 - resource-managers/kubernetes/core/pom.xml | 32 - .../k8s/PodWithDetachedInitContainer.scala | 10 - .../k8s/SparkKubernetesClientFactory.scala | 64 -- .../k8s/submit/LoggingPodStatusWatcher.scala | 82 --- .../deploy/k8s/submit/MainAppResource.scala | 3 - .../cluster/k8s/ExecutorPodFactory.scala | 176 ------ .../k8s/KubernetesClusterManager.scala | 102 ---- 
.../KubernetesClusterSchedulerBackend.scala | 322 ---------- .../spark/deploy/k8s/submit/ClientSuite.scala | 76 --- .../cluster/k8s/ExecutorPodFactorySuite.scala | 139 ----- ...bernetesClusterSchedulerBackendSuite.scala | 311 ---------- .../parquet/VectorizedColumnReader.java | 6 +- 23 files changed, 9 insertions(+), 2370 deletions(-) delete mode 100644 dev/deps/spark-deps-hadoop-2.6 delete mode 100644 dev/deps/spark-deps-hadoop-2.7 diff --git a/.travis.yml b/.travis.yml index 8db451a769a91..3934ac1d99e32 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,14 +55,9 @@ notifications: # 5. Run maven build before running lints. install: - export MAVEN_SKIP_RC=1 -<<<<<<< HEAD - - build/mvn ${PHASE} ${PROFILES} ${MODULES} ${ARGS} -# 6. Run lints. -======= - build/mvn -T 4 -q -DskipTests -Pkubernetes -Pmesos -Pyarn -Pkinesis-asl -Phive -Phive-thriftserver install # 6. Run lint-java. ->>>>>>> master script: - dev/lint-java - dev/lint-scala diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index c866190395b62..33932a4833906 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -18,13 +18,13 @@ package org.apache.spark.deploy import java.io._ -import java.lang.reflect.{InvocationTargetException, Modifier, UndeclaredThrowableException} +import java.lang.reflect.{InvocationTargetException, UndeclaredThrowableException} import java.net.URL import java.security.PrivilegedExceptionAction import java.text.ParseException import scala.annotation.tailrec -import scala.collection.mutable.{ArrayBuffer, HashMap, Map} +import scala.collection.mutable.ArrayBuffer import scala.util.{Properties, Try} import org.apache.commons.lang3.StringUtils @@ -77,11 +77,7 @@ object SparkSubmit extends CommandLineUtils with Logging { private val MESOS = 4 private val LOCAL = 8 private val KUBERNETES = 16 -<<<<<<< HEAD - private val ALL_CLUSTER_MGRS = YARN | STANDALONE | MESOS | KUBERNETES | LOCAL -======= private val ALL_CLUSTER_MGRS = YARN | STANDALONE | MESOS | LOCAL | KUBERNETES ->>>>>>> master // Deploy modes private val CLIENT = 1 @@ -351,7 +347,6 @@ object SparkSubmit extends CommandLineUtils with Logging { val isMesosCluster = clusterManager == MESOS && deployMode == CLUSTER val isKubernetesCluster = clusterManager == KUBERNETES && deployMode == CLUSTER val isStandAloneCluster = clusterManager == STANDALONE && deployMode == CLUSTER - val isKubernetesCluster = clusterManager == KUBERNETES && deployMode == CLUSTER if (!isMesosCluster && !isStandAloneCluster) { // Resolve maven dependencies if there are any and add classpath to jars. 
Add them to py-files @@ -586,16 +581,11 @@ object SparkSubmit extends CommandLineUtils with Logging { OptionAssigner(args.principal, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.principal"), OptionAssigner(args.keytab, YARN, ALL_DEPLOY_MODES, confKey = "spark.yarn.keytab"), -<<<<<<< HEAD OptionAssigner(args.kubernetesNamespace, KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.kubernetes.namespace"), - // Other options - OptionAssigner(args.executorCores, STANDALONE | YARN, ALL_DEPLOY_MODES, -======= // Other options OptionAssigner(args.executorCores, STANDALONE | YARN | KUBERNETES, ALL_DEPLOY_MODES, ->>>>>>> master confKey = "spark.executor.cores"), OptionAssigner(args.executorMemory, STANDALONE | MESOS | YARN | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.executor.memory"), @@ -606,11 +596,7 @@ object SparkSubmit extends CommandLineUtils with Logging { OptionAssigner(args.jars, LOCAL, CLIENT, confKey = "spark.jars"), OptionAssigner(args.jars, STANDALONE | MESOS | KUBERNETES, ALL_DEPLOY_MODES, confKey = "spark.jars"), -<<<<<<< HEAD - OptionAssigner(args.driverMemory, STANDALONE | MESOS | YARN, CLUSTER, -======= OptionAssigner(args.driverMemory, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER, ->>>>>>> master confKey = "spark.driver.memory"), OptionAssigner(args.driverCores, STANDALONE | MESOS | YARN | KUBERNETES, CLUSTER, confKey = "spark.driver.cores"), @@ -748,8 +734,7 @@ object SparkSubmit extends CommandLineUtils with Logging { } if (isKubernetesCluster) { -<<<<<<< HEAD - childMainClass = "org.apache.spark.deploy.k8s.submit.Client" + childMainClass = KUBERNETES_CLUSTER_SUBMIT_CLASS if (args.isPython) { childArgs ++= Array("--primary-py-file", args.primaryResource) childArgs ++= Array("--main-class", "org.apache.spark.deploy.PythonRunner") @@ -757,23 +742,14 @@ object SparkSubmit extends CommandLineUtils with Logging { childArgs ++= Array("--other-py-files", args.pyFiles) } } else { - childArgs ++= Array("--primary-java-resource", args.primaryResource) + if (args.primaryResource != SparkLauncher.NO_RESOURCE) { + childArgs ++= Array("--primary-java-resource", args.primaryResource) + } childArgs ++= Array("--main-class", args.mainClass) } args.childArgs.foreach { arg => childArgs += "--arg" childArgs += arg -======= - childMainClass = KUBERNETES_CLUSTER_SUBMIT_CLASS - if (args.primaryResource != SparkLauncher.NO_RESOURCE) { - childArgs ++= Array("--primary-java-resource", args.primaryResource) - } - childArgs ++= Array("--main-class", args.mainClass) - if (args.childArgs != null) { - args.childArgs.foreach { arg => - childArgs += ("--arg", arg) - } ->>>>>>> master } } diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index 60d315104b2e1..8d093bb8402e0 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -240,7 +240,6 @@ package object config { .stringConf .createOptional -<<<<<<< HEAD private[spark] val CONDA_BOOTSTRAP_PACKAGES = ConfigBuilder("spark.conda.bootstrapPackages") .doc("The packages that will be added to the conda environment. 
" + "Only relevant when main class is CondaRunner.") @@ -272,13 +271,6 @@ package object config { .toSequence .createWithDefault(Nil) - // To limit memory usage, we only track information for a fixed number of tasks - private[spark] val UI_RETAINED_TASKS = ConfigBuilder("spark.ui.retainedTasks") - .intConf - .createWithDefault(100000) - -======= ->>>>>>> master // To limit how many applications are shown in the History Server summary ui private[spark] val HISTORY_UI_MAX_APPS = ConfigBuilder("spark.history.ui.maxApplications").intConf.createWithDefault(Integer.MAX_VALUE) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 deleted file mode 100644 index a7fce2ede0ea5..0000000000000 --- a/dev/deps/spark-deps-hadoop-2.6 +++ /dev/null @@ -1,190 +0,0 @@ -JavaEWAH-0.3.2.jar -RoaringBitmap-0.5.11.jar -ST4-4.0.4.jar -activation-1.1.1.jar -aircompressor-0.8.jar -antlr-2.7.7.jar -antlr-runtime-3.4.jar -antlr4-runtime-4.7.jar -aopalliance-1.0.jar -aopalliance-repackaged-2.4.0-b34.jar -apache-log4j-extras-1.2.17.jar -apacheds-i18n-2.0.0-M15.jar -apacheds-kerberos-codec-2.0.0-M15.jar -api-asn1-api-1.0.0-M20.jar -api-util-1.0.0-M20.jar -arpack_combined_all-0.1.jar -arrow-format-0.8.0.jar -arrow-memory-0.8.0.jar -arrow-vector-0.8.0.jar -avro-1.7.7.jar -avro-ipc-1.7.7.jar -avro-mapred-1.7.7-hadoop2.jar -base64-2.3.8.jar -bcprov-jdk15on-1.58.jar -bonecp-0.8.0.RELEASE.jar -breeze-macros_2.11-0.13.2.jar -breeze_2.11-0.13.2.jar -calcite-avatica-1.2.0-incubating.jar -calcite-core-1.2.0-incubating.jar -calcite-linq4j-1.2.0-incubating.jar -chill-java-0.8.4.jar -chill_2.11-0.8.4.jar -commons-beanutils-1.7.0.jar -commons-beanutils-core-1.8.0.jar -commons-cli-1.2.jar -commons-codec-1.10.jar -commons-collections-3.2.2.jar -commons-compiler-3.0.8.jar -commons-compress-1.4.1.jar -commons-configuration-1.6.jar -commons-crypto-1.0.0.jar -commons-dbcp-1.4.jar -commons-digester-1.8.jar -commons-httpclient-3.1.jar -commons-io-2.4.jar -commons-lang-2.6.jar -commons-lang3-3.5.jar -commons-logging-1.1.3.jar -commons-math3-3.4.1.jar -commons-net-2.2.jar -commons-pool-1.5.4.jar -compress-lzf-1.0.3.jar -core-1.1.2.jar -curator-client-2.6.0.jar -curator-framework-2.6.0.jar -curator-recipes-2.6.0.jar -datanucleus-api-jdo-3.2.6.jar -datanucleus-core-3.2.10.jar -datanucleus-rdbms-3.2.9.jar -derby-10.12.1.1.jar -eigenbase-properties-1.1.5.jar -flatbuffers-1.2.0-3f79e055.jar -gson-2.2.4.jar -guava-14.0.1.jar -guice-3.0.jar -guice-servlet-3.0.jar -hadoop-annotations-2.6.5.jar -hadoop-auth-2.6.5.jar -hadoop-client-2.6.5.jar -hadoop-common-2.6.5.jar -hadoop-hdfs-2.6.5.jar -hadoop-mapreduce-client-app-2.6.5.jar -hadoop-mapreduce-client-common-2.6.5.jar -hadoop-mapreduce-client-core-2.6.5.jar -hadoop-mapreduce-client-jobclient-2.6.5.jar -hadoop-mapreduce-client-shuffle-2.6.5.jar -hadoop-yarn-api-2.6.5.jar -hadoop-yarn-client-2.6.5.jar -hadoop-yarn-common-2.6.5.jar -hadoop-yarn-server-common-2.6.5.jar -hadoop-yarn-server-web-proxy-2.6.5.jar -hk2-api-2.4.0-b34.jar -hk2-locator-2.4.0-b34.jar -hk2-utils-2.4.0-b34.jar -hppc-0.7.2.jar -htrace-core-3.0.4.jar -httpclient-4.5.4.jar -httpcore-4.4.8.jar -ivy-2.4.0.jar -jackson-annotations-2.6.7.jar -jackson-core-2.6.7.jar -jackson-core-asl-1.9.13.jar -jackson-databind-2.6.7.1.jar -jackson-jaxrs-1.9.13.jar -jackson-mapper-asl-1.9.13.jar -jackson-module-paranamer-2.7.9.jar -jackson-module-scala_2.11-2.6.7.1.jar -jackson-xc-1.9.13.jar -janino-3.0.8.jar -java-xmlbuilder-1.1.jar -javassist-3.18.1-GA.jar -javax.annotation-api-1.2.jar -javax.inject-1.jar -javax.inject-2.4.0-b34.jar 
-javax.servlet-api-3.1.0.jar -javax.ws.rs-api-2.0.1.jar -javolution-5.5.1.jar -jaxb-api-2.2.2.jar -jcl-over-slf4j-1.7.16.jar -jdo-api-3.0.1.jar -jersey-client-2.22.2.jar -jersey-common-2.22.2.jar -jersey-container-servlet-2.22.2.jar -jersey-container-servlet-core-2.22.2.jar -jersey-guava-2.22.2.jar -jersey-media-jaxb-2.22.2.jar -jersey-server-2.22.2.jar -jets3t-0.9.4.jar -jetty-6.1.26.jar -jetty-util-6.1.26.jar -jline-2.12.1.jar -joda-time-2.9.3.jar -jodd-core-3.5.2.jar -jpam-1.1.jar -json4s-ast_2.11-3.2.11.jar -json4s-core_2.11-3.2.11.jar -json4s-jackson_2.11-3.2.11.jar -jsr305-1.3.9.jar -jta-1.1.jar -jtransforms-2.4.0.jar -jul-to-slf4j-1.7.16.jar -kryo-shaded-3.0.3.jar -leveldbjni-all-1.8.jar -libfb303-0.9.3.jar -libthrift-0.9.3.jar -log4j-1.2.17.jar -lz4-java-1.4.0.jar -machinist_2.11-0.6.1.jar -macro-compat_2.11-1.1.1.jar -mesos-1.4.0-shaded-protobuf.jar -metrics-core-3.1.5.jar -metrics-graphite-3.1.5.jar -metrics-json-3.1.5.jar -metrics-jvm-3.1.5.jar -minlog-1.3.0.jar -netty-3.9.9.Final.jar -netty-all-4.1.17.Final.jar -objenesis-2.1.jar -opencsv-2.3.jar -orc-core-1.4.1-nohive.jar -orc-mapreduce-1.4.1-nohive.jar -oro-2.0.8.jar -osgi-resource-locator-1.0.1.jar -paranamer-2.8.jar -parquet-column-1.8.2.jar -parquet-common-1.8.2.jar -parquet-encoding-1.8.2.jar -parquet-format-2.3.1.jar -parquet-hadoop-1.8.2.jar -parquet-hadoop-bundle-1.6.0.jar -parquet-jackson-1.8.2.jar -protobuf-java-2.5.0.jar -py4j-0.10.6.jar -pyrolite-4.13.jar -scala-compiler-2.11.8.jar -scala-library-2.11.8.jar -scala-parser-combinators_2.11-1.0.4.jar -scala-reflect-2.11.8.jar -scala-xml_2.11-1.0.5.jar -scalap-2.11.8.jar -shapeless_2.11-2.3.2.jar -slf4j-api-1.7.16.jar -slf4j-log4j12-1.7.16.jar -snappy-0.2.jar -snappy-java-1.1.2.6.jar -spire-macros_2.11-0.13.0.jar -spire_2.11-0.13.0.jar -stax-api-1.0-2.jar -stax-api-1.0.1.jar -stream-2.7.0.jar -stringtemplate-3.2.1.jar -super-csv-2.2.0.jar -univocity-parsers-2.5.9.jar -validation-api-1.1.0.Final.jar -xbean-asm5-shaded-4.4.jar -xercesImpl-2.9.1.jar -xmlenc-0.52.jar -xz-1.0.jar -zookeeper-3.4.6.jar -zstd-jni-1.3.2-2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 deleted file mode 100644 index 94b2e98d85e74..0000000000000 --- a/dev/deps/spark-deps-hadoop-2.7 +++ /dev/null @@ -1,191 +0,0 @@ -JavaEWAH-0.3.2.jar -RoaringBitmap-0.5.11.jar -ST4-4.0.4.jar -activation-1.1.1.jar -aircompressor-0.8.jar -antlr-2.7.7.jar -antlr-runtime-3.4.jar -antlr4-runtime-4.7.jar -aopalliance-1.0.jar -aopalliance-repackaged-2.4.0-b34.jar -apache-log4j-extras-1.2.17.jar -apacheds-i18n-2.0.0-M15.jar -apacheds-kerberos-codec-2.0.0-M15.jar -api-asn1-api-1.0.0-M20.jar -api-util-1.0.0-M20.jar -arpack_combined_all-0.1.jar -arrow-format-0.8.0.jar -arrow-memory-0.8.0.jar -arrow-vector-0.8.0.jar -avro-1.7.7.jar -avro-ipc-1.7.7.jar -avro-mapred-1.7.7-hadoop2.jar -base64-2.3.8.jar -bcprov-jdk15on-1.58.jar -bonecp-0.8.0.RELEASE.jar -breeze-macros_2.11-0.13.2.jar -breeze_2.11-0.13.2.jar -calcite-avatica-1.2.0-incubating.jar -calcite-core-1.2.0-incubating.jar -calcite-linq4j-1.2.0-incubating.jar -chill-java-0.8.4.jar -chill_2.11-0.8.4.jar -commons-beanutils-1.7.0.jar -commons-beanutils-core-1.8.0.jar -commons-cli-1.2.jar -commons-codec-1.10.jar -commons-collections-3.2.2.jar -commons-compiler-3.0.8.jar -commons-compress-1.4.1.jar -commons-configuration-1.6.jar -commons-crypto-1.0.0.jar -commons-dbcp-1.4.jar -commons-digester-1.8.jar -commons-httpclient-3.1.jar -commons-io-2.4.jar -commons-lang-2.6.jar -commons-lang3-3.5.jar -commons-logging-1.1.3.jar -commons-math3-3.4.1.jar 
-commons-net-2.2.jar -commons-pool-1.5.4.jar -compress-lzf-1.0.3.jar -core-1.1.2.jar -curator-client-2.7.1.jar -curator-framework-2.7.1.jar -curator-recipes-2.7.1.jar -datanucleus-api-jdo-3.2.6.jar -datanucleus-core-3.2.10.jar -datanucleus-rdbms-3.2.9.jar -derby-10.12.1.1.jar -eigenbase-properties-1.1.5.jar -flatbuffers-1.2.0-3f79e055.jar -gson-2.2.4.jar -guava-14.0.1.jar -guice-3.0.jar -guice-servlet-3.0.jar -hadoop-annotations-2.7.3.jar -hadoop-auth-2.7.3.jar -hadoop-client-2.7.3.jar -hadoop-common-2.7.3.jar -hadoop-hdfs-2.7.3.jar -hadoop-mapreduce-client-app-2.7.3.jar -hadoop-mapreduce-client-common-2.7.3.jar -hadoop-mapreduce-client-core-2.7.3.jar -hadoop-mapreduce-client-jobclient-2.7.3.jar -hadoop-mapreduce-client-shuffle-2.7.3.jar -hadoop-yarn-api-2.7.3.jar -hadoop-yarn-client-2.7.3.jar -hadoop-yarn-common-2.7.3.jar -hadoop-yarn-server-common-2.7.3.jar -hadoop-yarn-server-web-proxy-2.7.3.jar -hk2-api-2.4.0-b34.jar -hk2-locator-2.4.0-b34.jar -hk2-utils-2.4.0-b34.jar -hppc-0.7.2.jar -htrace-core-3.1.0-incubating.jar -httpclient-4.5.4.jar -httpcore-4.4.8.jar -ivy-2.4.0.jar -jackson-annotations-2.6.7.jar -jackson-core-2.6.7.jar -jackson-core-asl-1.9.13.jar -jackson-databind-2.6.7.1.jar -jackson-jaxrs-1.9.13.jar -jackson-mapper-asl-1.9.13.jar -jackson-module-paranamer-2.7.9.jar -jackson-module-scala_2.11-2.6.7.1.jar -jackson-xc-1.9.13.jar -janino-3.0.8.jar -java-xmlbuilder-1.1.jar -javassist-3.18.1-GA.jar -javax.annotation-api-1.2.jar -javax.inject-1.jar -javax.inject-2.4.0-b34.jar -javax.servlet-api-3.1.0.jar -javax.ws.rs-api-2.0.1.jar -javolution-5.5.1.jar -jaxb-api-2.2.2.jar -jcl-over-slf4j-1.7.16.jar -jdo-api-3.0.1.jar -jersey-client-2.22.2.jar -jersey-common-2.22.2.jar -jersey-container-servlet-2.22.2.jar -jersey-container-servlet-core-2.22.2.jar -jersey-guava-2.22.2.jar -jersey-media-jaxb-2.22.2.jar -jersey-server-2.22.2.jar -jets3t-0.9.4.jar -jetty-6.1.26.jar -jetty-util-6.1.26.jar -jline-2.12.1.jar -joda-time-2.9.3.jar -jodd-core-3.5.2.jar -jpam-1.1.jar -json4s-ast_2.11-3.2.11.jar -json4s-core_2.11-3.2.11.jar -json4s-jackson_2.11-3.2.11.jar -jsp-api-2.1.jar -jsr305-1.3.9.jar -jta-1.1.jar -jtransforms-2.4.0.jar -jul-to-slf4j-1.7.16.jar -kryo-shaded-3.0.3.jar -leveldbjni-all-1.8.jar -libfb303-0.9.3.jar -libthrift-0.9.3.jar -log4j-1.2.17.jar -lz4-java-1.4.0.jar -machinist_2.11-0.6.1.jar -macro-compat_2.11-1.1.1.jar -mesos-1.4.0-shaded-protobuf.jar -metrics-core-3.1.5.jar -metrics-graphite-3.1.5.jar -metrics-json-3.1.5.jar -metrics-jvm-3.1.5.jar -minlog-1.3.0.jar -netty-3.9.9.Final.jar -netty-all-4.1.17.Final.jar -objenesis-2.1.jar -opencsv-2.3.jar -orc-core-1.4.1-nohive.jar -orc-mapreduce-1.4.1-nohive.jar -oro-2.0.8.jar -osgi-resource-locator-1.0.1.jar -paranamer-2.8.jar -parquet-column-1.8.2.jar -parquet-common-1.8.2.jar -parquet-encoding-1.8.2.jar -parquet-format-2.3.1.jar -parquet-hadoop-1.8.2.jar -parquet-hadoop-bundle-1.6.0.jar -parquet-jackson-1.8.2.jar -protobuf-java-2.5.0.jar -py4j-0.10.6.jar -pyrolite-4.13.jar -scala-compiler-2.11.8.jar -scala-library-2.11.8.jar -scala-parser-combinators_2.11-1.0.4.jar -scala-reflect-2.11.8.jar -scala-xml_2.11-1.0.5.jar -scalap-2.11.8.jar -shapeless_2.11-2.3.2.jar -slf4j-api-1.7.16.jar -slf4j-log4j12-1.7.16.jar -snappy-0.2.jar -snappy-java-1.1.2.6.jar -spire-macros_2.11-0.13.0.jar -spire_2.11-0.13.0.jar -stax-api-1.0-2.jar -stax-api-1.0.1.jar -stream-2.7.0.jar -stringtemplate-3.2.1.jar -super-csv-2.2.0.jar -univocity-parsers-2.5.9.jar -validation-api-1.1.0.Final.jar -xbean-asm5-shaded-4.4.jar -xercesImpl-2.9.1.jar -xmlenc-0.52.jar 
-xz-1.0.jar -zookeeper-3.4.6.jar -zstd-jni-1.3.2-2.jar diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 2ecf35b632524..eeb3dc0a93051 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -178,16 +178,12 @@ echo "Build flags: $@" >> "$DISTDIR/RELEASE" # Copy jars cp "$SPARK_HOME"/assembly/target/scala*/jars/* "$DISTDIR/jars/" -<<<<<<< HEAD # Copy docker files mkdir -p "$DISTDIR/dockerfiles" DOCKERFILES_SRC="$SPARK_HOME/resource-managers/kubernetes/docker-minimal-bundle/src/main/docker" cp -R "$DOCKERFILES_SRC/." "$DISTDIR/dockerfiles/." -# Only create the yarn directory if the yarn artifacts were build. -======= # Only create the yarn directory if the yarn artifacts were built. ->>>>>>> master if [ -f "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar ]; then mkdir "$DISTDIR/yarn" cp "$SPARK_HOME"/common/network-yarn/target/scala*/spark-*-yarn-shuffle.jar "$DISTDIR/yarn" diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 1756315e7ee78..2a9fcabcecb06 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -539,16 +539,11 @@ def __hash__(self): kubernetes = Module( name="kubernetes", dependencies=[], -<<<<<<< HEAD source_file_regexes=["resource-managers/kubernetes/core"], -======= - source_file_regexes=["resource-managers/kubernetes"], ->>>>>>> master build_profile_flags=["-Pkubernetes"], sbt_test_goals=["kubernetes/test"] ) -<<<<<<< HEAD cloud = Module( name="hadoop-cloud", dependencies=[], @@ -557,8 +552,6 @@ def __hash__(self): sbt_test_goals=["hadoop-cloud/test"] ) -======= ->>>>>>> master # The root module is a dummy module which is used to run all of the tests. # No other modules should directly depend on this module. root = Module( diff --git a/docs/running-on-kubernetes.md b/docs/running-on-kubernetes.md index bb178d26a7e35..a9a9c49ea3c32 100644 --- a/docs/running-on-kubernetes.md +++ b/docs/running-on-kubernetes.md @@ -2,7 +2,6 @@ layout: global title: Running Spark on Kubernetes --- -<<<<<<< HEAD Support for running on [Kubernetes](https://kubernetes.io/docs/whatisk8s/) is available in experimental status. The feature set is currently limited and not well-tested. This should not be used in production environments. @@ -89,92 +88,12 @@ are set up as described above: --conf spark.kubernetes.executor.docker.image=kubespark/spark-executor:v2.2.0-kubernetes-0.3.0 \ --conf spark.kubernetes.initcontainer.docker.image=kubespark/spark-init:v2.2.0-kubernetes-0.3.0 \ local:///opt/spark/examples/jars/spark-examples_2.11-2.2.0-k8s-0.3.0.jar -======= -* This will become a table of contents (this text will be scraped). -{:toc} - -Spark can run on clusters managed by [Kubernetes](https://kubernetes.io). This feature makes use of native -Kubernetes scheduler that has been added to Spark. - -# Prerequisites - -* A runnable distribution of Spark 2.3 or above. -* A running Kubernetes cluster at version >= 1.6 with access configured to it using -[kubectl](https://kubernetes.io/docs/user-guide/prereqs/). If you do not already have a working Kubernetes cluster, -you may setup a test cluster on your local machine using -[minikube](https://kubernetes.io/docs/getting-started-guides/minikube/). - * We recommend using the latest release of minikube with the DNS addon enabled. - * Be aware that the default minikube configuration is not enough for running Spark applications. - We recommend 3 CPUs and 4g of memory to be able to start a simple Spark application with a single - executor. 
-* You must have appropriate permissions to list, create, edit and delete -[pods](https://kubernetes.io/docs/user-guide/pods/) in your cluster. You can verify that you can list these resources -by running `kubectl auth can-i <list|create|edit|delete> pods`. - * The service account credentials used by the driver pods must be allowed to create pods, services and configmaps. -* You must have [Kubernetes DNS](https://kubernetes.io/docs/concepts/services-networking/dns-pod-service/) configured in your cluster. - -# How it works - -

-  [figure: Spark cluster components]

    - -spark-submit can be directly used to submit a Spark application to a Kubernetes cluster. -The submission mechanism works as follows: - -* Spark creates a Spark driver running within a [Kubernetes pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/). -* The driver creates executors which are also running within Kubernetes pods and connects to them, and executes application code. -* When the application completes, the executor pods terminate and are cleaned up, but the driver pod persists -logs and remains in "completed" state in the Kubernetes API until it's eventually garbage collected or manually cleaned up. - -Note that in the completed state, the driver pod does *not* use any computational or memory resources. - -The driver and executor pod scheduling is handled by Kubernetes. It will be possible to affect Kubernetes scheduling -decisions for driver and executor pods using advanced primitives like -[node selectors](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#nodeselector) -and [node/pod affinities](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity) -in a future release. - -# Submitting Applications to Kubernetes - -## Docker Images - -Kubernetes requires users to supply images that can be deployed into containers within pods. The images are built to -be run in a container runtime environment that Kubernetes supports. Docker is a container runtime environment that is -frequently used with Kubernetes. Spark (starting with version 2.3) ships with a Dockerfile that can be used for this -purpose, or customized to match an individual application's needs. It can be found in the `kubernetes/dockerfiles/` -directory. - -Spark also ships with a `bin/docker-image-tool.sh` script that can be used to build and publish the Docker images to -use with the Kubernetes backend. - -Example usage is: - - ./bin/docker-image-tool.sh -r -t my-tag build - ./bin/docker-image-tool.sh -r -t my-tag push - -## Cluster Mode - -To launch Spark Pi in cluster mode, - -```bash -$ bin/spark-submit \ - --master k8s://https://: \ - --deploy-mode cluster \ - --name spark-pi \ - --class org.apache.spark.examples.SparkPi \ - --conf spark.executor.instances=5 \ - --conf spark.kubernetes.container.image= \ - local:///path/to/examples.jar -``` ->>>>>>> master The Spark master, specified either via passing the `--master` command line argument to `spark-submit` or by setting `spark.master` in the application's configuration, must be a URL with the format `k8s://`. Prefixing the master string with `k8s://` will cause the Spark application to launch on the Kubernetes cluster, with the API server being contacted at `api_server_url`. If no HTTP protocol is specified in the URL, it defaults to `https`. For example, setting the master to `k8s://example.com:443` is equivalent to setting it to `k8s://https://example.com:443`, but to -<<<<<<< HEAD connect without TLS on a different port, the master would be set to `k8s://http://example.com:8443`. If you have a Kubernetes cluster setup, one way to discover the apiserver URL is by executing `kubectl cluster-info`. @@ -432,222 +351,6 @@ communicate with the resource staging server over TLS. The trustStore can be set Below are some other common properties that are specific to Kubernetes. Most of the other configurations are the same from the other deployment modes. See the [configuration page](configuration.html) for more information on those. 
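As a quick illustration of the master URL forms discussed above, the sketch below shows them as plain Spark configuration values. It is only a sketch: `example.com` is a placeholder API server host, and since only cluster mode is supported here, the master value would normally be passed to `spark-submit` rather than set programmatically.

```scala
import org.apache.spark.SparkConf

// Sketch only: illustrates how the k8s:// master URL forms resolve.
// "example.com" is a placeholder API server host, not a real endpoint.
object K8sMasterUrlExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("k8s-master-url-example")
      // No scheme given, so https is assumed; equivalent to "k8s://https://example.com:443".
      .setMaster("k8s://example.com:443")
    // To connect without TLS on a different port, spell out the http scheme instead:
    // conf.setMaster("k8s://http://example.com:8443")
    println(conf.get("spark.master")) // prints k8s://example.com:443
  }
}
```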
-======= -connect without TLS on a different port, the master would be set to `k8s://http://example.com:8080`. - -In Kubernetes mode, the Spark application name that is specified by `spark.app.name` or the `--name` argument to -`spark-submit` is used by default to name the Kubernetes resources created like drivers and executors. So, application names -must consist of lower case alphanumeric characters, `-`, and `.` and must start and end with an alphanumeric character. - -If you have a Kubernetes cluster setup, one way to discover the apiserver URL is by executing `kubectl cluster-info`. - -```bash -kubectl cluster-info -Kubernetes master is running at http://127.0.0.1:6443 -``` - -In the above example, the specific Kubernetes cluster can be used with spark-submit by specifying -`--master k8s://http://127.0.0.1:6443` as an argument to spark-submit. Additionally, it is also possible to use the -authenticating proxy, `kubectl proxy` to communicate to the Kubernetes API. - -The local proxy can be started by: - -```bash -kubectl proxy -``` - -If the local proxy is running at localhost:8001, `--master k8s://http://127.0.0.1:8001` can be used as the argument to -spark-submit. Finally, notice that in the above example we specify a jar with a specific URI with a scheme of `local://`. -This URI is the location of the example jar that is already in the Docker image. - -## Dependency Management - -If your application's dependencies are all hosted in remote locations like HDFS or HTTP servers, they may be referred to -by their appropriate remote URIs. Also, application dependencies can be pre-mounted into custom-built Docker images. -Those dependencies can be added to the classpath by referencing them with `local://` URIs and/or setting the -`SPARK_EXTRA_CLASSPATH` environment variable in your Dockerfiles. - -### Using Remote Dependencies -When there are application dependencies hosted in remote locations like HDFS or HTTP servers, the driver and executor pods -need a Kubernetes [init-container](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/) for downloading -the dependencies so the driver and executor containers can use them locally. - -The init-container handles remote dependencies specified in `spark.jars` (or the `--jars` option of `spark-submit`) and -`spark.files` (or the `--files` option of `spark-submit`). It also handles remotely hosted main application resources, e.g., -the main application jar. The following shows an example of using remote dependencies with the `spark-submit` command: - -```bash -$ bin/spark-submit \ - --master k8s://https://: \ - --deploy-mode cluster \ - --name spark-pi \ - --class org.apache.spark.examples.SparkPi \ - --jars https://path/to/dependency1.jar,https://path/to/dependency2.jar - --files hdfs://host:port/path/to/file1,hdfs://host:port/path/to/file2 - --conf spark.executor.instances=5 \ - --conf spark.kubernetes.container.image= \ - https://path/to/examples.jar -``` - -## Secret Management -Kubernetes [Secrets](https://kubernetes.io/docs/concepts/configuration/secret/) can be used to provide credentials for a -Spark application to access secured services. To mount a user-specified secret into the driver container, users can use -the configuration property of the form `spark.kubernetes.driver.secrets.[SecretName]=`. Similarly, the -configuration property of the form `spark.kubernetes.executor.secrets.[SecretName]=` can be used to mount a -user-specified secret into the executor containers. 
Note that it is assumed that the secret to be mounted is in the same -namespace as that of the driver and executor pods. For example, to mount a secret named `spark-secret` onto the path -`/etc/secrets` in both the driver and executor containers, add the following options to the `spark-submit` command: - -``` ---conf spark.kubernetes.driver.secrets.spark-secret=/etc/secrets ---conf spark.kubernetes.executor.secrets.spark-secret=/etc/secrets -``` - -Note that if an init-container is used, any secret mounted into the driver container will also be mounted into the -init-container of the driver. Similarly, any secret mounted into an executor container will also be mounted into the -init-container of the executor. - -## Introspection and Debugging - -These are the different ways in which you can investigate a running/completed Spark application, monitor progress, and -take actions. - -### Accessing Logs - -Logs can be accessed using the Kubernetes API and the `kubectl` CLI. When a Spark application is running, it's possible -to stream logs from the application using: - -```bash -kubectl -n= logs -f -``` - -The same logs can also be accessed through the -[Kubernetes dashboard](https://kubernetes.io/docs/tasks/access-application-cluster/web-ui-dashboard/) if installed on -the cluster. - -### Accessing Driver UI - -The UI associated with any application can be accessed locally using -[`kubectl port-forward`](https://kubernetes.io/docs/tasks/access-application-cluster/port-forward-access-application-cluster/#forward-a-local-port-to-a-port-on-the-pod). - -```bash -kubectl port-forward 4040:4040 -``` - -Then, the Spark driver UI can be accessed on `http://localhost:4040`. - -### Debugging - -There may be several kinds of failures. If the Kubernetes API server rejects the request made from spark-submit, or the -connection is refused for a different reason, the submission logic should indicate the error encountered. However, if there -are errors during the running of the application, often, the best way to investigate may be through the Kubernetes CLI. - -To get some basic information about the scheduling decisions made around the driver pod, you can run: - -```bash -kubectl describe pod -``` - -If the pod has encountered a runtime error, the status can be probed further using: - -```bash -kubectl logs -``` - -Status and logs of failed executor pods can be checked in similar ways. Finally, deleting the driver pod will clean up the entire spark -application, including all executors, associated service, etc. The driver pod can be thought of as the Kubernetes representation of -the Spark application. - -## Kubernetes Features - -### Namespaces - -Kubernetes has the concept of [namespaces](https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/). -Namespaces are ways to divide cluster resources between multiple users (via resource quota). Spark on Kubernetes can -use namespaces to launch Spark applications. This can be made use of through the `spark.kubernetes.namespace` configuration. - -Kubernetes allows using [ResourceQuota](https://kubernetes.io/docs/concepts/policy/resource-quotas/) to set limits on -resources, number of objects, etc on individual namespaces. Namespaces and ResourceQuota can be used in combination by -administrator to control sharing and resource allocation in a Kubernetes cluster running Spark applications. 
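To make the namespace paragraph above concrete, here is a minimal, hypothetical sketch of the Spark-side half: pointing an application at a dedicated namespace via `spark.kubernetes.namespace`. The namespace name `spark-jobs` is an assumption, and any ResourceQuota on that namespace would be created separately by a cluster administrator with `kubectl`, as the text describes.

```scala
import org.apache.spark.SparkConf

// Sketch only: the namespace name "spark-jobs" is hypothetical.
// In cluster mode this property is normally supplied as a --conf flag to spark-submit;
// it is shown here as a plain SparkConf entry for clarity.
object K8sNamespaceExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("k8s-namespace-example")
      // Driver and executor pods are created in this namespace, so any ResourceQuota
      // defined on it bounds the application's total resource usage.
      .set("spark.kubernetes.namespace", "spark-jobs")
    println(conf.get("spark.kubernetes.namespace"))
  }
}
```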
- -### RBAC - -In Kubernetes clusters with [RBAC](https://kubernetes.io/docs/admin/authorization/rbac/) enabled, users can configure -Kubernetes RBAC roles and service accounts used by the various Spark on Kubernetes components to access the Kubernetes -API server. - -The Spark driver pod uses a Kubernetes service account to access the Kubernetes API server to create and watch executor -pods. The service account used by the driver pod must have the appropriate permission for the driver to be able to do -its work. Specifically, at minimum, the service account must be granted a -[`Role` or `ClusterRole`](https://kubernetes.io/docs/admin/authorization/rbac/#role-and-clusterrole) that allows driver -pods to create pods and services. By default, the driver pod is automatically assigned the `default` service account in -the namespace specified by `spark.kubernetes.namespace`, if no service account is specified when the pod gets created. - -Depending on the version and setup of Kubernetes deployed, this `default` service account may or may not have the role -that allows driver pods to create pods and services under the default Kubernetes -[RBAC](https://kubernetes.io/docs/admin/authorization/rbac/) policies. Sometimes users may need to specify a custom -service account that has the right role granted. Spark on Kubernetes supports specifying a custom service account to -be used by the driver pod through the configuration property -`spark.kubernetes.authenticate.driver.serviceAccountName=`. For example to make the driver pod -use the `spark` service account, a user simply adds the following option to the `spark-submit` command: - -``` ---conf spark.kubernetes.authenticate.driver.serviceAccountName=spark -``` - -To create a custom service account, a user can use the `kubectl create serviceaccount` command. For example, the -following command creates a service account named `spark`: - -```bash -kubectl create serviceaccount spark -``` - -To grant a service account a `Role` or `ClusterRole`, a `RoleBinding` or `ClusterRoleBinding` is needed. To create -a `RoleBinding` or `ClusterRoleBinding`, a user can use the `kubectl create rolebinding` (or `clusterrolebinding` -for `ClusterRoleBinding`) command. For example, the following command creates an `edit` `ClusterRole` in the `default` -namespace and grants it to the `spark` service account created above: - -```bash -kubectl create clusterrolebinding spark-role --clusterrole=edit --serviceaccount=default:spark --namespace=default -``` - -Note that a `Role` can only be used to grant access to resources (like pods) within a single namespace, whereas a -`ClusterRole` can be used to grant access to cluster-scoped resources (like nodes) as well as namespaced resources -(like pods) across all namespaces. For Spark on Kubernetes, since the driver always creates executor pods in the -same namespace, a `Role` is sufficient, although users may use a `ClusterRole` instead. For more information on -RBAC authorization and how to configure Kubernetes service accounts for pods, please refer to -[Using RBAC Authorization](https://kubernetes.io/docs/admin/authorization/rbac/) and -[Configure Service Accounts for Pods](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/). - -## Client Mode - -Client mode is not currently supported. 
- -## Future Work - -There are several Spark on Kubernetes features that are currently being incubated in a fork - -[apache-spark-on-k8s/spark](https://github.com/apache-spark-on-k8s/spark), which are expected to eventually make it into -future versions of the spark-kubernetes integration. - -Some of these include: - -* PySpark -* R -* Dynamic Executor Scaling -* Local File Dependency Management -* Spark Application Management -* Job Queues and Resource Management - -You can refer to the [documentation](https://apache-spark-on-k8s.github.io/userdocs/) if you want to try these features -and provide feedback to the development team. - -# Configuration - -See the [configuration page](configuration.html) for information on Spark configurations. The following configurations are -specific to Spark on Kubernetes. - -#### Spark Properties ->>>>>>> master
    spark.sql.parquet.compression.codec snappy - Sets the compression codec use when writing Parquet files. Acceptable values include: - uncompressed, snappy, gzip, lzo. + Sets the compression codec used when writing Parquet files. If either `compression` or + `parquet.compression` is specified in the table-specific options/properties, the precedence would be + `compression`, `parquet.compression`, `spark.sql.parquet.compression.codec`. Acceptable values include: + none, uncompressed, snappy, gzip, lzo.
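The precedence described in the new text (`compression`, then `parquet.compression`, then `spark.sql.parquet.compression.codec`) can be exercised with a small sketch like the one below. The DataFrame, the `/tmp/...` output paths, and the chosen codecs are illustrative only.

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: paths and codec choices are illustrative.
object ParquetCompressionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("parquet-compression-example")
      .master("local[*]")
      .getOrCreate()

    // Session-level default, the lowest-precedence setting.
    spark.conf.set("spark.sql.parquet.compression.codec", "snappy")

    val df = spark.range(100).toDF("id")

    // Written with the session default (snappy).
    df.write.mode("overwrite").parquet("/tmp/parquet-session-default")

    // The per-write "compression" option takes precedence over the session default.
    df.write.mode("overwrite")
      .option("compression", "gzip")
      .parquet("/tmp/parquet-gzip")

    spark.stop()
  }
}
```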
    diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala index 740f12e7d13d4..bf59152c8c0cd 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobPage.scala @@ -201,7 +201,7 @@ private[ui] class JobPage(parent: JobsTab, store: AppStatusStore) extends WebUIP val stages = jobData.stageIds.map { stageId => // This could be empty if the listener hasn't received information about the // stage or if the stage information has been garbage collected - store.stageData(stageId).lastOption.getOrElse { + store.asOption(store.lastStageAttempt(stageId)).getOrElse { new v1.StageData( v1.StageStatus.PENDING, stageId, diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala index 11a6a34344976..7c6e06cf183ba 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala @@ -19,6 +19,7 @@ package org.apache.spark.ui.jobs import java.net.URLEncoder import java.util.Date +import java.util.concurrent.TimeUnit import javax.servlet.http.HttpServletRequest import scala.collection.mutable.{HashMap, HashSet} @@ -29,15 +30,14 @@ import org.apache.commons.lang3.StringEscapeUtils import org.apache.spark.SparkConf import org.apache.spark.internal.config._ import org.apache.spark.scheduler.TaskLocality -import org.apache.spark.status.AppStatusStore +import org.apache.spark.status._ import org.apache.spark.status.api.v1._ import org.apache.spark.ui._ -import org.apache.spark.util.{Distribution, Utils} +import org.apache.spark.util.Utils /** Page showing statistics and task list for a given stage */ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends WebUIPage("stage") { import ApiHelper._ - import StagePage._ private val TIMELINE_LEGEND = {
    @@ -67,17 +67,17 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We // if we find that it's okay. private val MAX_TIMELINE_TASKS = parent.conf.getInt("spark.ui.timeline.tasks.maximum", 1000) - private def getLocalitySummaryString(stageData: StageData, taskList: Seq[TaskData]): String = { - val localities = taskList.map(_.taskLocality) - val localityCounts = localities.groupBy(identity).mapValues(_.size) + private def getLocalitySummaryString(localitySummary: Map[String, Long]): String = { val names = Map( TaskLocality.PROCESS_LOCAL.toString() -> "Process local", TaskLocality.NODE_LOCAL.toString() -> "Node local", TaskLocality.RACK_LOCAL.toString() -> "Rack local", TaskLocality.ANY.toString() -> "Any") - val localityNamesAndCounts = localityCounts.toSeq.map { case (locality, count) => - s"${names(locality)}: $count" - } + val localityNamesAndCounts = names.flatMap { case (key, name) => + localitySummary.get(key).map { count => + s"$name: $count" + } + }.toSeq localityNamesAndCounts.sorted.mkString("; ") } @@ -108,7 +108,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val stageHeader = s"Details for Stage $stageId (Attempt $stageAttemptId)" val stageData = parent.store - .asOption(parent.store.stageAttempt(stageId, stageAttemptId, details = true)) + .asOption(parent.store.stageAttempt(stageId, stageAttemptId, details = false)) .getOrElse { val content =
    @@ -117,8 +117,11 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We return UIUtils.headerSparkPage(stageHeader, content, parent) } - val tasks = stageData.tasks.getOrElse(Map.empty).values.toSeq - if (tasks.isEmpty) { + val localitySummary = store.localitySummary(stageData.stageId, stageData.attemptId) + + val totalTasks = stageData.numActiveTasks + stageData.numCompleteTasks + + stageData.numFailedTasks + stageData.numKilledTasks + if (totalTasks == 0) { val content =

    Summary Metrics

    No tasks have started yet @@ -127,18 +130,14 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We return UIUtils.headerSparkPage(stageHeader, content, parent) } + val storedTasks = store.taskCount(stageData.stageId, stageData.attemptId) val numCompleted = stageData.numCompleteTasks - val totalTasks = stageData.numActiveTasks + stageData.numCompleteTasks + - stageData.numFailedTasks + stageData.numKilledTasks - val totalTasksNumStr = if (totalTasks == tasks.size) { + val totalTasksNumStr = if (totalTasks == storedTasks) { s"$totalTasks" } else { - s"$totalTasks, showing ${tasks.size}" + s"$totalTasks, showing ${storedTasks}" } - val externalAccumulables = stageData.accumulatorUpdates - val hasAccumulators = externalAccumulables.size > 0 - val summary =
      @@ -148,7 +147,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We
    • Locality Level Summary: - {getLocalitySummaryString(stageData, tasks)} + {getLocalitySummaryString(localitySummary)}
    • {if (hasInput(stageData)) {
    • @@ -266,7 +265,7 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val accumulableTable = UIUtils.listingTable( accumulableHeaders, accumulableRow, - externalAccumulables.toSeq) + stageData.accumulatorUpdates.toSeq) val page: Int = { // If the user has changed to a larger page size, then go to page 1 in order to avoid @@ -280,16 +279,9 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We val currentTime = System.currentTimeMillis() val (taskTable, taskTableHTML) = try { val _taskTable = new TaskPagedTable( - parent.conf, + stageData, UIUtils.prependBaseUri(parent.basePath) + s"/stages/stage?id=${stageId}&attempt=${stageAttemptId}", - tasks, - hasAccumulators, - hasInput(stageData), - hasOutput(stageData), - hasShuffleRead(stageData), - hasShuffleWrite(stageData), - hasBytesSpilled(stageData), currentTime, pageSize = taskPageSize, sortColumn = taskSortColumn, @@ -320,217 +312,155 @@ private[ui] class StagePage(parent: StagesTab, store: AppStatusStore) extends We | } |}); """.stripMargin - } + } } - val taskIdsInPage = if (taskTable == null) Set.empty[Long] - else taskTable.dataSource.slicedTaskIds + val metricsSummary = store.taskSummary(stageData.stageId, stageData.attemptId, + Array(0, 0.25, 0.5, 0.75, 1.0)) - // Excludes tasks which failed and have incomplete metrics - val validTasks = tasks.filter(t => t.status == "SUCCESS" && t.taskMetrics.isDefined) - - val summaryTable: Option[Seq[Node]] = - if (validTasks.size == 0) { - None - } else { - def getDistributionQuantiles(data: Seq[Double]): IndexedSeq[Double] = { - Distribution(data).get.getQuantiles() - } - def getFormattedTimeQuantiles(times: Seq[Double]): Seq[Node] = { - getDistributionQuantiles(times).map { millis => -
    {UIUtils.formatDuration(millis.toLong)}{Utils.bytesToString(d.toLong)}{UIUtils.formatDuration(millis.toLong)} - - Task Deserialization Time - - Duration - GC Time - - - - Result Serialization Time - - - - Getting Result Time - - - - Peak Execution Memory - - {Utils.bytesToString(size.toLong)}Scheduler Delay{s"${Utils.bytesToString(d.toLong)} / ${recordDist.next().toLong}"}{s"${Utils.bytesToString(d.toLong)} / ${r.toLong}"}Input Size / Records + + {title} + + Output Size / Records{title} - - Shuffle Read Blocked Time - - - - Shuffle Read Size / Records - - - - Shuffle Remote Reads - - Shuffle Write Size / RecordsShuffle spill (memory)Shuffle spill (disk)
    {task.index} {task.taskId} {UIUtils.formatDate(new Date(task.launchTime))}{task.formatDuration}{UIUtils.formatDate(task.launchTime)}{formatDuration(task.duration)} - {UIUtils.formatDuration(task.schedulerDelay)} + {UIUtils.formatDuration(AppStatusUtils.schedulerDelay(task))} - {UIUtils.formatDuration(task.taskDeserializationTime)} + {formatDuration(task.taskMetrics.map(_.executorDeserializeTime))} - {if (task.gcTime > 0) UIUtils.formatDuration(task.gcTime) else ""} + {formatDuration(task.taskMetrics.map(_.jvmGcTime), hideZero = true)} - {UIUtils.formatDuration(task.serializationTime)} + {formatDuration(task.taskMetrics.map(_.resultSerializationTime))} - {UIUtils.formatDuration(task.gettingResultTime)} + {UIUtils.formatDuration(AppStatusUtils.gettingResultTime(task))} - {Utils.bytesToString(task.peakExecutionMemoryUsed)} + {formatBytes(task.taskMetrics.map(_.peakExecutionMemory))} {Unparsed(task.accumulators.get)}{task.input.get.inputReadable}{bytesRead} / {records}{task.output.get.outputReadable}{bytesWritten} / {records} - {task.shuffleRead.get.shuffleReadBlockedTimeReadable} + {formatDuration(task.taskMetrics.map(_.shuffleReadMetrics.fetchWaitTime))} {task.shuffleRead.get.shuffleReadReadable}{ + metricInfo(task) { m => + val bytesRead = Utils.bytesToString(totalBytesRead(m.shuffleReadMetrics)) + val records = m.shuffleReadMetrics.recordsRead + Unparsed(s"$bytesRead / $records") + } + } - {task.shuffleRead.get.shuffleReadRemoteReadable} + {formatBytes(task.taskMetrics.map(_.shuffleReadMetrics.remoteBytesRead))} {task.shuffleWrite.get.writeTimeReadable}{task.shuffleWrite.get.shuffleWriteReadable}{ + formatDuration( + task.taskMetrics.map { m => + TimeUnit.NANOSECONDS.toMillis(m.shuffleWriteMetrics.writeTime) + }, + hideZero = true) + }{ + metricInfo(task) { m => + val bytesWritten = Utils.bytesToString(m.shuffleWriteMetrics.bytesWritten) + val records = m.shuffleWriteMetrics.recordsWritten + Unparsed(s"$bytesWritten / $records") + } + }{task.bytesSpilled.get.memoryBytesSpilledReadable}{task.bytesSpilled.get.diskBytesSpilledReadable}{formatBytes(task.taskMetrics.map(_.memoryBytesSpilled))}{formatBytes(task.taskMetrics.map(_.diskBytesSpilled))}
    $peakExecutionMemory.0 b
    spark.kubernetes.driver.container.imagespark.kubernetes.container.image (none) - Container image to use for the driver. - This is usually of the form example.com/repo/spark-driver:v1.0.0. - This configuration is required and must be provided by the user. + Container image to use for the Spark application. + This is usually of the form example.com/repo/spark:v1.0.0. + This configuration is required and must be provided by the user, unless explicit + images are provided for each different container type. +
    spark.kubernetes.driver.container.image(value of spark.kubernetes.container.image) + Custom container image to use for the driver.
    spark.kubernetes.executor.container.image(none)(value of spark.kubernetes.container.image) - Container image to use for the executors. - This is usually of the form example.com/repo/spark-executor:v1.0.0. - This configuration is required and must be provided by the user. + Custom container image to use for executors.
    spark.kubernetes.initContainer.image(none)(value of spark.kubernetes.container.image) - Container image for the init-container of the driver and executors for downloading dependencies. This is usually of the form example.com/repo/spark-init:v1.0.0. This configuration is optional and must be provided by the user if any non-container local dependency is used and must be downloaded remotely. + Custom container image for the init container of both driver and executors.
    @@ -655,7 +358,6 @@ specific to Spark on Kubernetes. - - - - - - - - - - - - - - - - - - - - @@ -747,16 +415,10 @@ specific to Spark on Kubernetes. -<<<<<<< HEAD - - @@ -796,18 +458,6 @@ specific to Spark on Kubernetes. -<<<<<<< HEAD -======= - - - - - ->>>>>>> master -<<<<<<< HEAD - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1073,7 +627,6 @@ specific to Spark on Kubernetes. -<<<<<<< HEAD - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    Property Name | Default | Meaning
    spark.kubernetes.namespace default -<<<<<<< HEAD The namespace that will be used for running the driver and executor pods. When using spark-submit in cluster mode, this can also be passed to spark-submit via the --kubernetes-namespace command line argument. @@ -702,40 +404,6 @@ specific to Spark on Kubernetes. Labels that will be used to look up shuffle service pods. This should be a comma-separated list of label key-value pairs, where each label is in the format key=value. The labels chosen must be such that they match exactly one shuffle service pod on each node that executors are launched. -======= - The namespace that will be used for running the driver and executor pods. -
    spark.kubernetes.container.image(none) - Container image to use for the Spark application. - This is usually of the form example.com/repo/spark:v1.0.0. - This configuration is required and must be provided by the user, unless explicit - images are provided for each different container type. -
    spark.kubernetes.driver.container.image(value of spark.kubernetes.container.image) - Custom container image to use for the driver. -
    spark.kubernetes.executor.container.image(value of spark.kubernetes.container.image) - Custom container image to use for executors. -
    spark.kubernetes.container.image.pullPolicyIfNotPresent - Container image pull policy used when pulling images within Kubernetes. ->>>>>>> master
    spark.kubernetes.allocation.batch.delay1 - Number of seconds to wait between each round of executor pod allocation. -======= 1s Time to wait between each round of executor pod allocation. Specifying values less than 1 second may lead to excessive CPU usage on the spark driver. ->>>>>>> master
    spark.kubernetes.authenticate.submission.oauthTokenFile(none) - Path to the OAuth token file containing the token to use when authenticating against the Kubernetes API server when starting the driver. - This file must be located on the submitting machine's disk. Specify this as a path as opposed to a URI (i.e. do not - provide a scheme). -
    spark.kubernetes.authenticate.driver.caCertFile (none) @@ -840,11 +490,7 @@ specific to Spark on Kubernetes. spark.kubernetes.authenticate.driver.oauthToken (none) -<<<<<<< HEAD - OAuth token to use when authenticating against the against the Kubernetes API server from the driver pod when -======= OAuth token to use when authenticating against the Kubernetes API server from the driver pod when ->>>>>>> master requesting executors. Note that unlike the other authentication options, this must be the exact string value of the token to use for the authentication. This token value is uploaded to the driver pod. If this is specified, it is highly recommended to set up TLS for the driver submission server, as this value is sensitive information that would @@ -852,7 +498,6 @@ specific to Spark on Kubernetes.
    spark.kubernetes.authenticate.driver.serviceAccountName default @@ -971,97 +616,6 @@ specific to Spark on Kubernetes. value. For example, setting spark.kubernetes.executor.annotation.identifier to myIdentifier will result in the executor pods having an annotation with key identifier and value myIdentifier. Multiple annotations can be added by setting multiple configurations with this prefix. -======= - spark.kubernetes.authenticate.driver.oauthTokenFile(none) - Path to the OAuth token file containing the token to use when authenticating against the Kubernetes API server from the driver pod when - requesting executors. Note that unlike the other authentication options, this file must contain the exact string value of - the token to use for the authentication. This token value is uploaded to the driver pod. If this is specified, it is - highly recommended to set up TLS for the driver submission server, as this value is sensitive information that would - be passed to the driver pod in plaintext otherwise. -
    spark.kubernetes.authenticate.driver.mounted.caCertFile(none) - Path to the CA cert file for connecting to the Kubernetes API server over TLS from the driver pod when requesting - executors. This path must be accessible from the driver pod. - Specify this as a path as opposed to a URI (i.e. do not provide a scheme). -
    spark.kubernetes.authenticate.driver.mounted.clientKeyFile(none) - Path to the client key file for authenticating against the Kubernetes API server from the driver pod when requesting - executors. This path must be accessible from the driver pod. - Specify this as a path as opposed to a URI (i.e. do not provide a scheme). -
    spark.kubernetes.authenticate.driver.mounted.clientCertFile(none) - Path to the client cert file for authenticating against the Kubernetes API server from the driver pod when - requesting executors. This path must be accessible from the driver pod. - Specify this as a path as opposed to a URI (i.e. do not provide a scheme). -
    spark.kubernetes.authenticate.driver.mounted.oauthTokenFile(none) - Path to the file containing the OAuth token to use when authenticating against the Kubernetes API server from the driver pod when - requesting executors. This path must be accessible from the driver pod. - Note that unlike the other authentication options, this file must contain the exact string value of the token to use for the authentication. -
    spark.kubernetes.authenticate.driver.serviceAccountNamedefault - Service account that is used when running the driver pod. The driver pod uses this service account when requesting - executor pods from the API server. Note that this cannot be specified alongside a CA cert file, client key file, - client cert file, and/or OAuth token. -
    spark.kubernetes.driver.label.[LabelName](none) - Add the label specified by LabelName to the driver pod. - For example, spark.kubernetes.driver.label.something=true. - Note that Spark also adds its own labels to the driver pod - for bookkeeping purposes. -
    spark.kubernetes.driver.annotation.[AnnotationName](none) - Add the annotation specified by AnnotationName to the driver pod. - For example, spark.kubernetes.driver.annotation.something=true. -
    spark.kubernetes.executor.label.[LabelName](none) - Add the label specified by LabelName to the executor pods. - For example, spark.kubernetes.executor.label.something=true. - Note that Spark also adds its own labels to the driver pod - for bookkeeping purposes. -
    spark.kubernetes.executor.annotation.[AnnotationName](none) - Add the annotation specified by AnnotationName to the executor pods. - For example, spark.kubernetes.executor.annotation.something=true. ->>>>>>> master
    spark.kubernetes.submission.waitAppCompletion true @@ -1215,146 +768,24 @@ specific to Spark on Kubernetes. Add the environment variable specified by EnvironmentVariableName to the Driver process. The user can specify multiple of these to set multiple environment variables. -======= - spark.kubernetes.executor.podNamePrefix(none) - Prefix for naming the executor pods. - If not set, the executor pod name is set to driver pod name suffixed by an integer. -
    spark.kubernetes.executor.lostCheck.maxAttempts10 - Number of times that the driver will try to ascertain the loss reason for a specific executor. - The loss reason is used to ascertain whether the executor failure is due to a framework or an application error - which in turn decides whether the executor is removed and replaced, or placed into a failed state for debugging. -
    spark.kubernetes.submission.waitAppCompletiontrue - In cluster mode, whether to wait for the application to finish before exiting the launcher process. When changed to - false, the launcher has a "fire-and-forget" behavior when launching the Spark job. -
    spark.kubernetes.report.interval1s - Interval between reports of the current Spark job status in cluster mode. -
    spark.kubernetes.driver.limit.cores(none) - Specify the hard CPU [limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) for the driver pod. -
    spark.kubernetes.executor.limit.cores(none) - Specify the hard CPU [limit](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#resource-requests-and-limits-of-pod-and-container) for each executor pod launched for the Spark Application. -
    spark.kubernetes.node.selector.[labelKey](none) - Adds to the node selector of the driver pod and executor pods, with key labelKey and the value as the - configuration's value. For example, setting spark.kubernetes.node.selector.identifier to myIdentifier - will result in the driver pod and executors having a node selector with key identifier and value - myIdentifier. Multiple node selector keys can be added by setting multiple configurations with this prefix. -
    spark.kubernetes.driverEnv.[EnvironmentVariableName](none) - Add the environment variable specified by EnvironmentVariableName to - the Driver process. The user can specify multiple of these to set multiple environment variables. -
    spark.kubernetes.mountDependencies.jarsDownloadDir/var/spark-data/spark-jars - Location to download jars to in the driver and executors. - This directory must be empty and will be mounted as an empty directory volume on the driver and executor pods. -
    spark.kubernetes.mountDependencies.filesDownloadDir/var/spark-data/spark-files - Location to download jars to in the driver and executors. - This directory must be empty and will be mounted as an empty directory volume on the driver and executor pods. -
    spark.kubernetes.mountDependencies.timeout300s - Timeout in seconds before aborting the attempt to download and unpack dependencies from remote locations into - the driver and executor pods. -
    spark.kubernetes.mountDependencies.maxSimultaneousDownloads5 - Maximum number of remote dependencies to download simultaneously in a driver or executor pod. -
    spark.kubernetes.initContainer.image(value of spark.kubernetes.container.image) - Custom container image for the init container of both driver and executors. ->>>>>>> master
    spark.kubernetes.driver.secrets.[SecretName] (none) -<<<<<<< HEAD - Mounts the Kubernetes secret named SecretName onto the path specified by the value - in the driver Pod. The user can specify multiple instances of this for multiple secrets. -======= Add the Kubernetes Secret named SecretName to the driver pod on the path specified in the value. For example, spark.kubernetes.driver.secrets.spark-secret=/etc/secrets. Note that if an init-container is used, the secret will also be added to the init-container in the driver pod. ->>>>>>> master
    spark.kubernetes.executor.secrets.[SecretName] (none) -<<<<<<< HEAD - Mounts the Kubernetes secret named SecretName onto the path specified by the value - in the executor Pods. The user can specify multiple instances of this for multiple secrets. -
    - - -## Current Limitations - -Running Spark on Kubernetes is currently an experimental feature. Some restrictions on the current implementation that -should be lifted in the future include: -* Applications can only run in cluster mode. -* Only Scala and Java applications can be run. -======= Add the Kubernetes Secret named SecretName to the executor pod on the path specified in the value. For example, spark.kubernetes.executor.secrets.spark-secret=/etc/secrets. Note that if an init-container is used, the secret will also be added to the init-container in the executor pod. ->>>>>>> master diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index a6abe8796253c..d66908f86ccc7 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -971,26 +971,8 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared } } -<<<<<<< HEAD - // flakes in palantir/spark - ignore("stress test for failOnDataLoss=false") { - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", "failOnDataLoss.*") - .option("startingOffsets", "earliest") - .option("failOnDataLoss", "false") - .option("fetchOffset.retryIntervalMs", "3000") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { -======= protected def startStream(ds: Dataset[Int]) = { ds.writeStream.foreach(new ForeachWriter[Int] { ->>>>>>> master override def open(partitionId: Long, version: Long): Boolean = { true diff --git a/pom.xml b/pom.xml index 336b5316c9384..8f452abdfb4b8 100644 --- a/pom.xml +++ b/pom.xml @@ -132,7 +132,7 @@ 10.12.1.1 1.9.1-palantir3 8.18.0 - 3.9.0 + 3.9.1 2.7.5 1.13.0 2.3.0 @@ -158,13 +158,8 @@ 1.9.3 1.2 -<<<<<<< HEAD - 4.5.3 - 4.4.6 -======= 4.5.4 4.4.8 ->>>>>>> master 3.1 3.4.1 @@ -187,17 +182,11 @@ 1.8.1 1.6 3.2.10 -<<<<<<< HEAD 1.1.1 2.5.1 - 3.0.0 + 3.0.8 2.25.1 2.9.9 -======= - 3.0.8 - 2.22.2 - 2.9.3 ->>>>>>> master 3.5.2 3.0.1 0.9.3 @@ -720,11 +709,7 @@ io.netty netty-all -<<<<<<< HEAD - 4.0.50.Final -======= 4.1.17.Final ->>>>>>> master io.netty @@ -3048,7 +3033,6 @@ -<<<<<<< HEAD kubernetes-integration-tests resource-managers/kubernetes/docker-minimal-bundle @@ -3059,8 +3043,6 @@ -======= ->>>>>>> master hive-thriftserver sql/hive-thriftserver diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 256d54e7d74e9..e70c4cdf29dd1 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -57,14 +57,8 @@ object BuildCommons { val optionallyEnabledProjects@Seq(kubernetes, mesos, yarn, streamingFlumeSink, streamingFlume, streamingKafka, sparkGangliaLgpl, streamingKinesisAsl, -<<<<<<< HEAD - dockerIntegrationTests, hadoopCloud, - kubernetes, _*) = - Seq("mesos", "yarn", -======= dockerIntegrationTests, hadoopCloud) = Seq("kubernetes", "mesos", "yarn", ->>>>>>> master "streaming-flume-sink", "streaming-flume", "streaming-kafka-0-8", "ganglia-lgpl", "streaming-kinesis-asl", "docker-integration-tests", "hadoop-cloud", diff --git a/resource-managers/kubernetes/core/pom.xml b/resource-managers/kubernetes/core/pom.xml 
index 8f84cb18c38ff..2240d0e84eb5c 100644 --- a/resource-managers/kubernetes/core/pom.xml +++ b/resource-managers/kubernetes/core/pom.xml @@ -20,11 +20,7 @@ org.apache.spark spark-parent_2.11 -<<<<<<< HEAD - 2.3.0-SNAPSHOT -======= 2.4.0-SNAPSHOT ->>>>>>> master ../../../pom.xml @@ -33,11 +29,7 @@ Spark Project Kubernetes kubernetes -<<<<<<< HEAD - 2.2.13 -======= 3.0.0 ->>>>>>> master @@ -62,7 +54,6 @@ com.fasterxml.jackson.core -<<<<<<< HEAD jackson-core @@ -72,9 +63,6 @@ com.fasterxml.jackson.core jackson-annotations -======= - * ->>>>>>> master com.fasterxml.jackson.dataformat @@ -82,17 +70,12 @@ -<<<<<<< HEAD -======= - ->>>>>>> master com.fasterxml.jackson.dataformat jackson-dataformat-yaml ${fasterxml.jackson.version} -<<<<<<< HEAD org.glassfish.jersey.containers jersey-container-servlet @@ -122,9 +105,6 @@ javax.ws.rs javax.ws.rs-api -======= - ->>>>>>> master com.google.guava @@ -133,7 +113,6 @@ -<<<<<<< HEAD org.bouncycastle bcpkix-jdk15on @@ -142,22 +121,11 @@ bcprov-jdk15on -======= ->>>>>>> master org.mockito mockito-core test -<<<<<<< HEAD -======= - - com.squareup.okhttp3 - okhttp - 3.8.1 - - ->>>>>>> master diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/PodWithDetachedInitContainer.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/PodWithDetachedInitContainer.scala index 544c069186726..cdec3a155a597 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/PodWithDetachedInitContainer.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/PodWithDetachedInitContainer.scala @@ -18,16 +18,6 @@ package org.apache.spark.deploy.k8s import io.fabric8.kubernetes.api.model.{Container, Pod} -<<<<<<< HEAD -======= -/** - * Represents a pod with a detached init-container (not yet added to the pod). 
- * - * @param pod the pod - * @param initContainer the init-container in the pod - * @param mainContainer the main container in the pod - */ ->>>>>>> master private[spark] case class PodWithDetachedInitContainer( pod: Pod, initContainer: Container, diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala index 407a6f88259c1..c90303b6ba506 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkKubernetesClientFactory.scala @@ -20,20 +20,12 @@ import java.io.File import com.google.common.base.Charsets import com.google.common.io.Files -<<<<<<< HEAD import io.fabric8.kubernetes.client.{Config, ConfigBuilder, DefaultKubernetesClient, KubernetesClient} -======= -import io.fabric8.kubernetes.client.{ConfigBuilder, DefaultKubernetesClient, KubernetesClient} ->>>>>>> master import io.fabric8.kubernetes.client.utils.HttpClientUtils import okhttp3.Dispatcher import org.apache.spark.SparkConf -<<<<<<< HEAD import org.apache.spark.deploy.k8s.config._ -======= -import org.apache.spark.deploy.k8s.Config._ ->>>>>>> master import org.apache.spark.util.ThreadUtils /** @@ -48,18 +40,12 @@ private[spark] object SparkKubernetesClientFactory { namespace: Option[String], kubernetesAuthConfPrefix: String, sparkConf: SparkConf, -<<<<<<< HEAD maybeServiceAccountToken: Option[File], maybeServiceAccountCaCert: Option[File]): KubernetesClient = { -======= - defaultServiceAccountToken: Option[File], - defaultServiceAccountCaCert: Option[File]): KubernetesClient = { ->>>>>>> master val oauthTokenFileConf = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_FILE_CONF_SUFFIX" val oauthTokenConf = s"$kubernetesAuthConfPrefix.$OAUTH_TOKEN_CONF_SUFFIX" val oauthTokenFile = sparkConf.getOption(oauthTokenFileConf) .map(new File(_)) -<<<<<<< HEAD .orElse(maybeServiceAccountToken) val oauthTokenValue = sparkConf.getOption(oauthTokenConf) OptionRequirements.requireNandDefined( @@ -95,43 +81,6 @@ private[spark] object SparkKubernetesClientFactory { }.withOption(namespace) { (ns, configBuilder) => configBuilder.withNamespace(ns) }.build() -======= - .orElse(defaultServiceAccountToken) - val oauthTokenValue = sparkConf.getOption(oauthTokenConf) - KubernetesUtils.requireNandDefined( - oauthTokenFile, - oauthTokenValue, - s"Cannot specify OAuth token through both a file $oauthTokenFileConf and a " + - s"value $oauthTokenConf.") - - val caCertFile = sparkConf - .getOption(s"$kubernetesAuthConfPrefix.$CA_CERT_FILE_CONF_SUFFIX") - .orElse(defaultServiceAccountCaCert.map(_.getAbsolutePath)) - val clientKeyFile = sparkConf - .getOption(s"$kubernetesAuthConfPrefix.$CLIENT_KEY_FILE_CONF_SUFFIX") - val clientCertFile = sparkConf - .getOption(s"$kubernetesAuthConfPrefix.$CLIENT_CERT_FILE_CONF_SUFFIX") - val dispatcher = new Dispatcher( - ThreadUtils.newDaemonCachedThreadPool("kubernetes-dispatcher")) - val config = new ConfigBuilder() - .withApiVersion("v1") - .withMasterUrl(master) - .withWebsocketPingInterval(0) - .withOption(oauthTokenValue) { - (token, configBuilder) => configBuilder.withOauthToken(token) - }.withOption(oauthTokenFile) { - (file, configBuilder) => - configBuilder.withOauthToken(Files.toString(file, Charsets.UTF_8)) - }.withOption(caCertFile) { - (file, configBuilder) => 
configBuilder.withCaCertFile(file) - }.withOption(clientKeyFile) { - (file, configBuilder) => configBuilder.withClientKeyFile(file) - }.withOption(clientCertFile) { - (file, configBuilder) => configBuilder.withClientCertFile(file) - }.withOption(namespace) { - (ns, configBuilder) => configBuilder.withNamespace(ns) - }.build() ->>>>>>> master val baseHttpClient = HttpClientUtils.createHttpClient(config) val httpClientWithCustomDispatcher = baseHttpClient.newBuilder() .dispatcher(dispatcher) @@ -139,7 +88,6 @@ private[spark] object SparkKubernetesClientFactory { new DefaultKubernetesClient(httpClientWithCustomDispatcher, config) } -<<<<<<< HEAD private implicit class OptionConfigurableConfigBuilder(configBuilder: ConfigBuilder) { def withOption[T] @@ -151,17 +99,5 @@ private[spark] object SparkKubernetesClientFactory { } def build(): Config = configBuilder.build() -======= - private implicit class OptionConfigurableConfigBuilder(val configBuilder: ConfigBuilder) - extends AnyVal { - - def withOption[T] - (option: Option[T]) - (configurator: ((T, ConfigBuilder) => ConfigBuilder)): ConfigBuilder = { - option.map { opt => - configurator(opt, configBuilder) - }.getOrElse(configBuilder) - } ->>>>>>> master } } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala index b4ed3a6dd1df4..c645b008d736d 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala @@ -18,18 +18,10 @@ package org.apache.spark.deploy.k8s.submit import java.util.concurrent.{CountDownLatch, TimeUnit} -<<<<<<< HEAD import io.fabric8.kubernetes.api.model.{ContainerStateRunning, ContainerStateTerminated, ContainerStateWaiting, ContainerStatus, Pod} import io.fabric8.kubernetes.client.{KubernetesClientException, Watcher} import io.fabric8.kubernetes.client.Watcher.Action import scala.collection.JavaConverters._ -======= -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model.{ContainerStateRunning, ContainerStateTerminated, ContainerStateWaiting, ContainerStatus, Pod, Time} -import io.fabric8.kubernetes.client.{KubernetesClientException, Watcher} -import io.fabric8.kubernetes.client.Watcher.Action ->>>>>>> master import org.apache.spark.SparkException import org.apache.spark.internal.Logging @@ -43,23 +35,13 @@ private[k8s] trait LoggingPodStatusWatcher extends Watcher[Pod] { * A monitor for the running Kubernetes pod of a Spark application. Status logging occurs on * every state change and also at an interval for liveness. * -<<<<<<< HEAD * @param appId -======= - * @param appId application ID. ->>>>>>> master * @param maybeLoggingInterval ms between each state request. If provided, must be a positive * number. 
*/ private[k8s] class LoggingPodStatusWatcherImpl( -<<<<<<< HEAD appId: String, maybeLoggingInterval: Option[Long]) extends LoggingPodStatusWatcher with Logging { -======= - appId: String, - maybeLoggingInterval: Option[Long]) - extends LoggingPodStatusWatcher with Logging { ->>>>>>> master private val podCompletedFuture = new CountDownLatch(1) // start timer for periodic logging @@ -75,10 +57,7 @@ private[k8s] class LoggingPodStatusWatcherImpl( def start(): Unit = { maybeLoggingInterval.foreach { interval => -<<<<<<< HEAD require(interval > 0, s"Logging interval must be a positive time value, got: $interval ms.") -======= ->>>>>>> master scheduler.scheduleAtFixedRate(logRunnable, 0, interval, TimeUnit.MILLISECONDS) } } @@ -86,14 +65,10 @@ private[k8s] class LoggingPodStatusWatcherImpl( override def eventReceived(action: Action, pod: Pod): Unit = { this.pod = Option(pod) action match { -<<<<<<< HEAD case Action.DELETED => closeWatch() case Action.ERROR => -======= - case Action.DELETED | Action.ERROR => ->>>>>>> master closeWatch() case _ => @@ -114,11 +89,7 @@ private[k8s] class LoggingPodStatusWatcherImpl( } private def logLongStatus() = { -<<<<<<< HEAD logInfo("State changed, new state: " + pod.map(formatPodState(_)).getOrElse("unknown")) -======= - logInfo("State changed, new state: " + pod.map(formatPodState).getOrElse("unknown")) ->>>>>>> master } private def hasCompleted(): Boolean = { @@ -131,7 +102,6 @@ private[k8s] class LoggingPodStatusWatcherImpl( } private def formatPodState(pod: Pod): String = { -<<<<<<< HEAD // TODO include specific container state val details = Seq[(String, String)]( // pod metadata @@ -156,32 +126,6 @@ private[k8s] class LoggingPodStatusWatcherImpl( ("phase", pod.getStatus.getPhase()), ("status", pod.getStatus.getContainerStatuses().toString) ) -======= - val details = Seq[(String, String)]( - // pod metadata - ("pod name", pod.getMetadata.getName), - ("namespace", pod.getMetadata.getNamespace), - ("labels", pod.getMetadata.getLabels.asScala.mkString(", ")), - ("pod uid", pod.getMetadata.getUid), - ("creation time", formatTime(pod.getMetadata.getCreationTimestamp)), - - // spec details - ("service account name", pod.getSpec.getServiceAccountName), - ("volumes", pod.getSpec.getVolumes.asScala.map(_.getName).mkString(", ")), - ("node name", pod.getSpec.getNodeName), - - // status - ("start time", formatTime(pod.getStatus.getStartTime)), - ("container images", - pod.getStatus.getContainerStatuses - .asScala - .map(_.getImage) - .mkString(", ")), - ("phase", pod.getStatus.getPhase), - ("status", pod.getStatus.getContainerStatuses.toString) - ) - ->>>>>>> master formatPairsBundle(details) } @@ -212,7 +156,6 @@ private[k8s] class LoggingPodStatusWatcherImpl( containerStatus: ContainerStatus): Seq[(String, String)] = { val state = containerStatus.getState Option(state.getRunning) -<<<<<<< HEAD .orElse(Option(state.getTerminated)) .orElse(Option(state.getWaiting)) .map { @@ -232,29 +175,4 @@ private[k8s] class LoggingPodStatusWatcherImpl( throw new SparkException(s"Unexpected container status type ${unknown.getClass}.") }.getOrElse(Seq(("Container state", "N/A"))) } -======= - .orElse(Option(state.getTerminated)) - .orElse(Option(state.getWaiting)) - .map { - case running: ContainerStateRunning => - Seq( - ("Container state", "Running"), - ("Container started at", formatTime(running.getStartedAt))) - case waiting: ContainerStateWaiting => - Seq( - ("Container state", "Waiting"), - ("Pending reason", waiting.getReason)) - case terminated: ContainerStateTerminated 
=> - Seq( - ("Container state", "Terminated"), - ("Exit code", terminated.getExitCode.toString)) - case unknown => - throw new SparkException(s"Unexpected container status type ${unknown.getClass}.") - }.getOrElse(Seq(("Container state", "N/A"))) - } - - private def formatTime(time: Time): String = { - if (time != null) time.getTime else "N/A" - } ->>>>>>> master } diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala index ab071f61f87f7..f806e65974fcc 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/MainAppResource.scala @@ -18,9 +18,6 @@ package org.apache.spark.deploy.k8s.submit private[spark] sealed trait MainAppResource -<<<<<<< HEAD private[spark] case class PythonMainAppResource(primaryPyFile: String) extends MainAppResource -======= ->>>>>>> master private[spark] case class JavaMainAppResource(primaryResource: String) extends MainAppResource diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala index ee58b7c038239..98a0d879b6a58 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactory.scala @@ -16,7 +16,6 @@ */ package org.apache.spark.scheduler.cluster.k8s -<<<<<<< HEAD import io.fabric8.kubernetes.api.model.{ContainerBuilder, ContainerPortBuilder, EnvVar, EnvVarBuilder, EnvVarSourceBuilder, Pod, PodBuilder, QuantityBuilder, VolumeBuilder, VolumeMountBuilder} import scala.collection.JavaConverters._ @@ -104,92 +103,6 @@ private[spark] class ExecutorPodFactoryImpl( private val executorLimitCores = sparkConf.getOption(KUBERNETES_EXECUTOR_LIMIT_CORES.key) override def createExecutorPod( -======= -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model._ - -import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.deploy.k8s.{InitContainerBootstrap, KubernetesUtils, MountSecretsBootstrap, PodWithDetachedInitContainer} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.internal.config.{EXECUTOR_CLASS_PATH, EXECUTOR_JAVA_OPTIONS, EXECUTOR_MEMORY, EXECUTOR_MEMORY_OVERHEAD} -import org.apache.spark.util.Utils - -/** - * A factory class for bootstrapping and creating executor pods with the given bootstrapping - * components. 
- * - * @param sparkConf Spark configuration - * @param mountSecretsBootstrap an optional component for mounting user-specified secrets onto - * user-specified paths into the executor container - * @param initContainerBootstrap an optional component for bootstrapping the executor init-container - * if one is needed, i.e., when there are remote dependencies to - * localize - * @param initContainerMountSecretsBootstrap an optional component for mounting user-specified - * secrets onto user-specified paths into the executor - * init-container - */ -private[spark] class ExecutorPodFactory( - sparkConf: SparkConf, - mountSecretsBootstrap: Option[MountSecretsBootstrap], - initContainerBootstrap: Option[InitContainerBootstrap], - initContainerMountSecretsBootstrap: Option[MountSecretsBootstrap]) { - - private val executorExtraClasspath = sparkConf.get(EXECUTOR_CLASS_PATH) - - private val executorLabels = KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, - KUBERNETES_EXECUTOR_LABEL_PREFIX) - require( - !executorLabels.contains(SPARK_APP_ID_LABEL), - s"Custom executor labels cannot contain $SPARK_APP_ID_LABEL as it is reserved for Spark.") - require( - !executorLabels.contains(SPARK_EXECUTOR_ID_LABEL), - s"Custom executor labels cannot contain $SPARK_EXECUTOR_ID_LABEL as it is reserved for" + - " Spark.") - require( - !executorLabels.contains(SPARK_ROLE_LABEL), - s"Custom executor labels cannot contain $SPARK_ROLE_LABEL as it is reserved for Spark.") - - private val executorAnnotations = - KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, - KUBERNETES_EXECUTOR_ANNOTATION_PREFIX) - private val nodeSelector = - KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, - KUBERNETES_NODE_SELECTOR_PREFIX) - - private val executorContainerImage = sparkConf - .get(EXECUTOR_CONTAINER_IMAGE) - .getOrElse(throw new SparkException("Must specify the executor container image")) - private val imagePullPolicy = sparkConf.get(CONTAINER_IMAGE_PULL_POLICY) - private val blockManagerPort = sparkConf - .getInt("spark.blockmanager.port", DEFAULT_BLOCKMANAGER_PORT) - - private val executorPodNamePrefix = sparkConf.get(KUBERNETES_EXECUTOR_POD_NAME_PREFIX) - - private val executorMemoryMiB = sparkConf.get(EXECUTOR_MEMORY) - private val executorMemoryString = sparkConf.get( - EXECUTOR_MEMORY.key, EXECUTOR_MEMORY.defaultValueString) - - private val memoryOverheadMiB = sparkConf - .get(EXECUTOR_MEMORY_OVERHEAD) - .getOrElse(math.max((MEMORY_OVERHEAD_FACTOR * executorMemoryMiB).toInt, - MEMORY_OVERHEAD_MIN_MIB)) - private val executorMemoryWithOverhead = executorMemoryMiB + memoryOverheadMiB - - private val executorCores = sparkConf.getDouble("spark.executor.cores", 1) - private val executorLimitCores = sparkConf.get(KUBERNETES_EXECUTOR_LIMIT_CORES) - - private val executorJarsDownloadDir = sparkConf.get(JARS_DOWNLOAD_LOCATION) - - /** - * Configure and construct an executor pod with the given parameters. - */ - def createExecutorPod( ->>>>>>> master executorId: String, applicationId: String, driverUrl: String, @@ -200,7 +113,6 @@ private[spark] class ExecutorPodFactory( // hostname must be no longer than 63 characters, so take the last 63 characters of the pod // name as the hostname. 
This preserves uniqueness since the end of name contains -<<<<<<< HEAD // executorId and applicationId val hostname = name.substring(Math.max(0, name.length - 63)) val resolvedExecutorLabels = Map( @@ -208,15 +120,6 @@ private[spark] class ExecutorPodFactory( SPARK_APP_ID_LABEL -> applicationId, SPARK_ROLE_LABEL -> SPARK_POD_EXECUTOR_ROLE) ++ executorLabels -======= - // executorId - val hostname = name.substring(Math.max(0, name.length - 63)) - val resolvedExecutorLabels = Map( - SPARK_EXECUTOR_ID_LABEL -> executorId, - SPARK_APP_ID_LABEL -> applicationId, - SPARK_ROLE_LABEL -> SPARK_POD_EXECUTOR_ROLE) ++ - executorLabels ->>>>>>> master val executorMemoryQuantity = new QuantityBuilder(false) .withAmount(s"${executorMemoryMiB}Mi") .build() @@ -228,20 +131,12 @@ private[spark] class ExecutorPodFactory( .build() val executorExtraClasspathEnv = executorExtraClasspath.map { cp => new EnvVarBuilder() -<<<<<<< HEAD .withName(ENV_EXECUTOR_EXTRA_CLASSPATH) -======= - .withName(ENV_CLASSPATH) ->>>>>>> master .withValue(cp) .build() } val executorExtraJavaOptionsEnv = sparkConf -<<<<<<< HEAD .get(org.apache.spark.internal.config.EXECUTOR_JAVA_OPTIONS) -======= - .get(EXECUTOR_JAVA_OPTIONS) ->>>>>>> master .map { opts => val delimitedOpts = Utils.splitCommandString(opts) delimitedOpts.zipWithIndex.map { @@ -250,10 +145,7 @@ private[spark] class ExecutorPodFactory( } }.getOrElse(Seq.empty[EnvVar]) val executorEnv = (Seq( -<<<<<<< HEAD (ENV_EXECUTOR_PORT, executorPort.toString), -======= ->>>>>>> master (ENV_DRIVER_URL, driverUrl), // Executor backend expects integral value for executor cores, so round it up to an int. (ENV_EXECUTOR_CORES, math.ceil(executorCores).toInt.toString), @@ -274,7 +166,6 @@ private[spark] class ExecutorPodFactory( .build() ) ++ executorExtraJavaOptionsEnv ++ executorExtraClasspathEnv.toSeq val requiredPorts = Seq( -<<<<<<< HEAD (EXECUTOR_PORT_NAME, executorPort), (BLOCK_MANAGER_PORT_NAME, blockmanagerPort)) .map(port => { @@ -290,20 +181,6 @@ private[spark] class ExecutorPodFactory( .withName(s"executor") .withImage(executorDockerImage) .withImagePullPolicy(dockerImagePullPolicy) -======= - (BLOCK_MANAGER_PORT_NAME, blockManagerPort)) - .map { case (name, port) => - new ContainerPortBuilder() - .withName(name) - .withContainerPort(port) - .build() - } - - val executorContainer = new ContainerBuilder() - .withName("executor") - .withImage(executorContainerImage) - .withImagePullPolicy(imagePullPolicy) ->>>>>>> master .withNewResources() .addToRequests("memory", executorMemoryQuantity) .addToLimits("memory", executorMemoryLimitQuantity) @@ -311,11 +188,7 @@ private[spark] class ExecutorPodFactory( .endResources() .addAllToEnv(executorEnv.asJava) .withPorts(requiredPorts.asJava) -<<<<<<< HEAD .addAllToVolumeMounts(shuffleVolumesWithMounts.map(_._2).asJava) -======= - .addToArgs("executor") ->>>>>>> master .build() val executorPod = new PodBuilder() @@ -336,7 +209,6 @@ private[spark] class ExecutorPodFactory( .withHostname(hostname) .withRestartPolicy("Never") .withNodeSelector(nodeSelector.asJava) -<<<<<<< HEAD .addAllToVolumes(shuffleVolumesWithMounts.map(_._1).asJava) .endSpec() .build() @@ -392,59 +264,11 @@ private[spark] class ExecutorPodFactory( new PodBuilder(executorPodWithNodeAffinity) .editSpec() .addToContainers(initBootstrappedExecutorContainer) -======= - .endSpec() - .build() - - val containerWithLimitCores = executorLimitCores.map { limitCores => - val executorCpuLimitQuantity = new QuantityBuilder(false) - .withAmount(limitCores) - .build() - new 
ContainerBuilder(executorContainer) - .editResources() - .addToLimits("cpu", executorCpuLimitQuantity) - .endResources() - .build() - }.getOrElse(executorContainer) - - val (maybeSecretsMountedPod, maybeSecretsMountedContainer) = - mountSecretsBootstrap.map { bootstrap => - (bootstrap.addSecretVolumes(executorPod), bootstrap.mountSecrets(containerWithLimitCores)) - }.getOrElse((executorPod, containerWithLimitCores)) - - val (bootstrappedPod, bootstrappedContainer) = - initContainerBootstrap.map { bootstrap => - val podWithInitContainer = bootstrap.bootstrapInitContainer( - PodWithDetachedInitContainer( - maybeSecretsMountedPod, - new ContainerBuilder().build(), - maybeSecretsMountedContainer)) - - val (pod, mayBeSecretsMountedInitContainer) = - initContainerMountSecretsBootstrap.map { bootstrap => - // Mount the secret volumes given that the volumes have already been added to the - // executor pod when mounting the secrets into the main executor container. - (podWithInitContainer.pod, bootstrap.mountSecrets(podWithInitContainer.initContainer)) - }.getOrElse((podWithInitContainer.pod, podWithInitContainer.initContainer)) - - val bootstrappedPod = KubernetesUtils.appendInitContainer( - pod, mayBeSecretsMountedInitContainer) - - (bootstrappedPod, podWithInitContainer.mainContainer) - }.getOrElse((maybeSecretsMountedPod, maybeSecretsMountedContainer)) - - new PodBuilder(bootstrappedPod) - .editSpec() - .addToContainers(bootstrappedContainer) ->>>>>>> master .endSpec() .build() } } -<<<<<<< HEAD private object ExecutorPodFactoryImpl { private val DEFAULT_STATIC_PORT = 10000 } -======= ->>>>>>> master diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala index 839ac2acca740..6ae76c4a7bda3 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterManager.scala @@ -20,7 +20,6 @@ import java.io.File import io.fabric8.kubernetes.client.Config -<<<<<<< HEAD import org.apache.spark.SparkContext import org.apache.spark.deploy.k8s.{ConfigurationUtils, InitContainerResourceStagingServerSecretPluginImpl, SparkKubernetesClientFactory, SparkPodInitContainerBootstrapImpl} import org.apache.spark.deploy.k8s.config._ @@ -31,22 +30,12 @@ import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.shuffle.kubernetes.KubernetesExternalShuffleClientImpl import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} import org.apache.spark.util.{ThreadUtils, Utils} -======= -import org.apache.spark.{SparkContext, SparkException} -import org.apache.spark.deploy.k8s.{InitContainerBootstrap, KubernetesUtils, MountSecretsBootstrap, SparkKubernetesClientFactory} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.internal.Logging -import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} -import org.apache.spark.util.ThreadUtils ->>>>>>> master private[spark] class KubernetesClusterManager extends ExternalClusterManager with Logging { override def canCreate(masterURL: String): Boolean = masterURL.startsWith("k8s") override def createTaskScheduler(sc: 
SparkContext, masterURL: String): TaskScheduler = { -<<<<<<< HEAD val scheduler = new KubernetesTaskSchedulerImpl(sc) sc.taskScheduler = scheduler scheduler @@ -73,37 +62,11 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager wit new InitContainerResourceStagingServerSecretPluginImpl( initContainerSecretName, initContainerSecretMountPath) -======= - if (masterURL.startsWith("k8s") && sc.deployMode == "client") { - throw new SparkException("Client mode is currently not supported for Kubernetes.") - } - - new TaskSchedulerImpl(sc) - } - - override def createSchedulerBackend( - sc: SparkContext, - masterURL: String, - scheduler: TaskScheduler): SchedulerBackend = { - val sparkConf = sc.getConf - val initContainerConfigMap = sparkConf.get(INIT_CONTAINER_CONFIG_MAP_NAME) - val initContainerConfigMapKey = sparkConf.get(INIT_CONTAINER_CONFIG_MAP_KEY_CONF) - - if (initContainerConfigMap.isEmpty) { - logWarning("The executor's init-container config map is not specified. Executors will " + - "therefore not attempt to fetch remote or submitted dependencies.") - } - - if (initContainerConfigMapKey.isEmpty) { - logWarning("The executor's init-container config map key is not specified. Executors will " + - "therefore not attempt to fetch remote or submitted dependencies.") ->>>>>>> master } // Only set up the bootstrap if they've provided both the config map key and the config map // name. The config map might not be provided if init-containers aren't being used to // bootstrap dependencies. -<<<<<<< HEAD val executorInitContainerBootstrap = for { configMap <- maybeInitContainerConfigMap configMapKey <- maybeInitContainerConfigMapKey @@ -185,71 +148,6 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager wit kubernetesClient, allocatorExecutor, requestExecutorsService) -======= - val initContainerBootstrap = for { - configMap <- initContainerConfigMap - configMapKey <- initContainerConfigMapKey - } yield { - val initContainerImage = sparkConf - .get(INIT_CONTAINER_IMAGE) - .getOrElse(throw new SparkException( - "Must specify the init-container image when there are remote dependencies")) - new InitContainerBootstrap( - initContainerImage, - sparkConf.get(CONTAINER_IMAGE_PULL_POLICY), - sparkConf.get(JARS_DOWNLOAD_LOCATION), - sparkConf.get(FILES_DOWNLOAD_LOCATION), - configMap, - configMapKey, - SPARK_POD_EXECUTOR_ROLE, - sparkConf) - } - - val executorSecretNamesToMountPaths = KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, KUBERNETES_EXECUTOR_SECRETS_PREFIX) - val mountSecretBootstrap = if (executorSecretNamesToMountPaths.nonEmpty) { - Some(new MountSecretsBootstrap(executorSecretNamesToMountPaths)) - } else { - None - } - // Mount user-specified executor secrets also into the executor's init-container. The - // init-container may need credentials in the secrets to be able to download remote - // dependencies. The executor's main container and its init-container share the secrets - // because the init-container is sort of an implementation details and this sharing - // avoids introducing a dedicated configuration property just for the init-container. 
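Both sides of the conflict build the optional init-container bootstrap the same way: a for-comprehension over the two optional settings yields the component only when both the config map name and its key are present. A stripped-down sketch of that pattern, with hypothetical names standing in for the real config values and bootstrap class:

```scala
object InitContainerBootstrapSketch {
  // Hypothetical component; the real code builds an init-container bootstrap from many settings.
  final case class Bootstrap(configMap: String, configMapKey: String)

  def buildBootstrap(
      maybeConfigMap: Option[String],
      maybeConfigMapKey: Option[String]): Option[Bootstrap] = {
    // Yields Some(bootstrap) only when BOTH options are defined, mirroring how the
    // cluster manager skips the executor init-container when either value is missing.
    for {
      configMap <- maybeConfigMap
      configMapKey <- maybeConfigMapKey
    } yield Bootstrap(configMap, configMapKey)
  }

  def main(args: Array[String]): Unit = {
    println(buildBootstrap(Some("spark-init-map"), None))                // None
    println(buildBootstrap(Some("spark-init-map"), Some("init-props"))) // Some(Bootstrap(...))
  }
}
```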
- val initContainerMountSecretsBootstrap = if (initContainerBootstrap.nonEmpty && - executorSecretNamesToMountPaths.nonEmpty) { - Some(new MountSecretsBootstrap(executorSecretNamesToMountPaths)) - } else { - None - } - - val kubernetesClient = SparkKubernetesClientFactory.createKubernetesClient( - KUBERNETES_MASTER_INTERNAL_URL, - Some(sparkConf.get(KUBERNETES_NAMESPACE)), - KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX, - sparkConf, - Some(new File(Config.KUBERNETES_SERVICE_ACCOUNT_TOKEN_PATH)), - Some(new File(Config.KUBERNETES_SERVICE_ACCOUNT_CA_CRT_PATH))) - - val executorPodFactory = new ExecutorPodFactory( - sparkConf, - mountSecretBootstrap, - initContainerBootstrap, - initContainerMountSecretsBootstrap) - - val allocatorExecutor = ThreadUtils - .newDaemonSingleThreadScheduledExecutor("kubernetes-pod-allocator") - val requestExecutorsService = ThreadUtils.newDaemonCachedThreadPool( - "kubernetes-executor-requests") - new KubernetesClusterSchedulerBackend( - scheduler.asInstanceOf[TaskSchedulerImpl], - sc.env.rpcEnv, - executorPodFactory, - kubernetesClient, - allocatorExecutor, - requestExecutorsService) ->>>>>>> master } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala index b7a131622d3d5..47c3a4bb5bf23 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala @@ -18,15 +18,8 @@ package org.apache.spark.scheduler.cluster.k8s import java.io.Closeable import java.net.InetAddress -<<<<<<< HEAD -import java.util.Collections -import java.util.concurrent.{ConcurrentHashMap, ExecutorService, ScheduledExecutorService, ThreadPoolExecutor, TimeUnit} -import java.util.concurrent.atomic.{AtomicInteger, AtomicLong, AtomicReference} -======= import java.util.concurrent.{ConcurrentHashMap, ExecutorService, ScheduledExecutorService, TimeUnit} import java.util.concurrent.atomic.{AtomicInteger, AtomicLong, AtomicReference} -import javax.annotation.concurrent.GuardedBy ->>>>>>> master import io.fabric8.kubernetes.api.model._ import io.fabric8.kubernetes.client.{KubernetesClient, KubernetesClientException, Watcher} @@ -35,7 +28,6 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import scala.concurrent.{ExecutionContext, Future} -<<<<<<< HEAD import org.apache.spark.{SparkEnv, SparkException} import org.apache.spark.deploy.k8s.config._ import org.apache.spark.deploy.k8s.constants._ @@ -43,24 +35,13 @@ import org.apache.spark.rpc.{RpcAddress, RpcCallContext, RpcEndpointAddress, Rpc import org.apache.spark.scheduler.{ExecutorExited, SlaveLost, TaskSchedulerImpl} import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.{RetrieveSparkAppConfig, SparkAppConfig} import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend -======= -import org.apache.spark.SparkException -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.rpc.{RpcAddress, RpcEndpointAddress, RpcEnv} -import org.apache.spark.scheduler.{ExecutorExited, SlaveLost, TaskSchedulerImpl} -import 
org.apache.spark.scheduler.cluster.{CoarseGrainedSchedulerBackend, SchedulerBackendUtils} ->>>>>>> master import org.apache.spark.util.Utils private[spark] class KubernetesClusterSchedulerBackend( scheduler: TaskSchedulerImpl, rpcEnv: RpcEnv, executorPodFactory: ExecutorPodFactory, -<<<<<<< HEAD shuffleManager: Option[KubernetesExternalShuffleManager], -======= ->>>>>>> master kubernetesClient: KubernetesClient, allocatorExecutor: ScheduledExecutorService, requestExecutorsService: ExecutorService) @@ -70,15 +51,10 @@ private[spark] class KubernetesClusterSchedulerBackend( private val EXECUTOR_ID_COUNTER = new AtomicLong(0L) private val RUNNING_EXECUTOR_PODS_LOCK = new Object -<<<<<<< HEAD // Indexed by executor IDs and guarded by RUNNING_EXECUTOR_PODS_LOCK. private val runningExecutorsToPods = new mutable.HashMap[String, Pod] // Indexed by executor pod names and guarded by RUNNING_EXECUTOR_PODS_LOCK. private val runningPodsToExecutors = new mutable.HashMap[String, String] -======= - @GuardedBy("RUNNING_EXECUTOR_PODS_LOCK") - private val runningExecutorsToPods = new mutable.HashMap[String, Pod] ->>>>>>> master private val executorPodsByIPs = new ConcurrentHashMap[String, Pod]() private val podsWithKnownExitReasons = new ConcurrentHashMap[String, ExecutorExited]() private val disconnectedPodsByExecutorIdPendingRemoval = new ConcurrentHashMap[String, Pod]() @@ -87,7 +63,6 @@ private[spark] class KubernetesClusterSchedulerBackend( private val kubernetesDriverPodName = conf .get(KUBERNETES_DRIVER_POD_NAME) -<<<<<<< HEAD .getOrElse( throw new SparkException("Must specify the driver pod name")) private implicit val requestExecutorContext = ExecutionContext.fromExecutorService( @@ -103,18 +78,6 @@ private[spark] class KubernetesClusterSchedulerBackend( } override val minRegisteredRatio = -======= - .getOrElse(throw new SparkException("Must specify the driver pod name")) - private implicit val requestExecutorContext = ExecutionContext.fromExecutorService( - requestExecutorsService) - - private val driverPod = kubernetesClient.pods() - .inNamespace(kubernetesNamespace) - .withName(kubernetesDriverPodName) - .get() - - protected override val minRegisteredRatio = ->>>>>>> master if (conf.getOption("spark.scheduler.minRegisteredResourcesRatio").isEmpty) { 0.8 } else { @@ -122,7 +85,6 @@ private[spark] class KubernetesClusterSchedulerBackend( } private val executorWatchResource = new AtomicReference[Closeable] -<<<<<<< HEAD protected var totalExpectedExecutors = new AtomicInteger(0) private val driverUrl = RpcEndpointAddress( @@ -141,23 +103,6 @@ private[spark] class KubernetesClusterSchedulerBackend( require(podAllocationSize > 0, s"Allocation batch size " + s"$KUBERNETES_ALLOCATION_BATCH_SIZE " + s"is $podAllocationSize, should be a positive integer") -======= - private val totalExpectedExecutors = new AtomicInteger(0) - - private val driverUrl = RpcEndpointAddress( - conf.get("spark.driver.host"), - conf.getInt("spark.driver.port", DEFAULT_DRIVER_PORT), - CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString - - private val initialExecutors = SchedulerBackendUtils.getInitialTargetExecutorNumber(conf) - - private val podAllocationInterval = conf.get(KUBERNETES_ALLOCATION_BATCH_DELAY) - - private val podAllocationSize = conf.get(KUBERNETES_ALLOCATION_BATCH_SIZE) - - private val executorLostReasonCheckMaxAttempts = conf.get( - KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS) ->>>>>>> master private val allocatorRunnable = new Runnable { @@ -167,7 +112,6 @@ private[spark] class 
KubernetesClusterSchedulerBackend( override def run(): Unit = { handleDisconnectedExecutors() -<<<<<<< HEAD RUNNING_EXECUTOR_PODS_LOCK.synchronized { if (totalRegisteredExecutors.get() < runningExecutorsToPods.size) { logDebug("Waiting for pending executors before scaling") @@ -180,60 +124,17 @@ private[spark] class KubernetesClusterSchedulerBackend( val (executorId, pod) = allocateNewExecutorPod(nodeToLocalTaskCount) runningExecutorsToPods.put(executorId, pod) runningPodsToExecutors.put(pod.getMetadata.getName, executorId) -======= - - val executorsToAllocate = mutable.Map[String, Pod]() - val currentTotalRegisteredExecutors = totalRegisteredExecutors.get - val currentTotalExpectedExecutors = totalExpectedExecutors.get - val currentNodeToLocalTaskCount = getNodesWithLocalTaskCounts() - RUNNING_EXECUTOR_PODS_LOCK.synchronized { - if (currentTotalRegisteredExecutors < runningExecutorsToPods.size) { - logDebug("Waiting for pending executors before scaling") - } else if (currentTotalExpectedExecutors <= runningExecutorsToPods.size) { - logDebug("Maximum allowed executor limit reached. Not scaling up further.") - } else { - for (_ <- 0 until math.min( - currentTotalExpectedExecutors - runningExecutorsToPods.size, podAllocationSize)) { - val executorId = EXECUTOR_ID_COUNTER.incrementAndGet().toString - val executorPod = executorPodFactory.createExecutorPod( - executorId, - applicationId(), - driverUrl, - conf.getExecutorEnv, - driverPod, - currentNodeToLocalTaskCount) - executorsToAllocate(executorId) = executorPod ->>>>>>> master logInfo( s"Requesting a new executor, total executors is now ${runningExecutorsToPods.size}") } } } -<<<<<<< HEAD -======= - - val allocatedExecutors = executorsToAllocate.mapValues { pod => - Utils.tryLog { - kubernetesClient.pods().create(pod) - } - } - - RUNNING_EXECUTOR_PODS_LOCK.synchronized { - allocatedExecutors.map { - case (executorId, attemptedAllocatedExecutor) => - attemptedAllocatedExecutor.map { successfullyAllocatedExecutor => - runningExecutorsToPods.put(executorId, successfullyAllocatedExecutor) - } - } - } ->>>>>>> master } def handleDisconnectedExecutors(): Unit = { // For each disconnected executor, synchronize with the loss reasons that may have been found // by the executor pod watcher. If the loss reason was discovered by the watcher, // inform the parent class with removeExecutor. -<<<<<<< HEAD disconnectedPodsByExecutorIdPendingRemoval.keys().asScala.foreach { case (executorId) => val executorPod = disconnectedPodsByExecutorIdPendingRemoval.get(executorId) val knownExitReason = Option(podsWithKnownExitReasons.remove( @@ -250,41 +151,12 @@ private[spark] class KubernetesClusterSchedulerBackend( deleteExecutorFromClusterAndDataStructures(executorId) } } -======= - disconnectedPodsByExecutorIdPendingRemoval.asScala.foreach { - case (executorId, executorPod) => - val knownExitReason = Option(podsWithKnownExitReasons.remove( - executorPod.getMetadata.getName)) - knownExitReason.fold { - removeExecutorOrIncrementLossReasonCheckCount(executorId) - } { executorExited => - logWarning(s"Removing executor $executorId with loss reason " + executorExited.message) - removeExecutor(executorId, executorExited) - // We don't delete the pod running the executor that has an exit condition caused by - // the application from the Kubernetes API server. This allows users to debug later on - // through commands such as "kubectl logs " and - // "kubectl describe pod ". Note that exited containers have terminated and - // therefore won't take CPU and memory resources. 
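The disconnected-executor handling around this hunk follows one bookkeeping pattern on both sides of the conflict: consult an exit reason recorded by the pod watcher, and if none is known yet, retry a bounded number of times before reporting a generic loss. A simplified sketch of that retry accounting; the names and the max-attempts constant here are illustrative, not the real configuration keys:

```scala
import scala.collection.mutable

object LossReasonSketch {
  // Illustrative bound; the real backend takes this from a constant or configuration entry.
  private val MaxReasonChecks = 10

  private val knownExitReasons = mutable.Map.empty[String, String]  // executorId -> reason
  private val reasonCheckAttempts = mutable.Map.empty[String, Int]  // executorId -> attempts so far

  /** Returns the reason to report, or None if we should wait and check again later. */
  def resolveLoss(executorId: String): Option[String] = {
    knownExitReasons.remove(executorId) match {
      case Some(reason) =>
        reasonCheckAttempts -= executorId
        Some(reason)                                  // the watcher already told us why the pod died
      case None =>
        val attempts = reasonCheckAttempts.getOrElse(executorId, 0) + 1
        if (attempts >= MaxReasonChecks) {
          reasonCheckAttempts -= executorId
          Some("Executor lost for unknown reasons.")  // give up after a bounded number of checks
        } else {
          reasonCheckAttempts(executorId) = attempts  // try again on the next allocator tick
          None
        }
    }
  }

  def main(args: Array[String]): Unit = {
    knownExitReasons("exec-1") = "container terminated with exit code 137"
    println(resolveLoss("exec-1")) // Some(...): a reason was already recorded by the watcher
    println(resolveLoss("exec-2")) // None: no reason yet, the attempt counter is incremented instead
  }
}
```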
- // Otherwise, the executor pod is marked to be deleted from the API server. - if (executorExited.exitCausedByApp) { - logInfo(s"Executor $executorId exited because of the application.") - deleteExecutorFromDataStructures(executorId) - } else { - logInfo(s"Executor $executorId failed because of a framework error.") - deleteExecutorFromClusterAndDataStructures(executorId) - } - } ->>>>>>> master } } def removeExecutorOrIncrementLossReasonCheckCount(executorId: String): Unit = { val reasonCheckCount = executorReasonCheckAttemptCounts.getOrElse(executorId, 0) -<<<<<<< HEAD if (reasonCheckCount >= MAX_EXECUTOR_LOST_REASON_CHECKS) { -======= - if (reasonCheckCount >= executorLostReasonCheckMaxAttempts) { ->>>>>>> master removeExecutor(executorId, SlaveLost("Executor lost for unknown reasons.")) deleteExecutorFromClusterAndDataStructures(executorId) } else { @@ -293,7 +165,6 @@ private[spark] class KubernetesClusterSchedulerBackend( } def deleteExecutorFromClusterAndDataStructures(executorId: String): Unit = { -<<<<<<< HEAD disconnectedPodsByExecutorIdPendingRemoval.remove(executorId) executorReasonCheckAttemptCounts -= executorId RUNNING_EXECUTOR_PODS_LOCK.synchronized { @@ -301,27 +172,10 @@ private[spark] class KubernetesClusterSchedulerBackend( kubernetesClient.pods().delete(pod) runningPodsToExecutors.remove(pod.getMetadata.getName) }.getOrElse(logWarning(s"Unable to remove pod for unknown executor $executorId")) -======= - deleteExecutorFromDataStructures(executorId).foreach { pod => - kubernetesClient.pods().delete(pod) - } - } - - def deleteExecutorFromDataStructures(executorId: String): Option[Pod] = { - disconnectedPodsByExecutorIdPendingRemoval.remove(executorId) - executorReasonCheckAttemptCounts -= executorId - podsWithKnownExitReasons.remove(executorId) - RUNNING_EXECUTOR_PODS_LOCK.synchronized { - runningExecutorsToPods.remove(executorId).orElse { - logWarning(s"Unable to remove pod for unknown executor $executorId") - None - } ->>>>>>> master } } } -<<<<<<< HEAD private def getInitialTargetExecutorNumber(defaultNumExecutors: Int = 1): Int = { if (Utils.isDynamicAllocationEnabled(conf)) { val minNumExecutors = conf.getInt("spark.dynamicAllocation.minExecutors", 0) @@ -340,8 +194,6 @@ private[spark] class KubernetesClusterSchedulerBackend( override def applicationId(): String = conf.get("spark.app.id", super.applicationId()) -======= ->>>>>>> master override def sufficientResourcesRegistered(): Boolean = { totalRegisteredExecutors.get() >= initialExecutors * minRegisteredRatio } @@ -349,7 +201,6 @@ private[spark] class KubernetesClusterSchedulerBackend( override def start(): Unit = { super.start() executorWatchResource.set( -<<<<<<< HEAD kubernetesClient .pods() .withLabel(SPARK_APP_ID_LABEL, applicationId()) @@ -358,15 +209,6 @@ private[spark] class KubernetesClusterSchedulerBackend( allocatorExecutor.scheduleWithFixedDelay( allocatorRunnable, 0L, podAllocationInterval, TimeUnit.SECONDS) shuffleManager.foreach(_.start(applicationId())) -======= - kubernetesClient - .pods() - .withLabel(SPARK_APP_ID_LABEL, applicationId()) - .watch(new ExecutorPodsWatcher())) - - allocatorExecutor.scheduleWithFixedDelay( - allocatorRunnable, 0L, podAllocationInterval, TimeUnit.MILLISECONDS) ->>>>>>> master if (!Utils.isDynamicAllocationEnabled(conf)) { doRequestTotalExecutors(initialExecutors) @@ -376,16 +218,11 @@ private[spark] class KubernetesClusterSchedulerBackend( override def stop(): Unit = { // stop allocation of new resources and caches. 
allocatorExecutor.shutdown() -<<<<<<< HEAD shuffleManager.foreach(_.stop()) -======= - allocatorExecutor.awaitTermination(30, TimeUnit.SECONDS) ->>>>>>> master // send stop message to executors so they shut down cleanly super.stop() -<<<<<<< HEAD // then delete the executor pods // TODO investigate why Utils.tryLogNonFatalError() doesn't work in this context. // When using Utils.tryLogNonFatalError some of the code fails but without any logs or @@ -397,15 +234,11 @@ private[spark] class KubernetesClusterSchedulerBackend( runningPodsToExecutors.clear() } executorPodsByIPs.clear() -======= - try { ->>>>>>> master val resource = executorWatchResource.getAndSet(null) if (resource != null) { resource.close() } } catch { -<<<<<<< HEAD case e: Throwable => logError("Uncaught exception while shutting down controllers.", e) } try { @@ -413,19 +246,6 @@ private[spark] class KubernetesClusterSchedulerBackend( kubernetesClient.close() } catch { case e: Throwable => logError("Uncaught exception closing Kubernetes client.", e) -======= - case e: Throwable => logWarning("Failed to close the executor pod watcher", e) - } - - // then delete the executor pods - Utils.tryLogNonFatalError { - deleteExecutorPodsOnStop() - executorPodsByIPs.clear() - } - Utils.tryLogNonFatalError { - logInfo("Closing kubernetes client") - kubernetesClient.close() ->>>>>>> master } } @@ -434,17 +254,10 @@ private[spark] class KubernetesClusterSchedulerBackend( * locality if an executor launches on the cluster node. */ private def getNodesWithLocalTaskCounts() : Map[String, Int] = { -<<<<<<< HEAD val nodeToLocalTaskCount = mutable.Map[String, Int]() ++ KubernetesClusterSchedulerBackend.this.synchronized { hostToLocalTaskCount } -======= - val nodeToLocalTaskCount = synchronized { - mutable.Map[String, Int]() ++ hostToLocalTaskCount - } - ->>>>>>> master for (pod <- executorPodsByIPs.values().asScala) { // Remove cluster nodes that are running our executors already. // TODO: This prefers spreading out executors across nodes. In case users want @@ -458,7 +271,6 @@ private[spark] class KubernetesClusterSchedulerBackend( nodeToLocalTaskCount.toMap[String, Int] } -<<<<<<< HEAD /** * Allocates a new executor pod * @@ -485,15 +297,12 @@ private[spark] class KubernetesClusterSchedulerBackend( } } -======= ->>>>>>> master override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = Future[Boolean] { totalExpectedExecutors.set(requestedTotal) true } override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = Future[Boolean] { -<<<<<<< HEAD RUNNING_EXECUTOR_PODS_LOCK.synchronized { for (executor <- executorIds) { val maybeRemovedExecutor = runningExecutorsToPods.remove(executor) @@ -516,33 +325,6 @@ private[spark] class KubernetesClusterSchedulerBackend( // executorPodsByIPs. 
val pod = executorPodsByIPs.get(podIP) Option(pod) -======= - val podsToDelete = RUNNING_EXECUTOR_PODS_LOCK.synchronized { - executorIds.flatMap { executorId => - runningExecutorsToPods.remove(executorId) match { - case Some(pod) => - disconnectedPodsByExecutorIdPendingRemoval.put(executorId, pod) - Some(pod) - - case None => - logWarning(s"Unable to remove pod for unknown executor $executorId") - None - } - } - } - - kubernetesClient.pods().delete(podsToDelete: _*) - true - } - - private def deleteExecutorPodsOnStop(): Unit = { - val executorPodsToDelete = RUNNING_EXECUTOR_PODS_LOCK.synchronized { - val runningExecutorPodsCopy = Seq(runningExecutorsToPods.values.toSeq: _*) - runningExecutorsToPods.clear() - runningExecutorPodsCopy - } - kubernetesClient.pods().delete(executorPodsToDelete: _*) ->>>>>>> master } private class ExecutorPodsWatcher extends Watcher[Pod] { @@ -550,7 +332,6 @@ private[spark] class KubernetesClusterSchedulerBackend( private val DEFAULT_CONTAINER_FAILURE_EXIT_STATUS = -1 override def eventReceived(action: Action, pod: Pod): Unit = { -<<<<<<< HEAD if (action == Action.MODIFIED && pod.getStatus.getPhase == "Running" && pod.getMetadata.getDeletionTimestamp == null) { val podIP = pod.getStatus.getPodIP @@ -572,47 +353,6 @@ private[spark] class KubernetesClusterSchedulerBackend( logInfo(s"Received delete pod $podName event. Reason: " + pod.getStatus.getReason) handleDeletedPod(pod) } -======= - val podName = pod.getMetadata.getName - val podIP = pod.getStatus.getPodIP - - action match { - case Action.MODIFIED if (pod.getStatus.getPhase == "Running" - && pod.getMetadata.getDeletionTimestamp == null) => - val clusterNodeName = pod.getSpec.getNodeName - logInfo(s"Executor pod $podName ready, launched at $clusterNodeName as IP $podIP.") - executorPodsByIPs.put(podIP, pod) - - case Action.DELETED | Action.ERROR => - val executorId = getExecutorId(pod) - logDebug(s"Executor pod $podName at IP $podIP was at $action.") - if (podIP != null) { - executorPodsByIPs.remove(podIP) - } - - val executorExitReason = if (action == Action.ERROR) { - logWarning(s"Received error event of executor pod $podName. Reason: " + - pod.getStatus.getReason) - executorExitReasonOnError(pod) - } else if (action == Action.DELETED) { - logWarning(s"Received delete event of executor pod $podName. Reason: " + - pod.getStatus.getReason) - executorExitReasonOnDelete(pod) - } else { - throw new IllegalStateException( - s"Unknown action that should only be DELETED or ERROR: $action") - } - podsWithKnownExitReasons.put(pod.getMetadata.getName, executorExitReason) - - if (!disconnectedPodsByExecutorIdPendingRemoval.containsKey(executorId)) { - log.warn(s"Executor with id $executorId was not marked as disconnected, but the " + - s"watch received an event of type $action for this executor. The executor may " + - "have failed to start in the first place and never registered with the driver.") - } - disconnectedPodsByExecutorIdPendingRemoval.put(executorId, pod) - - case _ => logDebug(s"Received event of executor pod $podName: " + action) ->>>>>>> master } } @@ -620,11 +360,7 @@ private[spark] class KubernetesClusterSchedulerBackend( logDebug("Executor pod watch closed.", cause) } -<<<<<<< HEAD def getExecutorExitStatus(pod: Pod): Int = { -======= - private def getExecutorExitStatus(pod: Pod): Int = { ->>>>>>> master val containerStatuses = pod.getStatus.getContainerStatuses if (!containerStatuses.isEmpty) { // we assume the first container represents the pod status. 
This assumption may not hold @@ -634,7 +370,6 @@ private[spark] class KubernetesClusterSchedulerBackend( } else DEFAULT_CONTAINER_FAILURE_EXIT_STATUS } -<<<<<<< HEAD def getExecutorExitStatus(containerStatus: ContainerStatus): Int = { Option(containerStatus.getState).map(containerState => Option(containerState.getTerminated).map(containerStateTerminated => @@ -672,57 +407,14 @@ private[spark] class KubernetesClusterSchedulerBackend( } def handleDeletedPod(pod: Pod): Unit = { -======= - private def getExecutorExitStatus(containerStatus: ContainerStatus): Int = { - Option(containerStatus.getState).map { containerState => - Option(containerState.getTerminated).map { containerStateTerminated => - containerStateTerminated.getExitCode.intValue() - }.getOrElse(UNKNOWN_EXIT_CODE) - }.getOrElse(UNKNOWN_EXIT_CODE) - } - - private def isPodAlreadyReleased(pod: Pod): Boolean = { - val executorId = pod.getMetadata.getLabels.get(SPARK_EXECUTOR_ID_LABEL) - RUNNING_EXECUTOR_PODS_LOCK.synchronized { - !runningExecutorsToPods.contains(executorId) - } - } - - private def executorExitReasonOnError(pod: Pod): ExecutorExited = { - val containerExitStatus = getExecutorExitStatus(pod) - // container was probably actively killed by the driver. - if (isPodAlreadyReleased(pod)) { - ExecutorExited(containerExitStatus, exitCausedByApp = false, - s"Container in pod ${pod.getMetadata.getName} exited from explicit termination " + - "request.") - } else { - val containerExitReason = s"Pod ${pod.getMetadata.getName}'s executor container " + - s"exited with exit status code $containerExitStatus." - ExecutorExited(containerExitStatus, exitCausedByApp = true, containerExitReason) - } - } - - private def executorExitReasonOnDelete(pod: Pod): ExecutorExited = { ->>>>>>> master val exitMessage = if (isPodAlreadyReleased(pod)) { s"Container in pod ${pod.getMetadata.getName} exited from explicit termination request." } else { s"Pod ${pod.getMetadata.getName} deleted or lost." 
} -<<<<<<< HEAD val exitReason = ExecutorExited( getExecutorExitStatus(pod), exitCausedByApp = false, exitMessage) podsWithKnownExitReasons.put(pod.getMetadata.getName, exitReason) -======= - ExecutorExited(getExecutorExitStatus(pod), exitCausedByApp = false, exitMessage) - } - - private def getExecutorId(pod: Pod): String = { - val executorId = pod.getMetadata.getLabels.get(SPARK_EXECUTOR_ID_LABEL) - require(executorId != null, "Unexpected pod metadata; expected all executor pods " + - s"to have label $SPARK_EXECUTOR_ID_LABEL.") - executorId ->>>>>>> master } } @@ -731,13 +423,8 @@ private[spark] class KubernetesClusterSchedulerBackend( } private class KubernetesDriverEndpoint( -<<<<<<< HEAD rpcEnv: RpcEnv, sparkProperties: Seq[(String, String)]) -======= - rpcEnv: RpcEnv, - sparkProperties: Seq[(String, String)]) ->>>>>>> master extends DriverEndpoint(rpcEnv, sparkProperties) { override def onDisconnected(rpcAddress: RpcAddress): Unit = { @@ -751,7 +438,6 @@ private[spark] class KubernetesClusterSchedulerBackend( } } } -<<<<<<< HEAD override def receiveAndReply( context: RpcCallContext): PartialFunction[Any, Unit] = { @@ -784,13 +470,10 @@ private[spark] class KubernetesClusterSchedulerBackend( } }.orElse(super.receiveAndReply(context)) } -======= ->>>>>>> master } } private object KubernetesClusterSchedulerBackend { -<<<<<<< HEAD private val VMEM_EXCEEDED_EXIT_CODE = -103 private val PMEM_EXCEEDED_EXIT_CODE = -104 private val UNKNOWN_EXIT_CODE = -111 @@ -803,8 +486,3 @@ private object KubernetesClusterSchedulerBackend { " Consider boosting spark executor memory overhead." } } - -======= - private val UNKNOWN_EXIT_CODE = -1 -} ->>>>>>> master diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/ClientSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/ClientSuite.scala index 4806d91869a48..041f51e912002 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/ClientSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/ClientSuite.scala @@ -16,38 +16,21 @@ */ package org.apache.spark.deploy.k8s.submit -<<<<<<< HEAD import com.google.common.collect.Iterables import io.fabric8.kubernetes.api.model.{ContainerBuilder, DoneablePod, HasMetadata, Pod, PodBuilder, PodList, Secret, SecretBuilder} import io.fabric8.kubernetes.client.{KubernetesClient, Watch} import io.fabric8.kubernetes.client.dsl.{MixedOperation, NamespaceListVisitFromServerGetDeleteRecreateWaitApplicable, NamespaceVisitFromServerGetWatchDeleteRecreateWaitApplicable, PodResource, Resource} -======= -import scala.collection.JavaConverters._ - -import com.google.common.collect.Iterables -import io.fabric8.kubernetes.api.model._ -import io.fabric8.kubernetes.client.{KubernetesClient, Watch} -import io.fabric8.kubernetes.client.dsl.{MixedOperation, NamespaceListVisitFromServerGetDeleteRecreateWaitApplicable, PodResource} ->>>>>>> master import org.mockito.{ArgumentCaptor, Mock, MockitoAnnotations} import org.mockito.Mockito.{doReturn, verify, when} import org.mockito.invocation.InvocationOnMock import org.mockito.stubbing.Answer import org.scalatest.BeforeAndAfter -<<<<<<< HEAD import org.scalatest.mock.MockitoSugar._ import scala.collection.JavaConverters._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.k8s.constants._ import org.apache.spark.deploy.k8s.submit.submitsteps.{DriverConfigurationStep, KubernetesDriverSpec} 
-======= -import org.scalatest.mockito.MockitoSugar._ - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.steps.DriverConfigurationStep ->>>>>>> master class ClientSuite extends SparkFunSuite with BeforeAndAfter { @@ -99,37 +82,22 @@ class ClientSuite extends SparkFunSuite with BeforeAndAfter { when(podOperations.withName(FirstTestConfigurationStep.podName)).thenReturn(namedPods) when(namedPods.watch(loggingPodStatusWatcher)).thenReturn(mock[Watch]) doReturn(resourceList) -<<<<<<< HEAD .when(kubernetesClient) .resourceList(createdResourcesArgumentCaptor.capture()) -======= - .when(kubernetesClient) - .resourceList(createdResourcesArgumentCaptor.capture()) ->>>>>>> master } test("The client should configure the pod with the submission steps.") { val submissionClient = new Client( -<<<<<<< HEAD submissionSteps, new SparkConf(false), kubernetesClient, false, "spark", loggingPodStatusWatcher) -======= - submissionSteps, - new SparkConf(false), - kubernetesClient, - false, - "spark", - loggingPodStatusWatcher) ->>>>>>> master submissionClient.run() val createdPod = createdPodArgumentCaptor.getValue assert(createdPod.getMetadata.getName === FirstTestConfigurationStep.podName) assert(createdPod.getMetadata.getLabels.asScala === -<<<<<<< HEAD Map(FirstTestConfigurationStep.labelKey -> FirstTestConfigurationStep.labelValue)) assert(createdPod.getMetadata.getAnnotations.asScala === Map(SecondTestConfigurationStep.annotationKey -> @@ -137,34 +105,16 @@ class ClientSuite extends SparkFunSuite with BeforeAndAfter { assert(createdPod.getSpec.getContainers.size() === 1) assert(createdPod.getSpec.getContainers.get(0).getName === SecondTestConfigurationStep.containerName) -======= - Map(FirstTestConfigurationStep.labelKey -> FirstTestConfigurationStep.labelValue)) - assert(createdPod.getMetadata.getAnnotations.asScala === - Map(SecondTestConfigurationStep.annotationKey -> - SecondTestConfigurationStep.annotationValue)) - assert(createdPod.getSpec.getContainers.size() === 1) - assert(createdPod.getSpec.getContainers.get(0).getName === - SecondTestConfigurationStep.containerName) ->>>>>>> master } test("The client should create the secondary Kubernetes resources.") { val submissionClient = new Client( -<<<<<<< HEAD submissionSteps, new SparkConf(false), kubernetesClient, false, "spark", loggingPodStatusWatcher) -======= - submissionSteps, - new SparkConf(false), - kubernetesClient, - false, - "spark", - loggingPodStatusWatcher) ->>>>>>> master submissionClient.run() val createdPod = createdPodArgumentCaptor.getValue val otherCreatedResources = createdResourcesArgumentCaptor.getAllValues @@ -172,11 +122,7 @@ class ClientSuite extends SparkFunSuite with BeforeAndAfter { val createdResource = Iterables.getOnlyElement(otherCreatedResources).asInstanceOf[Secret] assert(createdResource.getMetadata.getName === FirstTestConfigurationStep.secretName) assert(createdResource.getData.asScala === -<<<<<<< HEAD Map(FirstTestConfigurationStep.secretKey -> FirstTestConfigurationStep.secretData)) -======= - Map(FirstTestConfigurationStep.secretKey -> FirstTestConfigurationStep.secretData)) ->>>>>>> master val ownerReference = Iterables.getOnlyElement(createdResource.getMetadata.getOwnerReferences) assert(ownerReference.getName === createdPod.getMetadata.getName) assert(ownerReference.getKind === DRIVER_POD_KIND) @@ -186,7 +132,6 @@ class ClientSuite extends SparkFunSuite with BeforeAndAfter { test("The client should attach the driver 
container with the appropriate JVM options.") { val sparkConf = new SparkConf(false) -<<<<<<< HEAD .set("spark.logConf", "true") .set( org.apache.spark.internal.config.DRIVER_JAVA_OPTIONS, @@ -198,19 +143,6 @@ class ClientSuite extends SparkFunSuite with BeforeAndAfter { false, "spark", loggingPodStatusWatcher) -======= - .set("spark.logConf", "true") - .set( - org.apache.spark.internal.config.DRIVER_JAVA_OPTIONS, - "-XX:+HeapDumpOnOutOfMemoryError -XX:+PrintGCDetails") - val submissionClient = new Client( - submissionSteps, - sparkConf, - kubernetesClient, - false, - "spark", - loggingPodStatusWatcher) ->>>>>>> master submissionClient.run() val createdPod = createdPodArgumentCaptor.getValue val driverContainer = Iterables.getOnlyElement(createdPod.getSpec.getContainers) @@ -221,19 +153,11 @@ class ClientSuite extends SparkFunSuite with BeforeAndAfter { assert(driverJvmOptsEnvs.size === 4) val expectedJvmOptsValues = Seq( -<<<<<<< HEAD "-Dspark.logConf=true", s"-D${SecondTestConfigurationStep.sparkConfKey}=" + s"${SecondTestConfigurationStep.sparkConfValue}", s"-XX:+HeapDumpOnOutOfMemoryError", s"-XX:+PrintGCDetails") -======= - "-Dspark.logConf=true", - s"-D${SecondTestConfigurationStep.sparkConfKey}=" + - s"${SecondTestConfigurationStep.sparkConfValue}", - "-XX:+HeapDumpOnOutOfMemoryError", - "-XX:+PrintGCDetails") ->>>>>>> master driverJvmOptsEnvs.zip(expectedJvmOptsValues).zipWithIndex.foreach { case ((resolvedEnv, expectedJvmOpt), index) => assert(resolvedEnv.getName === s"$ENV_JAVA_OPT_PREFIX$index") diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala index d7c816d225ab4..bb09cb801b5a9 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodFactorySuite.scala @@ -16,7 +16,6 @@ */ package org.apache.spark.scheduler.cluster.k8s -<<<<<<< HEAD import io.fabric8.kubernetes.api.model.{Pod, PodBuilder, VolumeBuilder, VolumeMountBuilder} import io.fabric8.kubernetes.client.KubernetesClient import org.mockito.{AdditionalAnswers, Mock, Mockito, MockitoAnnotations} @@ -35,30 +34,10 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef private val driverPodName: String = "driver-pod" private val driverPodUid: String = "driver-uid" private val driverUrl: String = "driver-url" -======= -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model._ -import org.mockito.{AdditionalAnswers, MockitoAnnotations} -import org.mockito.Matchers.any -import org.mockito.Mockito._ -import org.scalatest.{BeforeAndAfter, BeforeAndAfterEach} - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.{InitContainerBootstrap, MountSecretsBootstrap, PodWithDetachedInitContainer, SecretVolumeUtils} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ - -class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with BeforeAndAfterEach { - - private val driverPodName: String = "driver-pod" - private val driverPodUid: String = "driver-uid" ->>>>>>> master private val executorPrefix: String = "base" private val executorImage: String = "executor-image" private val driverPod = new PodBuilder() .withNewMetadata() 
-<<<<<<< HEAD .withName(driverPodName) .withUid(driverPodUid) .endMetadata() @@ -77,26 +56,11 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef @Mock private var executorLocalDirVolumeProvider: ExecutorLocalDirVolumeProvider = _ -======= - .withName(driverPodName) - .withUid(driverPodUid) - .endMetadata() - .withNewSpec() - .withNodeName("some-node") - .endSpec() - .withNewStatus() - .withHostIP("192.168.99.100") - .endStatus() - .build() - private var baseConf: SparkConf = _ - ->>>>>>> master before { MockitoAnnotations.initMocks(this) baseConf = new SparkConf() .set(KUBERNETES_DRIVER_POD_NAME, driverPodName) .set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, executorPrefix) -<<<<<<< HEAD .set(EXECUTOR_DOCKER_IMAGE, executorImage) when(nodeAffinityExecutorPodModifier.addNodeAffinityAnnotationIfUseful( any(classOf[Pod]), @@ -123,20 +87,6 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef // The executor pod name and default labels. assert(executor.getMetadata.getName === s"$executorPrefix-exec-1") assert(executor.getMetadata.getLabels.size() === 3) -======= - .set(CONTAINER_IMAGE, executorImage) - } - - test("basic executor pod has reasonable defaults") { - val factory = new ExecutorPodFactory(baseConf, None, None, None) - val executor = factory.createExecutorPod( - "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) - - // The executor pod name and default labels. - assert(executor.getMetadata.getName === s"$executorPrefix-exec-1") - assert(executor.getMetadata.getLabels.size() === 3) - assert(executor.getMetadata.getLabels.get(SPARK_EXECUTOR_ID_LABEL) === "1") ->>>>>>> master // There is exactly 1 container with no volume mounts and default memory limits. // Default memory limit is 1024M + 384M (minimum overhead constant). 
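The "1024M + 384M" in the comment above falls out of the overhead rule used by the pod factory: the overhead is the larger of a fixed fraction of the executor memory and a floor value, and the pod's memory limit is the executor memory plus that overhead. A quick worked sketch, assuming the conventional 0.10 factor and 384 MiB floor (the actual constants come from the config objects referenced in this patch):

```scala
object MemoryOverheadSketch {
  // Assumed values; the real code reads the overhead factor and minimum from
  // Spark's configuration constants rather than hard-coding them.
  private val OverheadFactor = 0.10
  private val MinOverheadMiB = 384L

  def podMemoryLimitMiB(executorMemoryMiB: Long): Long = {
    val overhead = math.max((OverheadFactor * executorMemoryMiB).toLong, MinOverheadMiB)
    executorMemoryMiB + overhead
  }

  def main(args: Array[String]): Unit = {
    // Default 1g executor: 0.10 * 1024 = 102 MiB, below the 384 MiB floor,
    // so the limit is 1024 + 384 = 1408 MiB -- the "1024M + 384M" in the test comment.
    println(podMemoryLimitMiB(1024))  // 1408
    // Larger executors are dominated by the fractional term: 10 GiB -> 10240 + 1024 = 11264 MiB.
    println(podMemoryLimitMiB(10240)) // 11264
  }
}
```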
@@ -160,7 +110,6 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef conf.set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, "loremipsumdolorsitametvimatelitrefficiendisuscipianturvixlegeresple") -<<<<<<< HEAD val factory = new ExecutorPodFactoryImpl( conf, nodeAffinityExecutorPodModifier, @@ -196,43 +145,6 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef verify(nodeAffinityExecutorPodModifier, times(1)) .addNodeAffinityAnnotationIfUseful(any(classOf[Pod]), any(classOf[Map[String, Int]])) -======= - val factory = new ExecutorPodFactory(conf, None, None, None) - val executor = factory.createExecutorPod( - "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) - - assert(executor.getSpec.getHostname.length === 63) - } - - test("classpath and extra java options get translated into environment variables") { - val conf = baseConf.clone() - conf.set(org.apache.spark.internal.config.EXECUTOR_JAVA_OPTIONS, "foo=bar") - conf.set(org.apache.spark.internal.config.EXECUTOR_CLASS_PATH, "bar=baz") - - val factory = new ExecutorPodFactory(conf, None, None, None) - val executor = factory.createExecutorPod( - "1", "dummy", "dummy", Seq[(String, String)]("qux" -> "quux"), driverPod, Map[String, Int]()) - - checkEnv(executor, - Map("SPARK_JAVA_OPT_0" -> "foo=bar", - ENV_CLASSPATH -> "bar=baz", - "qux" -> "quux")) - checkOwnerReferences(executor, driverPodUid) - } - - test("executor secrets get mounted") { - val conf = baseConf.clone() - - val secretsBootstrap = new MountSecretsBootstrap(Map("secret1" -> "/var/secret1")) - val factory = new ExecutorPodFactory( - conf, - Some(secretsBootstrap), - None, - None) - val executor = factory.createExecutorPod( - "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) - ->>>>>>> master assert(executor.getSpec.getContainers.size() === 1) assert(executor.getSpec.getContainers.get(0).getVolumeMounts.size() === 1) assert(executor.getSpec.getContainers.get(0).getVolumeMounts.get(0).getName @@ -249,7 +161,6 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef test("init-container bootstrap step adds an init container") { val conf = baseConf.clone() -<<<<<<< HEAD val initContainerBootstrap = mock(classOf[SparkPodInitContainerBootstrap]) when(initContainerBootstrap.bootstrapInitContainerAndVolumes( any(classOf[PodWithDetachedInitContainer]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) @@ -358,48 +269,6 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef "SPARK_EXECUTOR_EXTRA_CLASSPATH" -> "bar=baz", "qux" -> "quux")) checkOwnerReferences(executor, driverPodUid) -======= - val initContainerBootstrap = mock(classOf[InitContainerBootstrap]) - when(initContainerBootstrap.bootstrapInitContainer( - any(classOf[PodWithDetachedInitContainer]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) - - val factory = new ExecutorPodFactory( - conf, - None, - Some(initContainerBootstrap), - None) - val executor = factory.createExecutorPod( - "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) - - assert(executor.getSpec.getInitContainers.size() === 1) - checkOwnerReferences(executor, driverPodUid) - } - - test("init-container with secrets mount bootstrap") { - val conf = baseConf.clone() - val initContainerBootstrap = mock(classOf[InitContainerBootstrap]) - when(initContainerBootstrap.bootstrapInitContainer( - 
any(classOf[PodWithDetachedInitContainer]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) - val secretsBootstrap = new MountSecretsBootstrap(Map("secret1" -> "/var/secret1")) - - val factory = new ExecutorPodFactory( - conf, - Some(secretsBootstrap), - Some(initContainerBootstrap), - Some(secretsBootstrap)) - val executor = factory.createExecutorPod( - "1", "dummy", "dummy", Seq[(String, String)](), driverPod, Map[String, Int]()) - - assert(executor.getSpec.getVolumes.size() === 1) - assert(SecretVolumeUtils.podHasVolume(executor, "secret1-volume")) - assert(SecretVolumeUtils.containerHasVolume( - executor.getSpec.getContainers.get(0), "secret1-volume", "/var/secret1")) - assert(executor.getSpec.getInitContainers.size() === 1) - assert(SecretVolumeUtils.containerHasVolume( - executor.getSpec.getInitContainers.get(0), "secret1-volume", "/var/secret1")) - - checkOwnerReferences(executor, driverPodUid) ->>>>>>> master } // There is always exactly one controller reference, and it points to the driver pod. @@ -417,20 +286,12 @@ class ExecutorPodFactorySuite extends SparkFunSuite with BeforeAndAfter with Bef ENV_EXECUTOR_CORES -> "1", ENV_EXECUTOR_MEMORY -> "1g", ENV_APPLICATION_ID -> "dummy", -<<<<<<< HEAD ENV_MOUNTED_CLASSPATH -> "/var/spark-data/spark-jars/*", ENV_EXECUTOR_POD_IP -> null, ENV_EXECUTOR_PORT -> "10000") ++ additionalEnvVars assert(executor.getSpec.getContainers.size() === 1) assert(executor.getSpec.getContainers.get(0).getEnv().size() === defaultEnvs.size) -======= - ENV_EXECUTOR_POD_IP -> null, - ENV_MOUNTED_CLASSPATH -> "/var/spark-data/spark-jars/*") ++ additionalEnvVars - - assert(executor.getSpec.getContainers.size() === 1) - assert(executor.getSpec.getContainers.get(0).getEnv.size() === defaultEnvs.size) ->>>>>>> master val mapEnvs = executor.getSpec.getContainers.get(0).getEnv.asScala.map { x => (x.getName, x.getValue) }.toMap diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala index b1617dc3cc3d7..a9a2937869edd 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala @@ -26,16 +26,11 @@ import org.mockito.{AdditionalAnswers, ArgumentCaptor, Mock, MockitoAnnotations} import org.mockito.Matchers.{any, eq => mockitoEq} import org.mockito.Mockito.{doNothing, never, times, verify, when} import org.scalatest.BeforeAndAfter -<<<<<<< HEAD import org.scalatest.mock.MockitoSugar._ -======= -import org.scalatest.mockito.MockitoSugar._ ->>>>>>> master import scala.collection.JavaConverters._ import scala.concurrent.Future import org.apache.spark.{SparkConf, SparkContext, SparkFunSuite} -<<<<<<< HEAD import org.apache.spark.deploy.k8s.config._ import org.apache.spark.deploy.k8s.constants._ import org.apache.spark.rpc.{RpcAddress, RpcCallContext, RpcEndpoint, RpcEndpointAddress, RpcEndpointRef, RpcEnv, RpcTimeout} @@ -45,32 +40,15 @@ import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend private[spark] class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAndAfter { -======= -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import 
org.apache.spark.rpc._ -import org.apache.spark.scheduler.{ExecutorExited, LiveListenerBus, SlaveLost, TaskSchedulerImpl} -import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.{RegisterExecutor, RemoveExecutor} -import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend -import org.apache.spark.util.ThreadUtils - -class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAndAfter { ->>>>>>> master private val APP_ID = "test-spark-app" private val DRIVER_POD_NAME = "spark-driver-pod" private val NAMESPACE = "test-namespace" private val SPARK_DRIVER_HOST = "localhost" private val SPARK_DRIVER_PORT = 7077 -<<<<<<< HEAD private val POD_ALLOCATION_INTERVAL = 60L private val DRIVER_URL = RpcEndpointAddress( SPARK_DRIVER_HOST, SPARK_DRIVER_PORT, CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString -======= - private val POD_ALLOCATION_INTERVAL = "1m" - private val DRIVER_URL = RpcEndpointAddress( - SPARK_DRIVER_HOST, SPARK_DRIVER_PORT, CoarseGrainedSchedulerBackend.ENDPOINT_NAME).toString ->>>>>>> master private val FIRST_EXECUTOR_POD = new PodBuilder() .withNewMetadata() .withName("pod1") @@ -95,17 +73,10 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn .build() private type PODS = MixedOperation[Pod, PodList, DoneablePod, PodResource[Pod, DoneablePod]] -<<<<<<< HEAD private type LABELLED_PODS = FilterWatchListDeletable[ Pod, PodList, java.lang.Boolean, Watch, Watcher[Pod]] private type IN_NAMESPACE_PODS = NonNamespaceOperation[ Pod, PodList, DoneablePod, PodResource[Pod, DoneablePod]] -======= - private type LABELED_PODS = FilterWatchListDeletable[ - Pod, PodList, java.lang.Boolean, Watch, Watcher[Pod]] - private type IN_NAMESPACE_PODS = NonNamespaceOperation[ - Pod, PodList, DoneablePod, PodResource[Pod, DoneablePod]] ->>>>>>> master @Mock private var sparkContext: SparkContext = _ @@ -126,23 +97,16 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn private var executorPodFactory: ExecutorPodFactory = _ @Mock -<<<<<<< HEAD private var shuffleManager: KubernetesExternalShuffleManager = _ @Mock -======= ->>>>>>> master private var kubernetesClient: KubernetesClient = _ @Mock private var podOperations: PODS = _ @Mock -<<<<<<< HEAD private var podsWithLabelOperations: LABELLED_PODS = _ -======= - private var podsWithLabelOperations: LABELED_PODS = _ ->>>>>>> master @Mock private var podsInNamespace: IN_NAMESPACE_PODS = _ @@ -159,12 +123,6 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn @Mock private var executorPodsWatch: Watch = _ -<<<<<<< HEAD -======= - @Mock - private var successFuture: Future[Boolean] = _ - ->>>>>>> master private var sparkConf: SparkConf = _ private var executorPodsWatcherArgument: ArgumentCaptor[Watcher[Pod]] = _ private var allocatorRunnable: ArgumentCaptor[Runnable] = _ @@ -182,20 +140,12 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn before { MockitoAnnotations.initMocks(this) sparkConf = new SparkConf() -<<<<<<< HEAD .set("spark.app.id", APP_ID) .set(KUBERNETES_DRIVER_POD_NAME, DRIVER_POD_NAME) .set(KUBERNETES_NAMESPACE, NAMESPACE) .set("spark.driver.host", SPARK_DRIVER_HOST) .set("spark.driver.port", SPARK_DRIVER_PORT.toString) .set(KUBERNETES_ALLOCATION_BATCH_DELAY, POD_ALLOCATION_INTERVAL) -======= - .set(KUBERNETES_DRIVER_POD_NAME, DRIVER_POD_NAME) - .set(KUBERNETES_NAMESPACE, NAMESPACE) - .set("spark.driver.host", SPARK_DRIVER_HOST) - .set("spark.driver.port", 
SPARK_DRIVER_PORT.toString) - .set(KUBERNETES_ALLOCATION_BATCH_DELAY.key, POD_ALLOCATION_INTERVAL) ->>>>>>> master executorPodsWatcherArgument = ArgumentCaptor.forClass(classOf[Watcher[Pod]]) allocatorRunnable = ArgumentCaptor.forClass(classOf[Runnable]) requestExecutorRunnable = ArgumentCaptor.forClass(classOf[Runnable]) @@ -206,31 +156,19 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn when(kubernetesClient.pods()).thenReturn(podOperations) when(podOperations.withLabel(SPARK_APP_ID_LABEL, APP_ID)).thenReturn(podsWithLabelOperations) when(podsWithLabelOperations.watch(executorPodsWatcherArgument.capture())) -<<<<<<< HEAD .thenReturn(executorPodsWatch) -======= - .thenReturn(executorPodsWatch) ->>>>>>> master when(podOperations.inNamespace(NAMESPACE)).thenReturn(podsInNamespace) when(podsInNamespace.withName(DRIVER_POD_NAME)).thenReturn(podsWithDriverName) when(podsWithDriverName.get()).thenReturn(driverPod) when(allocatorExecutor.scheduleWithFixedDelay( -<<<<<<< HEAD allocatorRunnable.capture(), mockitoEq(0L), mockitoEq(POD_ALLOCATION_INTERVAL), mockitoEq(TimeUnit.SECONDS))).thenReturn(null) -======= - allocatorRunnable.capture(), - mockitoEq(0L), - mockitoEq(TimeUnit.MINUTES.toMillis(1)), - mockitoEq(TimeUnit.MILLISECONDS))).thenReturn(null) ->>>>>>> master // Creating Futures in Scala backed by a Java executor service resolves to running // ExecutorService#execute (as opposed to submit) doNothing().when(requestExecutorsService).execute(requestExecutorRunnable.capture()) when(rpcEnv.setupEndpoint( -<<<<<<< HEAD mockitoEq(CoarseGrainedSchedulerBackend.ENDPOINT_NAME), driverEndpoint.capture())) .thenReturn(driverEndpointRef) when(driverEndpointRef.ask[Boolean] @@ -246,33 +184,11 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn assert(allocatorRunnable.getValue != null) scheduler.stop() verify(shuffleManager).stop() -======= - mockitoEq(CoarseGrainedSchedulerBackend.ENDPOINT_NAME), driverEndpoint.capture())) - .thenReturn(driverEndpointRef) - - // Used by the CoarseGrainedSchedulerBackend when making RPC calls. - when(driverEndpointRef.ask[Boolean] - (any(classOf[Any])) - (any())).thenReturn(successFuture) - when(successFuture.failed).thenReturn(Future[Throwable] { - // emulate behavior of the Future.failed method. 
- throw new NoSuchElementException() - }(ThreadUtils.sameThread)) - } - - test("Basic lifecycle expectations when starting and stopping the scheduler.") { - val scheduler = newSchedulerBackend() - scheduler.start() - assert(executorPodsWatcherArgument.getValue != null) - assert(allocatorRunnable.getValue != null) - scheduler.stop() ->>>>>>> master verify(executorPodsWatch).close() } test("Static allocation should request executors upon first allocator run.") { sparkConf -<<<<<<< HEAD .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 2) .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 2) val scheduler = newSchedulerBackend(true) @@ -284,24 +200,10 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn allocatorRunnable.getValue.run() verify(podOperations).create(FIRST_EXECUTOR_POD) verify(podOperations).create(SECOND_EXECUTOR_POD) -======= - .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 2) - .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 2) - val scheduler = newSchedulerBackend() - scheduler.start() - requestExecutorRunnable.getValue.run() - val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) - val secondResolvedPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) - when(podOperations.create(any(classOf[Pod]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) - allocatorRunnable.getValue.run() - verify(podOperations).create(firstResolvedPod) - verify(podOperations).create(secondResolvedPod) ->>>>>>> master } test("Killing executors deletes the executor pods") { sparkConf -<<<<<<< HEAD .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 2) .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 2) val scheduler = newSchedulerBackend(true) @@ -316,40 +218,17 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn requestExecutorRunnable.getAllValues.asScala.last.run() verify(podOperations).delete(SECOND_EXECUTOR_POD) verify(podOperations, never()).delete(FIRST_EXECUTOR_POD) -======= - .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 2) - .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 2) - val scheduler = newSchedulerBackend() - scheduler.start() - requestExecutorRunnable.getValue.run() - val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) - val secondResolvedPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) - when(podOperations.create(any(classOf[Pod]))) - .thenAnswer(AdditionalAnswers.returnsFirstArg()) - allocatorRunnable.getValue.run() - scheduler.doKillExecutors(Seq("2")) - requestExecutorRunnable.getAllValues.asScala.last.run() - verify(podOperations).delete(secondResolvedPod) - verify(podOperations, never()).delete(firstResolvedPod) ->>>>>>> master } test("Executors should be requested in batches.") { sparkConf -<<<<<<< HEAD .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 2) val scheduler = newSchedulerBackend(true) -======= - .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) - .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 2) - val scheduler = newSchedulerBackend() ->>>>>>> master scheduler.start() requestExecutorRunnable.getValue.run() when(podOperations.create(any(classOf[Pod]))) .thenAnswer(AdditionalAnswers.returnsFirstArg()) -<<<<<<< HEAD expectPodCreationWithId(1, FIRST_EXECUTOR_POD) expectPodCreationWithId(2, SECOND_EXECUTOR_POD) allocatorRunnable.getValue.run() @@ -375,56 +254,18 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn when(podOperations.create(any(classOf[Pod]))) 
.thenAnswer(AdditionalAnswers.returnsFirstArg()) expectPodCreationWithId(1, FIRST_EXECUTOR_POD) -======= - val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) - val secondResolvedPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) - allocatorRunnable.getValue.run() - verify(podOperations).create(firstResolvedPod) - verify(podOperations, never()).create(secondResolvedPod) - val registerFirstExecutorMessage = RegisterExecutor( - "1", mock[RpcEndpointRef], "localhost", 1, Map.empty[String, String]) - when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) - driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) - .apply(registerFirstExecutorMessage) - allocatorRunnable.getValue.run() - verify(podOperations).create(secondResolvedPod) - } - - test("Scaled down executors should be cleaned up") { - sparkConf - .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) - .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1) - val scheduler = newSchedulerBackend() - scheduler.start() - - // The scheduler backend spins up one executor pod. - requestExecutorRunnable.getValue.run() - when(podOperations.create(any(classOf[Pod]))) - .thenAnswer(AdditionalAnswers.returnsFirstArg()) - val resolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) ->>>>>>> master allocatorRunnable.getValue.run() val executorEndpointRef = mock[RpcEndpointRef] when(executorEndpointRef.address).thenReturn(RpcAddress("pod.example.com", 9000)) val registerFirstExecutorMessage = RegisterExecutor( -<<<<<<< HEAD "1", executorEndpointRef, "localhost:9000", 1, Map.empty[String, String]) when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) .apply(registerFirstExecutorMessage) -======= - "1", executorEndpointRef, "localhost:9000", 1, Map.empty[String, String]) - when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) - driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) - .apply(registerFirstExecutorMessage) - - // Request that there are 0 executors and trigger deletion from driver. ->>>>>>> master scheduler.doRequestTotalExecutors(0) requestExecutorRunnable.getAllValues.asScala.last.run() scheduler.doKillExecutors(Seq("1")) requestExecutorRunnable.getAllValues.asScala.last.run() -<<<<<<< HEAD verify(podOperations, times(1)).delete(FIRST_EXECUTOR_POD) driverEndpoint.getValue.onDisconnected(executorEndpointRef.address) @@ -447,41 +288,12 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn val scheduler = newSchedulerBackend(true) scheduler.start() expectPodCreationWithId(1, FIRST_EXECUTOR_POD) -======= - verify(podOperations, times(1)).delete(resolvedPod) - driverEndpoint.getValue.onDisconnected(executorEndpointRef.address) - - val exitedPod = exitPod(resolvedPod, 0) - executorPodsWatcherArgument.getValue.eventReceived(Action.DELETED, exitedPod) - allocatorRunnable.getValue.run() - - // No more deletion attempts of the executors. - // This is graceful termination and should not be detected as a failure. 
- verify(podOperations, times(1)).delete(resolvedPod) - verify(driverEndpointRef, times(1)).send( - RemoveExecutor("1", ExecutorExited( - 0, - exitCausedByApp = false, - s"Container in pod ${exitedPod.getMetadata.getName} exited from" + - s" explicit termination request."))) - } - - test("Executors that fail should not be deleted.") { - sparkConf - .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) - .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1) - - val scheduler = newSchedulerBackend() - scheduler.start() - val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) ->>>>>>> master when(podOperations.create(any(classOf[Pod]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) requestExecutorRunnable.getValue.run() allocatorRunnable.getValue.run() val executorEndpointRef = mock[RpcEndpointRef] when(executorEndpointRef.address).thenReturn(RpcAddress("pod.example.com", 9000)) val registerFirstExecutorMessage = RegisterExecutor( -<<<<<<< HEAD "1", executorEndpointRef, "localhost:9000", 1, Map.empty[String, String]) when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) @@ -511,47 +323,12 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn val scheduler = newSchedulerBackend(true) scheduler.start() expectPodCreationWithId(1, FIRST_EXECUTOR_POD) -======= - "1", executorEndpointRef, "localhost:9000", 1, Map.empty[String, String]) - when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) - driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) - .apply(registerFirstExecutorMessage) - driverEndpoint.getValue.onDisconnected(executorEndpointRef.address) - executorPodsWatcherArgument.getValue.eventReceived( - Action.ERROR, exitPod(firstResolvedPod, 1)) - - // A replacement executor should be created but the error pod should persist. 
- val replacementPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) - scheduler.doRequestTotalExecutors(1) - requestExecutorRunnable.getValue.run() - allocatorRunnable.getAllValues.asScala.last.run() - verify(podOperations, never()).delete(firstResolvedPod) - verify(driverEndpointRef).send( - RemoveExecutor("1", ExecutorExited( - 1, - exitCausedByApp = true, - s"Pod ${FIRST_EXECUTOR_POD.getMetadata.getName}'s executor container exited with" + - " exit status code 1."))) - } - - test("Executors disconnected due to unknown reasons are deleted and replaced.") { - sparkConf - .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) - .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1) - val executorLostReasonCheckMaxAttempts = sparkConf.get( - KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS) - - val scheduler = newSchedulerBackend() - scheduler.start() - val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) ->>>>>>> master when(podOperations.create(any(classOf[Pod]))).thenAnswer(AdditionalAnswers.returnsFirstArg()) requestExecutorRunnable.getValue.run() allocatorRunnable.getValue.run() val executorEndpointRef = mock[RpcEndpointRef] when(executorEndpointRef.address).thenReturn(RpcAddress("pod.example.com", 9000)) val registerFirstExecutorMessage = RegisterExecutor( -<<<<<<< HEAD "1", executorEndpointRef, "localhost:9000", 1, Map.empty[String, String]) when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) @@ -581,76 +358,6 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn private def exitPod(basePod: Pod, exitCode: Int): Pod = { new PodBuilder(FIRST_EXECUTOR_POD) -======= - "1", executorEndpointRef, "localhost:9000", 1, Map.empty[String, String]) - when(taskSchedulerImpl.resourceOffers(any())).thenReturn(Seq.empty) - driverEndpoint.getValue.receiveAndReply(mock[RpcCallContext]) - .apply(registerFirstExecutorMessage) - - driverEndpoint.getValue.onDisconnected(executorEndpointRef.address) - 1 to executorLostReasonCheckMaxAttempts foreach { _ => - allocatorRunnable.getValue.run() - verify(podOperations, never()).delete(FIRST_EXECUTOR_POD) - } - - val recreatedResolvedPod = expectPodCreationWithId(2, SECOND_EXECUTOR_POD) - allocatorRunnable.getValue.run() - verify(podOperations).delete(firstResolvedPod) - verify(driverEndpointRef).send( - RemoveExecutor("1", SlaveLost("Executor lost for unknown reasons."))) - } - - test("Executors that fail to start on the Kubernetes API call rebuild in the next batch.") { - sparkConf - .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) - .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1) - val scheduler = newSchedulerBackend() - scheduler.start() - val firstResolvedPod = expectPodCreationWithId(1, FIRST_EXECUTOR_POD) - when(podOperations.create(firstResolvedPod)) - .thenThrow(new RuntimeException("test")) - requestExecutorRunnable.getValue.run() - allocatorRunnable.getValue.run() - verify(podOperations, times(1)).create(firstResolvedPod) - val recreatedResolvedPod = expectPodCreationWithId(2, FIRST_EXECUTOR_POD) - allocatorRunnable.getValue.run() - verify(podOperations).create(recreatedResolvedPod) - } - - test("Executors that are initially created but the watch notices them fail are rebuilt" + - " in the next batch.") { - sparkConf - .set(KUBERNETES_ALLOCATION_BATCH_SIZE, 1) - .set(org.apache.spark.internal.config.EXECUTOR_INSTANCES, 1) - val scheduler = newSchedulerBackend() - scheduler.start() - val firstResolvedPod = 
expectPodCreationWithId(1, FIRST_EXECUTOR_POD) - when(podOperations.create(FIRST_EXECUTOR_POD)).thenAnswer(AdditionalAnswers.returnsFirstArg()) - requestExecutorRunnable.getValue.run() - allocatorRunnable.getValue.run() - verify(podOperations, times(1)).create(firstResolvedPod) - executorPodsWatcherArgument.getValue.eventReceived(Action.ERROR, firstResolvedPod) - val recreatedResolvedPod = expectPodCreationWithId(2, FIRST_EXECUTOR_POD) - allocatorRunnable.getValue.run() - verify(podOperations).create(recreatedResolvedPod) - } - - private def newSchedulerBackend(): KubernetesClusterSchedulerBackend = { - new KubernetesClusterSchedulerBackend( - taskSchedulerImpl, - rpcEnv, - executorPodFactory, - kubernetesClient, - allocatorExecutor, - requestExecutorsService) { - - override def applicationId(): String = APP_ID - } - } - - private def exitPod(basePod: Pod, exitCode: Int): Pod = { - new PodBuilder(basePod) ->>>>>>> master .editStatus() .addNewContainerStatus() .withNewState() @@ -663,7 +370,6 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn .build() } -<<<<<<< HEAD private def expectPodCreationWithId(executorId: Int, expectedPod: Pod): Unit = { when(executorPodFactory.createExecutorPod( executorId.toString, @@ -674,21 +380,4 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn Map.empty)).thenReturn(expectedPod) } -======= - private def expectPodCreationWithId(executorId: Int, expectedPod: Pod): Pod = { - val resolvedPod = new PodBuilder(expectedPod) - .editMetadata() - .addToLabels(SPARK_EXECUTOR_ID_LABEL, executorId.toString) - .endMetadata() - .build() - when(executorPodFactory.createExecutorPod( - executorId.toString, - APP_ID, - DRIVER_URL, - sparkConf.getExecutorEnv, - driverPod, - Map.empty)).thenReturn(resolvedPod) - resolvedPod - } ->>>>>>> master } diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java index 7c0d6b80c226d..e511d3da7e160 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedColumnReader.java @@ -17,16 +17,12 @@ package org.apache.spark.sql.execution.datasources.parquet; -<<<<<<< HEAD import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL; import static org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.ValuesReaderIntIterator; import static org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase.createRLEIterator; -======= -import java.io.IOException; -import java.util.TimeZone; ->>>>>>> master import java.io.IOException; +import java.util.TimeZone; import org.apache.parquet.bytes.BytesUtils; import org.apache.parquet.column.ColumnDescriptor; import org.apache.parquet.column.Dictionary; From 55dbfbca37ce4c05f83180777ba3d4fe2d96a02e Mon Sep 17 00:00:00 2001 From: Sameer Agarwal Date: Fri, 12 Jan 2018 15:00:00 -0800 Subject: [PATCH 327/356] Revert "[SPARK-22908] Add kafka source and sink for continuous processing." This reverts commit 6f7aaed805070d29dcba32e04ca7a1f581fa54b9. 
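
At a high level, the Kafka connector goes back to the pre-SPARK-22908 V1 code paths: KafkaSourceProvider no longer mixes in ContinuousReadSupport/ContinuousWriteSupport, KafkaSourceOffset extends the V1 Offset again, and the startingOffsets verification that SPARK-22908 had folded into KafkaOffsetReader.fetchSpecificOffsets moves back into a private helper on KafkaSource. A condensed view of that restored helper, taken from the KafkaSource.scala hunk below (comments added here for readability; names are the ones the hunk reintroduces):

```
private def fetchAndVerify(specificOffsets: Map[TopicPartition, Long]) = {
  // Resolve the requested offsets; the LATEST/EARLIEST sentinels (-1/-2) map to
  // the latest/earliest Kafka seek positions.
  val result = kafkaReader.fetchSpecificOffsets(specificOffsets)
  specificOffsets.foreach {
    case (tp, off) if off != KafkaOffsetRangeLimit.LATEST &&
        off != KafkaOffsetRangeLimit.EARLIEST =>
      // A concrete requested offset that the consumer reset away from is reported as data loss.
      if (result(tp) != off) {
        reportDataLoss(
          s"startingOffsets for $tp was $off but consumer reset to ${result(tp)}")
      }
    case _ =>
      // No real way to check that a beginning or end request is reasonable.
  }
  KafkaSourceOffset(result)
}
```
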
--- .../sql/kafka010/KafkaContinuousReader.scala | 232 --------- .../sql/kafka010/KafkaContinuousWriter.scala | 119 ----- .../sql/kafka010/KafkaOffsetReader.scala | 21 +- .../spark/sql/kafka010/KafkaSource.scala | 17 +- .../sql/kafka010/KafkaSourceOffset.scala | 7 +- .../sql/kafka010/KafkaSourceProvider.scala | 105 +--- .../spark/sql/kafka010/KafkaWriteTask.scala | 71 +-- .../spark/sql/kafka010/KafkaWriter.scala | 5 +- .../kafka010/KafkaContinuousSinkSuite.scala | 474 ------------------ .../kafka010/KafkaContinuousSourceSuite.scala | 96 ---- .../sql/kafka010/KafkaContinuousTest.scala | 64 --- .../spark/sql/kafka010/KafkaSourceSuite.scala | 470 ++++++++--------- .../apache/spark/sql/DataFrameReader.scala | 32 +- .../apache/spark/sql/DataFrameWriter.scala | 25 +- .../datasources/v2/WriteToDataSourceV2.scala | 8 +- .../execution/streaming/StreamExecution.scala | 15 +- .../ContinuousDataSourceRDDIter.scala | 3 +- .../continuous/ContinuousExecution.scala | 67 +-- .../continuous/EpochCoordinator.scala | 21 +- .../sql/streaming/DataStreamWriter.scala | 26 +- .../spark/sql/streaming/StreamTest.scala | 36 +- 21 files changed, 383 insertions(+), 1531 deletions(-) delete mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala delete mode 100644 external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala delete mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala delete mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala delete mode 100644 external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala deleted file mode 100644 index 928379544758c..0000000000000 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousReader.scala +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.kafka010 - -import java.{util => ju} - -import org.apache.kafka.clients.consumer.ConsumerRecord -import org.apache.kafka.common.TopicPartition - -import org.apache.spark.internal.Logging -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.expressions.UnsafeRow -import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.kafka010.KafkaSource.{INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE, INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE} -import org.apache.spark.sql.sources.v2.reader._ -import org.apache.spark.sql.sources.v2.streaming.reader.{ContinuousDataReader, ContinuousReader, Offset, PartitionOffset} -import org.apache.spark.sql.types.StructType -import org.apache.spark.unsafe.types.UTF8String - -/** - * A [[ContinuousReader]] for data from kafka. - * - * @param offsetReader a reader used to get kafka offsets. Note that the actual data will be - * read by per-task consumers generated later. - * @param kafkaParams String params for per-task Kafka consumers. - * @param sourceOptions The [[org.apache.spark.sql.sources.v2.DataSourceV2Options]] params which - * are not Kafka consumer params. - * @param metadataPath Path to a directory this reader can use for writing metadata. - * @param initialOffsets The Kafka offsets to start reading data at. - * @param failOnDataLoss Flag indicating whether reading should fail in data loss - * scenarios, where some offsets after the specified initial ones can't be - * properly read. - */ -class KafkaContinuousReader( - offsetReader: KafkaOffsetReader, - kafkaParams: ju.Map[String, Object], - sourceOptions: Map[String, String], - metadataPath: String, - initialOffsets: KafkaOffsetRangeLimit, - failOnDataLoss: Boolean) - extends ContinuousReader with SupportsScanUnsafeRow with Logging { - - private lazy val session = SparkSession.getActiveSession.get - private lazy val sc = session.sparkContext - - // Initialized when creating read tasks. If this diverges from the partitions at the latest - // offsets, we need to reconfigure. - // Exposed outside this object only for unit tests. 
- private[sql] var knownPartitions: Set[TopicPartition] = _ - - override def readSchema: StructType = KafkaOffsetReader.kafkaSchema - - private var offset: Offset = _ - override def setOffset(start: ju.Optional[Offset]): Unit = { - offset = start.orElse { - val offsets = initialOffsets match { - case EarliestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchEarliestOffsets()) - case LatestOffsetRangeLimit => KafkaSourceOffset(offsetReader.fetchLatestOffsets()) - case SpecificOffsetRangeLimit(p) => offsetReader.fetchSpecificOffsets(p, reportDataLoss) - } - logInfo(s"Initial offsets: $offsets") - offsets - } - } - - override def getStartOffset(): Offset = offset - - override def deserializeOffset(json: String): Offset = { - KafkaSourceOffset(JsonUtils.partitionOffsets(json)) - } - - override def createUnsafeRowReadTasks(): ju.List[ReadTask[UnsafeRow]] = { - import scala.collection.JavaConverters._ - - val oldStartPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(offset) - - val currentPartitionSet = offsetReader.fetchEarliestOffsets().keySet - val newPartitions = currentPartitionSet.diff(oldStartPartitionOffsets.keySet) - val newPartitionOffsets = offsetReader.fetchEarliestOffsets(newPartitions.toSeq) - - val deletedPartitions = oldStartPartitionOffsets.keySet.diff(currentPartitionSet) - if (deletedPartitions.nonEmpty) { - reportDataLoss(s"Some partitions were deleted: $deletedPartitions") - } - - val startOffsets = newPartitionOffsets ++ - oldStartPartitionOffsets.filterKeys(!deletedPartitions.contains(_)) - knownPartitions = startOffsets.keySet - - startOffsets.toSeq.map { - case (topicPartition, start) => - KafkaContinuousReadTask( - topicPartition, start, kafkaParams, failOnDataLoss) - .asInstanceOf[ReadTask[UnsafeRow]] - }.asJava - } - - /** Stop this source and free any resources it has allocated. */ - def stop(): Unit = synchronized { - offsetReader.close() - } - - override def commit(end: Offset): Unit = {} - - override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { - val mergedMap = offsets.map { - case KafkaSourcePartitionOffset(p, o) => Map(p -> o) - }.reduce(_ ++ _) - KafkaSourceOffset(mergedMap) - } - - override def needsReconfiguration(): Boolean = { - knownPartitions != null && offsetReader.fetchLatestOffsets().keySet != knownPartitions - } - - override def toString(): String = s"KafkaSource[$offsetReader]" - - /** - * If `failOnDataLoss` is true, this method will throw an `IllegalStateException`. - * Otherwise, just log a warning. - */ - private def reportDataLoss(message: String): Unit = { - if (failOnDataLoss) { - throw new IllegalStateException(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_TRUE") - } else { - logWarning(message + s". $INSTRUCTION_FOR_FAIL_ON_DATA_LOSS_FALSE") - } - } -} - -/** - * A read task for continuous Kafka processing. This will be serialized and transformed into a - * full reader on executors. - * - * @param topicPartition The (topic, partition) pair this task is responsible for. - * @param startOffset The offset to start reading from within the partition. - * @param kafkaParams Kafka consumer params to use. - * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets - * are skipped. 
- */ -case class KafkaContinuousReadTask( - topicPartition: TopicPartition, - startOffset: Long, - kafkaParams: ju.Map[String, Object], - failOnDataLoss: Boolean) extends ReadTask[UnsafeRow] { - override def createDataReader(): KafkaContinuousDataReader = { - new KafkaContinuousDataReader(topicPartition, startOffset, kafkaParams, failOnDataLoss) - } -} - -/** - * A per-task data reader for continuous Kafka processing. - * - * @param topicPartition The (topic, partition) pair this data reader is responsible for. - * @param startOffset The offset to start reading from within the partition. - * @param kafkaParams Kafka consumer params to use. - * @param failOnDataLoss Flag indicating whether data reader should fail if some offsets - * are skipped. - */ -class KafkaContinuousDataReader( - topicPartition: TopicPartition, - startOffset: Long, - kafkaParams: ju.Map[String, Object], - failOnDataLoss: Boolean) extends ContinuousDataReader[UnsafeRow] { - private val topic = topicPartition.topic - private val kafkaPartition = topicPartition.partition - private val consumer = CachedKafkaConsumer.createUncached(topic, kafkaPartition, kafkaParams) - - private val sharedRow = new UnsafeRow(7) - private val bufferHolder = new BufferHolder(sharedRow) - private val rowWriter = new UnsafeRowWriter(bufferHolder, 7) - - private var nextKafkaOffset = startOffset - private var currentRecord: ConsumerRecord[Array[Byte], Array[Byte]] = _ - - override def next(): Boolean = { - var r: ConsumerRecord[Array[Byte], Array[Byte]] = null - while (r == null) { - r = consumer.get( - nextKafkaOffset, - untilOffset = Long.MaxValue, - pollTimeoutMs = Long.MaxValue, - failOnDataLoss) - } - nextKafkaOffset = r.offset + 1 - currentRecord = r - true - } - - override def get(): UnsafeRow = { - bufferHolder.reset() - - if (currentRecord.key == null) { - rowWriter.setNullAt(0) - } else { - rowWriter.write(0, currentRecord.key) - } - rowWriter.write(1, currentRecord.value) - rowWriter.write(2, UTF8String.fromString(currentRecord.topic)) - rowWriter.write(3, currentRecord.partition) - rowWriter.write(4, currentRecord.offset) - rowWriter.write(5, - DateTimeUtils.fromJavaTimestamp(new java.sql.Timestamp(currentRecord.timestamp))) - rowWriter.write(6, currentRecord.timestampType.id) - sharedRow.setTotalSize(bufferHolder.totalSize) - sharedRow - } - - override def getOffset(): KafkaSourcePartitionOffset = { - KafkaSourcePartitionOffset(topicPartition, nextKafkaOffset) - } - - override def close(): Unit = { - consumer.close() - } -} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala deleted file mode 100644 index 9843f469c5b25..0000000000000 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousWriter.scala +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.kafka010 - -import org.apache.kafka.clients.producer.{Callback, ProducerRecord, RecordMetadata} -import scala.collection.JavaConverters._ - -import org.apache.spark.internal.Logging -import org.apache.spark.sql.{Row, SparkSession} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Literal, UnsafeProjection} -import org.apache.spark.sql.kafka010.KafkaSourceProvider.{kafkaParamsForProducer, TOPIC_OPTION_KEY} -import org.apache.spark.sql.kafka010.KafkaWriter.validateQuery -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter -import org.apache.spark.sql.sources.v2.writer._ -import org.apache.spark.sql.streaming.OutputMode -import org.apache.spark.sql.types.{BinaryType, StringType, StructType} - -/** - * Dummy commit message. The DataSourceV2 framework requires a commit message implementation but we - * don't need to really send one. - */ -case object KafkaWriterCommitMessage extends WriterCommitMessage - -/** - * A [[ContinuousWriter]] for Kafka writing. Responsible for generating the writer factory. - * @param topic The topic this writer is responsible for. If None, topic will be inferred from - * a `topic` field in the incoming data. - * @param producerParams Parameters for Kafka producers in each task. - * @param schema The schema of the input data. - */ -class KafkaContinuousWriter( - topic: Option[String], producerParams: Map[String, String], schema: StructType) - extends ContinuousWriter with SupportsWriteInternalRow { - - validateQuery(schema.toAttributes, producerParams.toMap[String, Object].asJava, topic) - - override def createInternalRowWriterFactory(): KafkaContinuousWriterFactory = - KafkaContinuousWriterFactory(topic, producerParams, schema) - - override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} - override def abort(messages: Array[WriterCommitMessage]): Unit = {} -} - -/** - * A [[DataWriterFactory]] for Kafka writing. Will be serialized and sent to executors to generate - * the per-task data writers. - * @param topic The topic that should be written to. If None, topic will be inferred from - * a `topic` field in the incoming data. - * @param producerParams Parameters for Kafka producers in each task. - * @param schema The schema of the input data. - */ -case class KafkaContinuousWriterFactory( - topic: Option[String], producerParams: Map[String, String], schema: StructType) - extends DataWriterFactory[InternalRow] { - - override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = { - new KafkaContinuousDataWriter(topic, producerParams, schema.toAttributes) - } -} - -/** - * A [[DataWriter]] for Kafka writing. One data writer will be created in each partition to - * process incoming rows. - * - * @param targetTopic The topic that this data writer is targeting. If None, topic will be inferred - * from a `topic` field in the incoming data. - * @param producerParams Parameters to use for the Kafka producer. - * @param inputSchema The attributes in the input data. 
- */ -class KafkaContinuousDataWriter( - targetTopic: Option[String], producerParams: Map[String, String], inputSchema: Seq[Attribute]) - extends KafkaRowWriter(inputSchema, targetTopic) with DataWriter[InternalRow] { - import scala.collection.JavaConverters._ - - private lazy val producer = CachedKafkaProducer.getOrCreate( - new java.util.HashMap[String, Object](producerParams.asJava)) - - def write(row: InternalRow): Unit = { - checkForErrors() - sendRow(row, producer) - } - - def commit(): WriterCommitMessage = { - // Send is asynchronous, but we can't commit until all rows are actually in Kafka. - // This requires flushing and then checking that no callbacks produced errors. - // We also check for errors before to fail as soon as possible - the check is cheap. - checkForErrors() - producer.flush() - checkForErrors() - KafkaWriterCommitMessage - } - - def abort(): Unit = {} - - def close(): Unit = { - checkForErrors() - if (producer != null) { - producer.flush() - checkForErrors() - CachedKafkaProducer.close(new java.util.HashMap[String, Object](producerParams.asJava)) - } - } -} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala index 551641cfdbca8..3e65949a6fd1b 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala @@ -117,14 +117,10 @@ private[kafka010] class KafkaOffsetReader( * Resolves the specific offsets based on Kafka seek positions. * This method resolves offset value -1 to the latest and -2 to the * earliest Kafka seek position. - * - * @param partitionOffsets the specific offsets to resolve - * @param reportDataLoss callback to either report or log data loss depending on setting */ def fetchSpecificOffsets( - partitionOffsets: Map[TopicPartition, Long], - reportDataLoss: String => Unit): KafkaSourceOffset = { - val fetched = runUninterruptibly { + partitionOffsets: Map[TopicPartition, Long]): Map[TopicPartition, Long] = + runUninterruptibly { withRetriesWithoutInterrupt { // Poll to get the latest assigned partitions consumer.poll(0) @@ -149,19 +145,6 @@ private[kafka010] class KafkaOffsetReader( } } - partitionOffsets.foreach { - case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && - off != KafkaOffsetRangeLimit.EARLIEST => - if (fetched(tp) != off) { - reportDataLoss( - s"startingOffsets for $tp was $off but consumer reset to ${fetched(tp)}") - } - case _ => - // no real way to check that beginning or end is reasonable - } - KafkaSourceOffset(fetched) - } - /** * Fetch the earliest offsets for the topic partitions that are indicated * in the [[ConsumerStrategy]]. 
diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index 27da76068a66f..e9cff04ba5f2e 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -130,7 +130,7 @@ private[kafka010] class KafkaSource( val offsets = startingOffsets match { case EarliestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchEarliestOffsets()) case LatestOffsetRangeLimit => KafkaSourceOffset(kafkaReader.fetchLatestOffsets()) - case SpecificOffsetRangeLimit(p) => kafkaReader.fetchSpecificOffsets(p, reportDataLoss) + case SpecificOffsetRangeLimit(p) => fetchAndVerify(p) } metadataLog.add(0, offsets) logInfo(s"Initial offsets: $offsets") @@ -138,6 +138,21 @@ private[kafka010] class KafkaSource( }.partitionToOffsets } + private def fetchAndVerify(specificOffsets: Map[TopicPartition, Long]) = { + val result = kafkaReader.fetchSpecificOffsets(specificOffsets) + specificOffsets.foreach { + case (tp, off) if off != KafkaOffsetRangeLimit.LATEST && + off != KafkaOffsetRangeLimit.EARLIEST => + if (result(tp) != off) { + reportDataLoss( + s"startingOffsets for $tp was $off but consumer reset to ${result(tp)}") + } + case _ => + // no real way to check that beginning or end is reasonable + } + KafkaSourceOffset(result) + } + private var currentPartitionOffsets: Option[Map[TopicPartition, Long]] = None override def schema: StructType = KafkaOffsetReader.kafkaSchema diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala index c82154cfbad7f..b5da415b3097e 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala @@ -20,22 +20,17 @@ package org.apache.spark.sql.kafka010 import org.apache.kafka.common.TopicPartition import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset} -import org.apache.spark.sql.sources.v2.streaming.reader.{Offset => OffsetV2, PartitionOffset} /** * An [[Offset]] for the [[KafkaSource]]. This one tracks all partitions of subscribed topics and * their offsets. 
*/ private[kafka010] -case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends OffsetV2 { +case class KafkaSourceOffset(partitionToOffsets: Map[TopicPartition, Long]) extends Offset { override val json = JsonUtils.partitionOffsets(partitionToOffsets) } -private[kafka010] -case class KafkaSourcePartitionOffset(topicPartition: TopicPartition, partitionOffset: Long) - extends PartitionOffset - /** Companion object of the [[KafkaSourceOffset]] */ private[kafka010] object KafkaSourceOffset { diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala index 3914370a96595..3cb4d8cad12cc 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.kafka010 import java.{util => ju} -import java.util.{Locale, Optional, UUID} +import java.util.{Locale, UUID} import scala.collection.JavaConverters._ @@ -27,12 +27,9 @@ import org.apache.kafka.clients.producer.ProducerConfig import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer} import org.apache.spark.internal.Logging -import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SparkSession, SQLContext} -import org.apache.spark.sql.execution.streaming.{Offset, Sink, Source} +import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} +import org.apache.spark.sql.execution.streaming.{Sink, Source} import org.apache.spark.sql.sources._ -import org.apache.spark.sql.sources.v2.{DataSourceV2, DataSourceV2Options} -import org.apache.spark.sql.sources.v2.streaming.{ContinuousReadSupport, ContinuousWriteSupport} -import org.apache.spark.sql.sources.v2.streaming.writer.ContinuousWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType @@ -46,8 +43,6 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister with StreamSinkProvider with RelationProvider with CreatableRelationProvider - with ContinuousWriteSupport - with ContinuousReadSupport with Logging { import KafkaSourceProvider._ @@ -106,43 +101,6 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister failOnDataLoss(caseInsensitiveParams)) } - override def createContinuousReader( - schema: Optional[StructType], - metadataPath: String, - options: DataSourceV2Options): KafkaContinuousReader = { - val parameters = options.asMap().asScala.toMap - validateStreamOptions(parameters) - // Each running query should use its own group id. Otherwise, the query may be only assigned - // partial data since Kafka will assign partitions to multiple consumers having the same group - // id. Hence, we should generate a unique id for each query. 
- val uniqueGroupId = s"spark-kafka-source-${UUID.randomUUID}-${metadataPath.hashCode}" - - val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } - val specifiedKafkaParams = - parameters - .keySet - .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) - .map { k => k.drop(6).toString -> parameters(k) } - .toMap - - val startingStreamOffsets = KafkaSourceProvider.getKafkaOffsetRangeLimit(caseInsensitiveParams, - STARTING_OFFSETS_OPTION_KEY, LatestOffsetRangeLimit) - - val kafkaOffsetReader = new KafkaOffsetReader( - strategy(caseInsensitiveParams), - kafkaParamsForDriver(specifiedKafkaParams), - parameters, - driverGroupIdPrefix = s"$uniqueGroupId-driver") - - new KafkaContinuousReader( - kafkaOffsetReader, - kafkaParamsForExecutors(specifiedKafkaParams, uniqueGroupId), - parameters, - metadataPath, - startingStreamOffsets, - failOnDataLoss(caseInsensitiveParams)) - } - /** * Returns a new base relation with the given parameters. * @@ -223,22 +181,26 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister } } - override def createContinuousWriter( - queryId: String, - schema: StructType, - mode: OutputMode, - options: DataSourceV2Options): Optional[ContinuousWriter] = { - import scala.collection.JavaConverters._ - - val spark = SparkSession.getActiveSession.get - val topic = Option(options.get(TOPIC_OPTION_KEY).orElse(null)).map(_.trim) - // We convert the options argument from V2 -> Java map -> scala mutable -> scala immutable. - val producerParams = kafkaParamsForProducer(options.asMap.asScala.toMap) - - KafkaWriter.validateQuery( - schema.toAttributes, new java.util.HashMap[String, Object](producerParams.asJava), topic) + private def kafkaParamsForProducer(parameters: Map[String, String]): Map[String, String] = { + val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " + + "are serialized with ByteArraySerializer.") + } - Optional.of(new KafkaContinuousWriter(topic, producerParams, schema)) + if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) + { + throw new IllegalArgumentException( + s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " + + "value are serialized with ByteArraySerializer.") + } + parameters + .keySet + .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) + .map { k => k.drop(6).toString -> parameters(k) } + .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, + ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) } private def strategy(caseInsensitiveParams: Map[String, String]) = @@ -488,27 +450,4 @@ private[kafka010] object KafkaSourceProvider extends Logging { def build(): ju.Map[String, Object] = map } - - private[kafka010] def kafkaParamsForProducer( - parameters: Map[String, String]): Map[String, String] = { - val caseInsensitiveParams = parameters.map { case (k, v) => (k.toLowerCase(Locale.ROOT), v) } - if (caseInsensitiveParams.contains(s"kafka.${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}")) { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG}' is not supported as keys " - + "are serialized with ByteArraySerializer.") - } - - if 
(caseInsensitiveParams.contains(s"kafka.${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}")) - { - throw new IllegalArgumentException( - s"Kafka option '${ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG}' is not supported as " - + "value are serialized with ByteArraySerializer.") - } - parameters - .keySet - .filter(_.toLowerCase(Locale.ROOT).startsWith("kafka.")) - .map { k => k.drop(6).toString -> parameters(k) } - .toMap + (ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName, - ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[ByteArraySerializer].getName) - } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala index baa60febf661d..6fd333e2f43ba 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala @@ -33,8 +33,10 @@ import org.apache.spark.sql.types.{BinaryType, StringType} private[kafka010] class KafkaWriteTask( producerConfiguration: ju.Map[String, Object], inputSchema: Seq[Attribute], - topic: Option[String]) extends KafkaRowWriter(inputSchema, topic) { + topic: Option[String]) { // used to synchronize with Kafka callbacks + @volatile private var failedWrite: Exception = null + private val projection = createProjection private var producer: KafkaProducer[Array[Byte], Array[Byte]] = _ /** @@ -44,7 +46,23 @@ private[kafka010] class KafkaWriteTask( producer = CachedKafkaProducer.getOrCreate(producerConfiguration) while (iterator.hasNext && failedWrite == null) { val currentRow = iterator.next() - sendRow(currentRow, producer) + val projectedRow = projection(currentRow) + val topic = projectedRow.getUTF8String(0) + val key = projectedRow.getBinary(1) + val value = projectedRow.getBinary(2) + if (topic == null) { + throw new NullPointerException(s"null topic present in the data. Use the " + + s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") + } + val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) + val callback = new Callback() { + override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { + if (failedWrite == null && e != null) { + failedWrite = e + } + } + } + producer.send(record, callback) } } @@ -56,49 +74,8 @@ private[kafka010] class KafkaWriteTask( producer = null } } -} - -private[kafka010] abstract class KafkaRowWriter( - inputSchema: Seq[Attribute], topic: Option[String]) { - - // used to synchronize with Kafka callbacks - @volatile protected var failedWrite: Exception = _ - protected val projection = createProjection - - private val callback = new Callback() { - override def onCompletion(recordMetadata: RecordMetadata, e: Exception): Unit = { - if (failedWrite == null && e != null) { - failedWrite = e - } - } - } - /** - * Send the specified row to the producer, with a callback that will save any exception - * to failedWrite. Note that send is asynchronous; subclasses must flush() their producer before - * assuming the row is in Kafka. 
- */ - protected def sendRow( - row: InternalRow, producer: KafkaProducer[Array[Byte], Array[Byte]]): Unit = { - val projectedRow = projection(row) - val topic = projectedRow.getUTF8String(0) - val key = projectedRow.getBinary(1) - val value = projectedRow.getBinary(2) - if (topic == null) { - throw new NullPointerException(s"null topic present in the data. Use the " + - s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a default topic.") - } - val record = new ProducerRecord[Array[Byte], Array[Byte]](topic.toString, key, value) - producer.send(record, callback) - } - - protected def checkForErrors(): Unit = { - if (failedWrite != null) { - throw failedWrite - } - } - - private def createProjection = { + private def createProjection: UnsafeProjection = { val topicExpression = topic.map(Literal(_)).orElse { inputSchema.find(_.name == KafkaWriter.TOPIC_ATTRIBUTE_NAME) }.getOrElse { @@ -135,5 +112,11 @@ private[kafka010] abstract class KafkaRowWriter( Seq(topicExpression, Cast(keyExpression, BinaryType), Cast(valueExpression, BinaryType)), inputSchema) } + + private def checkForErrors(): Unit = { + if (failedWrite != null) { + throw failedWrite + } + } } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala index 15cd44812cb0c..5e9ae35b3f008 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala +++ b/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala @@ -43,9 +43,10 @@ private[kafka010] object KafkaWriter extends Logging { override def toString: String = "KafkaWriter" def validateQuery( - schema: Seq[Attribute], + queryExecution: QueryExecution, kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { + val schema = queryExecution.analyzed.output schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse( if (topic.isEmpty) { throw new AnalysisException(s"topic option required when no " + @@ -83,7 +84,7 @@ private[kafka010] object KafkaWriter extends Logging { kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { val schema = queryExecution.analyzed.output - validateQuery(schema, kafkaParameters, topic) + validateQuery(queryExecution, kafkaParameters, topic) queryExecution.toRdd.foreachPartition { iter => val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala deleted file mode 100644 index dfc97b1c38bb5..0000000000000 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSinkSuite.scala +++ /dev/null @@ -1,474 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.kafka010 - -import java.util.Locale -import java.util.concurrent.atomic.AtomicInteger - -import org.apache.kafka.clients.producer.ProducerConfig -import org.apache.kafka.common.serialization.ByteArraySerializer -import org.scalatest.time.SpanSugar._ -import scala.collection.JavaConverters._ - -import org.apache.spark.sql.{AnalysisException, DataFrame, Row, SaveMode} -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, SpecificInternalRow, UnsafeProjection} -import org.apache.spark.sql.execution.streaming.MemoryStream -import org.apache.spark.sql.streaming._ -import org.apache.spark.sql.types.{BinaryType, DataType} -import org.apache.spark.util.Utils - -/** - * This is a temporary port of KafkaSinkSuite, since we do not yet have a V2 memory stream. - * Once we have one, this will be changed to a specialization of KafkaSinkSuite and we won't have - * to duplicate all the code. - */ -class KafkaContinuousSinkSuite extends KafkaContinuousTest { - import testImplicits._ - - override val streamingTimeout = 30.seconds - - override def beforeAll(): Unit = { - super.beforeAll() - testUtils = new KafkaTestUtils( - withBrokerProps = Map("auto.create.topics.enable" -> "false")) - testUtils.setup() - } - - override def afterAll(): Unit = { - if (testUtils != null) { - testUtils.teardown() - testUtils = null - } - super.afterAll() - } - - test("streaming - write to kafka with topic field") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - - val topic = newTopic() - testUtils.createTopic(topic) - - val writer = createKafkaWriter( - input.toDF(), - withTopic = None, - withOutputMode = Some(OutputMode.Append))( - withSelectExpr = s"'$topic' as topic", "value") - - val reader = createKafkaReader(topic) - .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") - .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") - .as[(Int, Int)] - .map(_._2) - - try { - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) - testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) - } finally { - writer.stop() - } - } - - test("streaming - write w/o topic field, with topic option") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - - val topic = newTopic() - testUtils.createTopic(topic) - - val writer = createKafkaWriter( - input.toDF(), - withTopic = Some(topic), - withOutputMode = 
Some(OutputMode.Append()))() - - val reader = createKafkaReader(topic) - .selectExpr("CAST(key as STRING) key", "CAST(value as STRING) value") - .selectExpr("CAST(key as INT) key", "CAST(value as INT) value") - .as[(Int, Int)] - .map(_._2) - - try { - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) - testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) - } finally { - writer.stop() - } - } - - test("streaming - topic field and topic option") { - /* The purpose of this test is to ensure that the topic option - * overrides the topic field. We begin by writing some data that - * includes a topic field and value (e.g., 'foo') along with a topic - * option. Then when we read from the topic specified in the option - * we should see the data i.e., the data was written to the topic - * option, and not to the topic in the data e.g., foo - */ - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - - val topic = newTopic() - testUtils.createTopic(topic) - - val writer = createKafkaWriter( - input.toDF(), - withTopic = Some(topic), - withOutputMode = Some(OutputMode.Append()))( - withSelectExpr = "'foo' as topic", "CAST(value as STRING) value") - - val reader = createKafkaReader(topic) - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .selectExpr("CAST(key AS INT)", "CAST(value AS INT)") - .as[(Int, Int)] - .map(_._2) - - try { - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5) - testUtils.sendMessages(inputTopic, Array("6", "7", "8", "9", "10")) - failAfter(streamingTimeout) { - writer.processAllAvailable() - } - checkDatasetUnorderly(reader, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) - } finally { - writer.stop() - } - } - - test("null topic attribute") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - val topic = newTopic() - testUtils.createTopic(topic) - - /* No topic field or topic option */ - var writer: StreamingQuery = null - var ex: Exception = null - try { - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF())( - withSelectExpr = "CAST(null as STRING) as topic", "value" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getCause.getCause.getMessage - .toLowerCase(Locale.ROOT) - .contains("null topic present in the data.")) - } - - test("streaming - write data with bad schema") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - val topic = newTopic() - 
testUtils.createTopic(topic) - - /* No topic field or topic option */ - var writer: StreamingQuery = null - var ex: Exception = null - try { - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF())( - withSelectExpr = "value as key", "value" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getMessage - .toLowerCase(Locale.ROOT) - .contains("topic option required when no 'topic' attribute is present")) - - try { - /* No value field */ - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF())( - withSelectExpr = s"'$topic' as topic", "value as key" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "required attribute 'value' not found")) - } - - test("streaming - write data with valid schema but wrong types") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - .selectExpr("CAST(value as STRING) value") - val topic = newTopic() - testUtils.createTopic(topic) - - var writer: StreamingQuery = null - var ex: Exception = null - try { - /* topic field wrong type */ - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF())( - withSelectExpr = s"CAST('1' as INT) as topic", "value" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("topic type must be a string")) - - try { - /* value field wrong type */ - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF())( - withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as value" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "value attribute type must be a string or binarytype")) - - try { - ex = intercept[StreamingQueryException] { - /* key field wrong type */ - writer = createKafkaWriter(input.toDF())( - withSelectExpr = s"'$topic' as topic", "CAST(value as INT) as key", "value" - ) - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - writer.processAllAvailable() - } - } finally { - writer.stop() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "key attribute type must be a string or binarytype")) - } - - test("streaming - write to non-existing topic") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .option("startingOffsets", "earliest") - .load() - val topic = newTopic() - - var writer: StreamingQuery = null - var ex: Exception = null - try { - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter(input.toDF(), withTopic = Some(topic))() - testUtils.sendMessages(inputTopic, Array("1", "2", "3", "4", "5")) - eventually(timeout(streamingTimeout)) { - assert(writer.exception.isDefined) - } - throw 
writer.exception.get - } - } finally { - writer.stop() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains("job aborted")) - } - - test("streaming - exception on config serializer") { - val inputTopic = newTopic() - testUtils.createTopic(inputTopic, partitions = 1) - testUtils.sendMessages(inputTopic, Array("0")) - - val input = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", inputTopic) - .load() - var writer: StreamingQuery = null - var ex: Exception = null - try { - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter( - input.toDF(), - withOptions = Map("kafka.key.serializer" -> "foo"))() - writer.processAllAvailable() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "kafka option 'key.serializer' is not supported")) - } finally { - writer.stop() - } - - try { - ex = intercept[StreamingQueryException] { - writer = createKafkaWriter( - input.toDF(), - withOptions = Map("kafka.value.serializer" -> "foo"))() - writer.processAllAvailable() - } - assert(ex.getMessage.toLowerCase(Locale.ROOT).contains( - "kafka option 'value.serializer' is not supported")) - } finally { - writer.stop() - } - } - - test("generic - write big data with small producer buffer") { - /* This test ensures that we understand the semantics of Kafka when - * is comes to blocking on a call to send when the send buffer is full. - * This test will configure the smallest possible producer buffer and - * indicate that we should block when it is full. Thus, no exception should - * be thrown in the case of a full buffer. - */ - val topic = newTopic() - testUtils.createTopic(topic, 1) - val options = new java.util.HashMap[String, String] - options.put("bootstrap.servers", testUtils.brokerAddress) - options.put("buffer.memory", "16384") // min buffer size - options.put("block.on.buffer.full", "true") - options.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) - options.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[ByteArraySerializer].getName) - val inputSchema = Seq(AttributeReference("value", BinaryType)()) - val data = new Array[Byte](15000) // large value - val writeTask = new KafkaContinuousDataWriter(Some(topic), options.asScala.toMap, inputSchema) - try { - val fieldTypes: Array[DataType] = Array(BinaryType) - val converter = UnsafeProjection.create(fieldTypes) - val row = new SpecificInternalRow(fieldTypes) - row.update(0, data) - val iter = Seq.fill(1000)(converter.apply(row)).iterator - iter.foreach(writeTask.write(_)) - writeTask.commit() - } finally { - writeTask.close() - } - } - - private def createKafkaReader(topic: String): DataFrame = { - spark.read - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("startingOffsets", "earliest") - .option("endingOffsets", "latest") - .option("subscribe", topic) - .load() - } - - private def createKafkaWriter( - input: DataFrame, - withTopic: Option[String] = None, - withOutputMode: Option[OutputMode] = None, - withOptions: Map[String, String] = Map[String, String]()) - (withSelectExpr: String*): StreamingQuery = { - var stream: DataStreamWriter[Row] = null - val checkpointDir = Utils.createTempDir() - var df = input.toDF() - if (withSelectExpr.length > 0) { - df = df.selectExpr(withSelectExpr: _*) - } - stream = df.writeStream - .format("kafka") - .option("checkpointLocation", checkpointDir.getCanonicalPath) - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - // 
We need to reduce blocking time to efficiently test non-existent partition behavior. - .option("kafka.max.block.ms", "1000") - .trigger(Trigger.Continuous(1000)) - .queryName("kafkaStream") - withTopic.foreach(stream.option("topic", _)) - withOutputMode.foreach(stream.outputMode(_)) - withOptions.foreach(opt => stream.option(opt._1, opt._2)) - stream.start() - } -} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala deleted file mode 100644 index b3dade414f625..0000000000000 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.kafka010 - -import java.util.Properties -import java.util.concurrent.atomic.AtomicInteger - -import org.scalatest.time.SpanSugar._ -import scala.collection.mutable -import scala.util.Random - -import org.apache.spark.SparkContext -import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.execution.streaming.StreamExecution -import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution -import org.apache.spark.sql.streaming.{StreamTest, Trigger} -import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} - -// Run tests in KafkaSourceSuiteBase in continuous execution mode. 
-class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest - -class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest { - import testImplicits._ - - override val brokerProps = Map("auto.create.topics.enable" -> "false") - - test("subscribing topic by pattern with topic deletions") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-seems" - val topic2 = topicPrefix + "-bad" - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"$topicPrefix-.*") - .option("failOnDataLoss", "false") - - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped = kafka.map(kv => kv._2.toInt + 1) - - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - Execute { query => - testUtils.deleteTopic(topic) - testUtils.createTopic(topic2, partitions = 5) - eventually(timeout(streamingTimeout)) { - assert( - query.lastExecution.logical.collectFirst { - case DataSourceV2Relation(_, r: KafkaContinuousReader) => r - }.exists { r => - // Ensure the new topic is present and the old topic is gone. - r.knownPartitions.exists(_.topic == topic2) - }, - s"query never reconfigured to new topic $topic2") - } - }, - AddKafkaData(Set(topic2), 4, 5, 6), - CheckAnswer(2, 3, 4, 5, 6, 7) - ) - } -} - -class KafkaContinuousSourceStressForDontFailOnDataLossSuite - extends KafkaSourceStressForDontFailOnDataLossSuite { - override protected def startStream(ds: Dataset[Int]) = { - ds.writeStream - .format("memory") - .queryName("memory") - .start() - } -} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala deleted file mode 100644 index e713e6695d2bd..0000000000000 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.kafka010 - -import org.apache.spark.SparkContext -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.execution.streaming.StreamExecution -import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution -import org.apache.spark.sql.streaming.Trigger -import org.apache.spark.sql.test.TestSparkSession - -// Trait to configure StreamTest for kafka continuous execution tests. -trait KafkaContinuousTest extends KafkaSourceTest { - override val defaultTrigger = Trigger.Continuous(1000) - override val defaultUseV2Sink = true - - // We need more than the default local[2] to be able to schedule all partitions simultaneously. - override protected def createSparkSession = new TestSparkSession( - new SparkContext( - "local[10]", - "continuous-stream-test-sql-context", - sparkConf.set("spark.sql.testkey", "true"))) - - // In addition to setting the partitions in Kafka, we have to wait until the query has - // reconfigured to the new count so the test framework can hook in properly. - override protected def setTopicPartitions( - topic: String, newCount: Int, query: StreamExecution) = { - testUtils.addPartitions(topic, newCount) - eventually(timeout(streamingTimeout)) { - assert( - query.lastExecution.logical.collectFirst { - case DataSourceV2Relation(_, r: KafkaContinuousReader) => r - }.exists(_.knownPartitions.size == newCount), - s"query never reconfigured to $newCount partitions") - } - } - - test("ensure continuous stream is being used") { - val query = spark.readStream - .format("rate") - .option("numPartitions", "1") - .option("rowsPerSecond", "1") - .load() - - testStream(query)( - Execute(q => assert(q.isInstanceOf[ContinuousExecution])) - ) - } -} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala index d66908f86ccc7..2034b9be07f24 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala +++ b/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceSuite.scala @@ -34,14 +34,11 @@ import org.scalatest.concurrent.PatienceConfiguration.Timeout import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkContext -import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row} -import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, WriteToDataSourceV2Exec} +import org.apache.spark.sql.ForeachWriter import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution -import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryWriter import org.apache.spark.sql.functions.{count, window} import org.apache.spark.sql.kafka010.KafkaSourceProvider._ -import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest, Trigger} +import org.apache.spark.sql.streaming.{ProcessingTime, StreamTest} import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession} import org.apache.spark.util.Utils @@ -52,11 +49,9 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override val streamingTimeout = 30.seconds - protected val brokerProps = Map[String, Object]() - override def beforeAll(): Unit = { super.beforeAll() - testUtils = new KafkaTestUtils(brokerProps) + testUtils = new KafkaTestUtils 
testUtils.setup() } @@ -64,25 +59,18 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { if (testUtils != null) { testUtils.teardown() testUtils = null + super.afterAll() } - super.afterAll() } protected def makeSureGetOffsetCalled = AssertOnQuery { q => // Because KafkaSource's initialPartitionOffsets is set lazily, we need to make sure - // its "getOffset" is called before pushing any data. Otherwise, because of the race condition, + // its "getOffset" is called before pushing any data. Otherwise, because of the race contion, // we don't know which data should be fetched when `startingOffsets` is latest. - q match { - case c: ContinuousExecution => c.awaitEpoch(0) - case m: MicroBatchExecution => m.processAllAvailable() - } + q.processAllAvailable() true } - protected def setTopicPartitions(topic: String, newCount: Int, query: StreamExecution) : Unit = { - testUtils.addPartitions(topic, newCount) - } - /** * Add data to Kafka. * @@ -94,7 +82,7 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { message: String = "", topicAction: (String, Option[Int]) => Unit = (_, _) => {}) extends AddData { - override def addData(query: Option[StreamExecution]): (BaseStreamingSource, Offset) = { + override def addData(query: Option[StreamExecution]): (Source, Offset) = { if (query.get.isActive) { // Make sure no Spark job is running when deleting a topic query.get.processAllAvailable() @@ -109,18 +97,16 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { topicAction(existingTopicPartitions._1, Some(existingTopicPartitions._2)) } + // Read all topics again in case some topics are delete. + val allTopics = testUtils.getAllTopicsAndPartitionSize().toMap.keys require( query.nonEmpty, "Cannot add data when there is no query for finding the active kafka source") val sources = query.get.logicalPlan.collect { - case StreamingExecutionRelation(source: KafkaSource, _) => source - } ++ (query.get.lastExecution match { - case null => Seq() - case e => e.logical.collect { - case DataSourceV2Relation(_, reader: KafkaContinuousReader) => reader - } - }) + case StreamingExecutionRelation(source, _) if source.isInstanceOf[KafkaSource] => + source.asInstanceOf[KafkaSource] + } if (sources.isEmpty) { throw new Exception( "Could not find Kafka source in the StreamExecution logical plan to add data to") @@ -151,158 +137,14 @@ abstract class KafkaSourceTest extends StreamTest with SharedSQLContext { override def toString: String = s"AddKafkaData(topics = $topics, data = $data, message = $message)" } - - private val topicId = new AtomicInteger(0) - protected def newTopic(): String = s"topic-${topicId.getAndIncrement()}" } -class KafkaMicroBatchSourceSuite extends KafkaSourceSuiteBase { - - import testImplicits._ - - test("(de)serialization of initial offsets") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("subscribe", topic) - - testStream(reader.load)( - makeSureGetOffsetCalled, - StopStream, - StartStream(), - StopStream) - } - - test("maxOffsetsPerTrigger") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 3) - testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) - testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) - testUtils.sendMessages(topic, Array("1"), Some(2)) - - val reader = spark - .readStream - .format("kafka") - 
.option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("maxOffsetsPerTrigger", 10) - .option("subscribe", topic) - .option("startingOffsets", "earliest") - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) - - val clock = new StreamManualClock - - val waitUntilBatchProcessed = AssertOnQuery { q => - eventually(Timeout(streamingTimeout)) { - if (!q.exception.isDefined) { - assert(clock.isStreamWaitingAt(clock.getTimeMillis())) - } - } - if (q.exception.isDefined) { - throw q.exception.get - } - true - } - - testStream(mapped)( - StartStream(ProcessingTime(100), clock), - waitUntilBatchProcessed, - // 1 from smallest, 1 from middle, 8 from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), - AdvanceManualClock(100), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 - ), - StopStream, - StartStream(ProcessingTime(100), clock), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 - ), - AdvanceManualClock(100), - waitUntilBatchProcessed, - // smallest now empty, 1 more from middle, 9 more from biggest - CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, - 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, - 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, - 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 - ) - ) - } - - test("input row metrics") { - val topic = newTopic() - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - - val kafka = spark - .readStream - .format("kafka") - .option("subscribe", topic) - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - - val mapped = kafka.map(kv => kv._2.toInt + 1) - testStream(mapped)( - StartStream(trigger = ProcessingTime(1)), - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - AssertOnQuery { query => - val recordsRead = query.recentProgress.map(_.numInputRows).sum - recordsRead == 3 - } - ) - } - - test("subscribing topic by pattern with topic deletions") { - val topicPrefix = newTopic() - val topic = topicPrefix + "-seems" - val topic2 = topicPrefix + "-bad" - testUtils.createTopic(topic, partitions = 5) - testUtils.sendMessages(topic, Array("-1")) - require(testUtils.getLatestOffsets(Set(topic)).size === 5) - val reader = spark - .readStream - .format("kafka") - .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"$topicPrefix-.*") - .option("failOnDataLoss", "false") +class KafkaSourceSuite extends KafkaSourceTest { - val kafka = reader.load() - .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") - .as[(String, String)] - val mapped = kafka.map(kv => kv._2.toInt + 1) + import testImplicits._ - testStream(mapped)( - makeSureGetOffsetCalled, - AddKafkaData(Set(topic), 1, 2, 3), - CheckAnswer(2, 3, 4), - Assert { - testUtils.deleteTopic(topic) - 
testUtils.createTopic(topic2, partitions = 5) - true - }, - AddKafkaData(Set(topic2), 4, 5, 6), - CheckAnswer(2, 3, 4, 5, 6, 7) - ) - } + private val topicId = new AtomicInteger(0) testWithUninterruptibleThread( "deserialization of initial offset with Spark 2.1.0") { @@ -395,51 +237,86 @@ class KafkaMicroBatchSourceSuite extends KafkaSourceSuiteBase { } } - test("KafkaSource with watermark") { - val now = System.currentTimeMillis() + test("(de)serialization of initial offsets") { val topic = newTopic() - testUtils.createTopic(newTopic(), partitions = 1) - testUtils.sendMessages(topic, Array(1).map(_.toString)) + testUtils.createTopic(topic, partitions = 64) - val kafka = spark + val reader = spark .readStream .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) - .option("kafka.metadata.max.age.ms", "1") - .option("startingOffsets", s"earliest") .option("subscribe", topic) - .load() - - val windowedAggregation = kafka - .withWatermark("timestamp", "10 seconds") - .groupBy(window($"timestamp", "5 seconds") as 'window) - .agg(count("*") as 'count) - .select($"window".getField("start") as 'window, $"count") - val query = windowedAggregation - .writeStream - .format("memory") - .outputMode("complete") - .queryName("kafkaWatermark") - .start() - query.processAllAvailable() - val rows = spark.table("kafkaWatermark").collect() - assert(rows.length === 1, s"Unexpected results: ${rows.toList}") - val row = rows(0) - // We cannot check the exact window start time as it depands on the time that messages were - // inserted by the producer. So here we just use a low bound to make sure the internal - // conversion works. - assert( - row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, - s"Unexpected results: $row") - assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") - query.stop() + testStream(reader.load)( + makeSureGetOffsetCalled, + StopStream, + StartStream(), + StopStream) } -} -class KafkaSourceSuiteBase extends KafkaSourceTest { + test("maxOffsetsPerTrigger") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 3) + testUtils.sendMessages(topic, (100 to 200).map(_.toString).toArray, Some(0)) + testUtils.sendMessages(topic, (10 to 20).map(_.toString).toArray, Some(1)) + testUtils.sendMessages(topic, Array("1"), Some(2)) - import testImplicits._ + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 10) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped: org.apache.spark.sql.Dataset[_] = kafka.map(kv => kv._2.toInt) + + val clock = new StreamManualClock + + val waitUntilBatchProcessed = AssertOnQuery { q => + eventually(Timeout(streamingTimeout)) { + if (!q.exception.isDefined) { + assert(clock.isStreamWaitingAt(clock.getTimeMillis())) + } + } + if (q.exception.isDefined) { + throw q.exception.get + } + true + } + + testStream(mapped)( + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // 1 from smallest, 1 from middle, 8 from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116 + ), 
+ StopStream, + StartStream(ProcessingTime(100), clock), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125 + ), + AdvanceManualClock(100), + waitUntilBatchProcessed, + // smallest now empty, 1 more from middle, 9 more from biggest + CheckAnswer(1, 10, 100, 101, 102, 103, 104, 105, 106, 107, + 11, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 12, 117, 118, 119, 120, 121, 122, 123, 124, 125, + 13, 126, 127, 128, 129, 130, 131, 132, 133, 134 + ) + ) + } test("cannot stop Kafka stream") { val topic = newTopic() @@ -451,7 +328,7 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { .format("kafka") .option("kafka.bootstrap.servers", testUtils.brokerAddress) .option("kafka.metadata.max.age.ms", "1") - .option("subscribePattern", s"$topic.*") + .option("subscribePattern", s"topic-.*") val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") @@ -545,6 +422,65 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { } } + test("subscribing topic by pattern with topic deletions") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-seems" + val topic2 = topicPrefix + "-bad" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Assert { + testUtils.deleteTopic(topic) + testUtils.createTopic(topic2, partitions = 5) + true + }, + AddKafkaData(Set(topic2), 4, 5, 6), + CheckAnswer(2, 3, 4, 5, 6, 7) + ) + } + + test("starting offset is latest by default") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("0")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("subscribe", topic) + + val kafka = reader.load() + .selectExpr("CAST(value AS STRING)") + .as[String] + val mapped = kafka.map(_.toInt) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(1, 2, 3) // should not have 0 + ) + } + test("bad source options") { def testBadOptions(options: (String, String)*)(expectedMsgs: String*): Unit = { val ex = intercept[IllegalArgumentException] { @@ -604,6 +540,34 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { testUnsupportedConfig("kafka.auto.offset.reset", "latest") } + test("input row metrics") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val kafka = spark + .readStream + .format("kafka") + .option("subscribe", topic) + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, 
String)] + + val mapped = kafka.map(kv => kv._2.toInt + 1) + testStream(mapped)( + StartStream(trigger = ProcessingTime(1)), + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + AssertOnQuery { query => + val recordsRead = query.recentProgress.map(_.numInputRows).sum + recordsRead == 3 + } + ) + } + test("delete a topic when a Spark job is running") { KafkaSourceSuite.collectedData.clear() @@ -665,6 +629,8 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { } } + private def newTopic(): String = s"topic-${topicId.getAndIncrement()}" + private def assignString(topic: String, partitions: Iterable[Int]): String = { JsonUtils.partitions(partitions.map(p => new TopicPartition(topic, p))) } @@ -710,10 +676,6 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { testStream(mapped)( makeSureGetOffsetCalled, - Execute { q => - // wait to reach the last offset in every partition - q.awaitOffset(0, KafkaSourceOffset(partitionOffsets.mapValues(_ => 3L))) - }, CheckAnswer(-20, -21, -22, 0, 1, 2, 11, 12, 22), StopStream, StartStream(), @@ -744,7 +706,6 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { .format("memory") .outputMode("append") .queryName("kafkaColumnTypes") - .trigger(defaultTrigger) .start() query.processAllAvailable() val rows = spark.table("kafkaColumnTypes").collect() @@ -762,6 +723,47 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { query.stop() } + test("KafkaSource with watermark") { + val now = System.currentTimeMillis() + val topic = newTopic() + testUtils.createTopic(newTopic(), partitions = 1) + testUtils.sendMessages(topic, Array(1).map(_.toString)) + + val kafka = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("startingOffsets", s"earliest") + .option("subscribe", topic) + .load() + + val windowedAggregation = kafka + .withWatermark("timestamp", "10 seconds") + .groupBy(window($"timestamp", "5 seconds") as 'window) + .agg(count("*") as 'count) + .select($"window".getField("start") as 'window, $"count") + + val query = windowedAggregation + .writeStream + .format("memory") + .outputMode("complete") + .queryName("kafkaWatermark") + .start() + query.processAllAvailable() + val rows = spark.table("kafkaWatermark").collect() + assert(rows.length === 1, s"Unexpected results: ${rows.toList}") + val row = rows(0) + // We cannot check the exact window start time as it depands on the time that messages were + // inserted by the producer. So here we just use a low bound to make sure the internal + // conversion works. 
+ assert( + row.getAs[java.sql.Timestamp]("window").getTime >= now - 5 * 1000, + s"Unexpected results: $row") + assert(row.getAs[Int]("count") === 1, s"Unexpected results: $row") + query.stop() + } + private def testFromLatestOffsets( topic: String, addPartitions: Boolean, @@ -798,7 +800,9 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { AddKafkaData(Set(topic), 7, 8), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) setTopicPartitions(topic, 10, query) + if (addPartitions) { + testUtils.addPartitions(topic, 10) + } true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -839,7 +843,9 @@ class KafkaSourceSuiteBase extends KafkaSourceTest { StartStream(), CheckAnswer(2, 3, 4, 5, 6, 7, 8, 9), AssertOnQuery("Add partitions") { query: StreamExecution => - if (addPartitions) setTopicPartitions(topic, 10, query) + if (addPartitions) { + testUtils.addPartitions(topic, 10) + } true }, AddKafkaData(Set(topic), 9, 10, 11, 12, 13, 14, 15, 16), @@ -971,23 +977,6 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared } } - protected def startStream(ds: Dataset[Int]) = { - ds.writeStream.foreach(new ForeachWriter[Int] { - - override def open(partitionId: Long, version: Long): Boolean = { - true - } - - override def process(value: Int): Unit = { - // Slow down the processing speed so that messages may be aged out. - Thread.sleep(Random.nextInt(500)) - } - - override def close(errorOrNull: Throwable): Unit = { - } - }).start() - } - test("stress test for failOnDataLoss=false") { val reader = spark .readStream @@ -1001,7 +990,20 @@ class KafkaSourceStressForDontFailOnDataLossSuite extends StreamTest with Shared val kafka = reader.load() .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") .as[(String, String)] - val query = startStream(kafka.map(kv => kv._2.toInt)) + val query = kafka.map(kv => kv._2.toInt).writeStream.foreach(new ForeachWriter[Int] { + + override def open(partitionId: Long, version: Long): Boolean = { + true + } + + override def process(value: Int): Unit = { + // Slow down the processing speed so that messages may be aged out. + Thread.sleep(Random.nextInt(500)) + } + + override def close(errorOrNull: Throwable): Unit = { + } + }).start() val testTime = 1.minutes val startTime = System.currentTimeMillis() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index b714a46b5f786..e8d683a578f35 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -191,9 +191,6 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { ds = ds.asInstanceOf[DataSourceV2], conf = sparkSession.sessionState.conf)).asJava) - // Streaming also uses the data source V2 API. So it may be that the data source implements - // v2, but has no v2 implementation for batch reads. In that case, we fall back to loading - // the dataframe as a v1 source. 
val reader = (ds, userSpecifiedSchema) match { case (ds: ReadSupportWithSchema, Some(schema)) => ds.createReader(schema, options) @@ -211,30 +208,23 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { } reader - case _ => null // fall back to v1 + case _ => + throw new AnalysisException(s"$cls does not support data reading.") } - if (reader == null) { - loadV1Source(paths: _*) - } else { - Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) - } + Dataset.ofRows(sparkSession, DataSourceV2Relation(reader)) } else { - loadV1Source(paths: _*) + // Code path for data source v1. + sparkSession.baseRelationToDataFrame( + DataSource.apply( + sparkSession, + paths = paths, + userSpecifiedSchema = userSpecifiedSchema, + className = source, + options = extraOptions.toMap).resolveRelation()) } } - private def loadV1Source(paths: String*) = { - // Code path for data source v1. - sparkSession.baseRelationToDataFrame( - DataSource.apply( - sparkSession, - paths = paths, - userSpecifiedSchema = userSpecifiedSchema, - className = source, - options = extraOptions.toMap).resolveRelation()) - } - /** * Construct a `DataFrame` representing the database table accessible via JDBC URL * url named table and connection properties. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 97f12ff625c42..3304f368e1050 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -255,24 +255,17 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) { } } - // Streaming also uses the data source V2 API. So it may be that the data source implements - // v2, but has no v2 implementation for batch writes. In that case, we fall back to saving - // as though it's a V1 source. - case _ => saveToV1Source() + case _ => throw new AnalysisException(s"$cls does not support data writing.") } } else { - saveToV1Source() - } - } - - private def saveToV1Source(): Unit = { - // Code path for data source v1. - runCommand(df.sparkSession, "save") { - DataSource( - sparkSession = df.sparkSession, - className = source, - partitionColumns = partitioningColumns.getOrElse(Nil), - options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) + // Code path for data source v1. 
+ runCommand(df.sparkSession, "save") { + DataSource( + sparkSession = df.sparkSession, + className = source, + partitionColumns = partitioningColumns.getOrElse(Nil), + options = extraOptions.toMap).planForWriting(mode, AnalysisBarrier(df.logicalPlan)) + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala index a4a857f2d4d9b..f0bdf84bb7a84 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala @@ -81,11 +81,9 @@ case class WriteToDataSourceV2Exec(writer: DataSourceV2Writer, query: SparkPlan) (index, message: WriterCommitMessage) => messages(index) = message ) - if (!writer.isInstanceOf[ContinuousWriter]) { - logInfo(s"Data source writer $writer is committing.") - writer.commit(messages) - logInfo(s"Data source writer $writer committed.") - } + logInfo(s"Data source writer $writer is committing.") + writer.commit(messages) + logInfo(s"Data source writer $writer committed.") } catch { case _: InterruptedException if writer.isInstanceOf[ContinuousWriter] => // Interruption is how continuous queries are ended, so accept and ignore the exception. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala index cf27e1a70650a..24a8b000df0c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala @@ -142,8 +142,7 @@ abstract class StreamExecution( override val id: UUID = UUID.fromString(streamMetadata.id) - override def runId: UUID = currentRunId - protected var currentRunId = UUID.randomUUID + override val runId: UUID = UUID.randomUUID /** * Pretty identified string of printing in logs. Format is @@ -419,17 +418,11 @@ abstract class StreamExecution( * Blocks the current thread until processing for data from the given `source` has reached at * least the given `Offset`. This method is intended for use primarily when writing tests. */ - private[sql] def awaitOffset(sourceIndex: Int, newOffset: Offset): Unit = { + private[sql] def awaitOffset(source: BaseStreamingSource, newOffset: Offset): Unit = { assertAwaitThread() def notDone = { val localCommittedOffsets = committedOffsets - if (sources == null) { - // sources might not be initialized yet - false - } else { - val source = sources(sourceIndex) - !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset - } + !localCommittedOffsets.contains(source) || localCommittedOffsets(source) != newOffset } while (notDone) { @@ -443,7 +436,7 @@ abstract class StreamExecution( awaitProgressLock.unlock() } } - logDebug(s"Unblocked at $newOffset for ${sources(sourceIndex)}") + logDebug(s"Unblocked at $newOffset for $source") } /** A flag to indicate that a batch has completed with no new data available. 
*/ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala index e700aa4f9aea7..d79e4bd65f563 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousDataSourceRDDIter.scala @@ -77,6 +77,7 @@ class ContinuousDataSourceRDD( dataReaderThread.start() context.addTaskCompletionListener(_ => { + reader.close() dataReaderThread.interrupt() epochPollExecutor.shutdown() }) @@ -200,8 +201,6 @@ class DataReaderThread( failedFlag.set(true) // Don't rethrow the exception in this thread. It's not needed, and the default Spark // exception handler will kill the executor. - } finally { - reader.close() } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala index 667410ef9f1c6..9657b5e26d770 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala @@ -17,9 +17,7 @@ package org.apache.spark.sql.execution.streaming.continuous -import java.util.UUID import java.util.concurrent.TimeUnit -import java.util.function.UnaryOperator import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => MutableMap} @@ -54,7 +52,7 @@ class ContinuousExecution( sparkSession, name, checkpointRoot, analyzedPlan, sink, trigger, triggerClock, outputMode, deleteCheckpointOnStop) { - @volatile protected var continuousSources: Seq[ContinuousReader] = _ + @volatile protected var continuousSources: Seq[ContinuousReader] = Seq.empty override protected def sources: Seq[BaseStreamingSource] = continuousSources override lazy val logicalPlan: LogicalPlan = { @@ -80,17 +78,15 @@ class ContinuousExecution( } override protected def runActivatedStream(sparkSessionForStream: SparkSession): Unit = { - val stateUpdate = new UnaryOperator[State] { - override def apply(s: State) = s match { - // If we ended the query to reconfigure, reset the state to active. - case RECONFIGURING => ACTIVE - case _ => s - } - } - do { - runContinuous(sparkSessionForStream) - } while (state.updateAndGet(stateUpdate) == ACTIVE) + try { + runContinuous(sparkSessionForStream) + } catch { + case _: InterruptedException if state.get().equals(RECONFIGURING) => + // swallow exception and run again + state.set(ACTIVE) + } + } while (state.get() == ACTIVE) } /** @@ -124,16 +120,12 @@ class ContinuousExecution( } committedOffsets = nextOffsets.toStreamProgress(sources) - // Get to an epoch ID that has definitely never been sent to a sink before. Since sink - // commit happens between offset log write and commit log write, this means an epoch ID - // which is not in the offset log. - val (latestOffsetEpoch, _) = offsetLog.getLatest().getOrElse { - throw new IllegalStateException( - s"Offset log had no latest element. This shouldn't be possible because nextOffsets is" + - s"an element.") - } - currentBatchId = latestOffsetEpoch + 1 + // Forcibly align commit and offset logs by slicing off any spurious offset logs from + // a previous run. 
We can't allow commits to an epoch that a previous run reached but + // this run has not. + offsetLog.purgeAfter(latestEpochId) + currentBatchId = latestEpochId + 1 logDebug(s"Resuming at epoch $currentBatchId with committed offsets $committedOffsets") nextOffsets case None => @@ -149,7 +141,6 @@ class ContinuousExecution( * @param sparkSessionForQuery Isolated [[SparkSession]] to run the continuous query with. */ private def runContinuous(sparkSessionForQuery: SparkSession): Unit = { - currentRunId = UUID.randomUUID // A list of attributes that will need to be updated. val replacements = new ArrayBuffer[(Attribute, Attribute)] // Translate from continuous relation to the underlying data source. @@ -234,11 +225,13 @@ class ContinuousExecution( triggerExecutor.execute(() => { startTrigger() - if (reader.needsReconfiguration() && state.compareAndSet(ACTIVE, RECONFIGURING)) { + if (reader.needsReconfiguration()) { + state.set(RECONFIGURING) stopSources() if (queryExecutionThread.isAlive) { sparkSession.sparkContext.cancelJobGroup(runId.toString) queryExecutionThread.interrupt() + // No need to join - this thread is about to end anyway. } false } else if (isActive) { @@ -266,7 +259,6 @@ class ContinuousExecution( sparkSessionForQuery, lastExecution)(lastExecution.toRdd) } } finally { - epochEndpoint.askSync[Unit](StopContinuousExecutionWrites) SparkEnv.get.rpcEnv.stop(epochEndpoint) epochUpdateThread.interrupt() @@ -281,22 +273,17 @@ class ContinuousExecution( epoch: Long, reader: ContinuousReader, partitionOffsets: Seq[PartitionOffset]): Unit = { assert(continuousSources.length == 1, "only one continuous source supported currently") - val globalOffset = reader.mergeOffsets(partitionOffsets.toArray) - val oldOffset = synchronized { - offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) - offsetLog.get(epoch - 1) + if (partitionOffsets.contains(null)) { + // If any offset is null, that means the corresponding partition hasn't seen any data yet, so + // there's nothing meaningful to add to the offset log. } - - // If offset hasn't changed since last epoch, there's been no new data. - if (oldOffset.contains(OffsetSeq.fill(globalOffset))) { - noNewData = true - } - - awaitProgressLock.lock() - try { - awaitProgressLockCondition.signalAll() - } finally { - awaitProgressLock.unlock() + val globalOffset = reader.mergeOffsets(partitionOffsets.toArray) + synchronized { + if (queryExecutionThread.isAlive) { + offsetLog.add(epoch, OffsetSeq.fill(globalOffset)) + } else { + return + } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala index 40dcbecade814..98017c3ac6a33 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/EpochCoordinator.scala @@ -39,15 +39,6 @@ private[continuous] sealed trait EpochCoordinatorMessage extends Serializable */ private[sql] case object IncrementAndGetEpoch extends EpochCoordinatorMessage -/** - * The RpcEndpoint stop() will wait to clear out the message queue before terminating the - * object. This can lead to a race condition where the query restarts at epoch n, a new - * EpochCoordinator starts at epoch n, and then the old epoch coordinator commits epoch n + 1. 
- * The framework doesn't provide a handle to wait on the message queue, so we use a synchronous - * message to stop any writes to the ContinuousExecution object. - */ -private[sql] case object StopContinuousExecutionWrites extends EpochCoordinatorMessage - // Init messages /** * Set the reader and writer partition counts. Tasks may not be started until the coordinator @@ -125,8 +116,6 @@ private[continuous] class EpochCoordinator( override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { - private var queryWritesStopped: Boolean = false - private var numReaderPartitions: Int = _ private var numWriterPartitions: Int = _ @@ -158,16 +147,12 @@ private[continuous] class EpochCoordinator( partitionCommits.remove(k) } for (k <- partitionOffsets.keys.filter { case (e, _) => e < epoch }) { - partitionOffsets.remove(k) + partitionCommits.remove(k) } } } override def receive: PartialFunction[Any, Unit] = { - // If we just drop these messages, we won't do any writes to the query. The lame duck tasks - // won't shed errors or anything. - case _ if queryWritesStopped => () - case CommitPartitionEpoch(partitionId, epoch, message) => logDebug(s"Got commit from partition $partitionId at epoch $epoch: $message") if (!partitionCommits.isDefinedAt((epoch, partitionId))) { @@ -203,9 +188,5 @@ private[continuous] class EpochCoordinator( case SetWriterPartitions(numPartitions) => numWriterPartitions = numPartitions context.reply(()) - - case StopContinuousExecutionWrites => - queryWritesStopped = true - context.reply(()) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala index b5b4a05ab4973..db588ae282f38 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamWriter.scala @@ -29,7 +29,6 @@ import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.continuous.ContinuousTrigger import org.apache.spark.sql.execution.streaming.sources.{MemoryPlanV2, MemorySinkV2} -import org.apache.spark.sql.sources.v2.streaming.ContinuousWriteSupport /** * Interface used to write a streaming `Dataset` to external storage systems (e.g. 
file systems, @@ -280,29 +279,18 @@ final class DataStreamWriter[T] private[sql](ds: Dataset[T]) { useTempCheckpointLocation = true, trigger = trigger) } else { - val sink = trigger match { - case _: ContinuousTrigger => - val ds = DataSource.lookupDataSource(source, df.sparkSession.sessionState.conf) - ds.newInstance() match { - case w: ContinuousWriteSupport => w - case _ => throw new AnalysisException( - s"Data source $source does not support continuous writing") - } - case _ => - val ds = DataSource( - df.sparkSession, - className = source, - options = extraOptions.toMap, - partitionColumns = normalizedParCols.getOrElse(Nil)) - ds.createSink(outputMode) - } - + val dataSource = + DataSource( + df.sparkSession, + className = source, + options = extraOptions.toMap, + partitionColumns = normalizedParCols.getOrElse(Nil)) df.sparkSession.sessionState.streamingQueryManager.startQuery( extraOptions.get("queryName"), extraOptions.get("checkpointLocation"), df, extraOptions.toMap, - sink, + dataSource.createSink(outputMode), outputMode, useTempCheckpointLocation = source == "console", recoverFromCheckpointLocation = true, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala index 0762895fdc620..d46461fa9bf6d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala @@ -38,9 +38,8 @@ import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row} import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ -import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation import org.apache.spark.sql.execution.streaming._ -import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, ContinuousTrigger, EpochCoordinatorRef, IncrementAndGetEpoch} +import org.apache.spark.sql.execution.streaming.continuous.{ContinuousExecution, EpochCoordinatorRef, IncrementAndGetEpoch} import org.apache.spark.sql.execution.streaming.sources.MemorySinkV2 import org.apache.spark.sql.execution.streaming.state.StateStore import org.apache.spark.sql.streaming.StreamingQueryListener._ @@ -81,9 +80,6 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be StateStore.stop() // stop the state store maintenance thread and unload store providers } - protected val defaultTrigger = Trigger.ProcessingTime(0) - protected val defaultUseV2Sink = false - /** How long to wait for an active stream to catch up when checking a result. */ val streamingTimeout = 10.seconds @@ -193,7 +189,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be /** Starts the stream, resuming if data has already been processed. It must not be running. 
*/ case class StartStream( - trigger: Trigger = defaultTrigger, + trigger: Trigger = Trigger.ProcessingTime(0), triggerClock: Clock = new SystemClock, additionalConfs: Map[String, String] = Map.empty, checkpointLocation: String = null) @@ -280,7 +276,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def testStream( _stream: Dataset[_], outputMode: OutputMode = OutputMode.Append, - useV2Sink: Boolean = defaultUseV2Sink)(actions: StreamAction*): Unit = synchronized { + useV2Sink: Boolean = false)(actions: StreamAction*): Unit = synchronized { import org.apache.spark.sql.streaming.util.StreamManualClock // `synchronized` is added to prevent the user from calling multiple `testStream`s concurrently @@ -407,11 +403,18 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def fetchStreamAnswer(currentStream: StreamExecution, lastOnly: Boolean) = { verify(currentStream != null, "stream not running") + // Get the map of source index to the current source objects + val indexToSource = currentStream + .logicalPlan + .collect { case StreamingExecutionRelation(s, _) => s } + .zipWithIndex + .map(_.swap) + .toMap // Block until all data added has been processed for all the source awaiting.foreach { case (sourceIndex, offset) => failAfter(streamingTimeout) { - currentStream.awaitOffset(sourceIndex, offset) + currentStream.awaitOffset(indexToSource(sourceIndex), offset) } } @@ -470,12 +473,6 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be // after starting the query. try { currentStream.awaitInitialization(streamingTimeout.toMillis) - currentStream match { - case s: ContinuousExecution => eventually("IncrementalExecution was not created") { - s.lastExecution.executedPlan // will fail if lastExecution is null - } - case _ => - } } catch { case _: StreamingQueryException => // Ignore the exception. `StopStream` or `ExpectFailure` will catch it as well. @@ -603,10 +600,7 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be def findSourceIndex(plan: LogicalPlan): Option[Int] = { plan - .collect { - case StreamingExecutionRelation(s, _) => s - case DataSourceV2Relation(_, r) => r - } + .collect { case StreamingExecutionRelation(s, _) => s } .zipWithIndex .find(_._1 == source) .map(_._2) @@ -619,13 +613,9 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be findSourceIndex(query.logicalPlan) }.orElse { findSourceIndex(stream.logicalPlan) - }.orElse { - queryToUse.flatMap { q => - findSourceIndex(q.lastExecution.logical) - } }.getOrElse { throw new IllegalArgumentException( - "Could not find index of the source to which data was added") + "Could find index of the source to which data was added") } // Store the expected offset of added data to wait for it later From cd9f49a2aed3799964976ead06080a0f7044a0c3 Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sat, 13 Jan 2018 16:13:44 +0900 Subject: [PATCH 328/356] [SPARK-22980][PYTHON][SQL] Clarify the length of each series is of each batch within scalar Pandas UDF ## What changes were proposed in this pull request? This PR proposes to add a note that saying the length of a scalar Pandas UDF's `Series` is not of the whole input column but of the batch. We are fine for a group map UDF because the usage is different from our typical UDF but scalar UDFs might cause confusion with the normal UDF. 
For example, please consider this example: ```python from pyspark.sql.functions import pandas_udf, col, lit df = spark.range(1) f = pandas_udf(lambda x, y: len(x) + y, LongType()) df.select(f(lit('text'), col('id'))).show() ``` ``` +------------------+ |(text, id)| +------------------+ | 1| +------------------+ ``` ```python from pyspark.sql.functions import udf, col, lit df = spark.range(1) f = udf(lambda x, y: len(x) + y, "long") df.select(f(lit('text'), col('id'))).show() ``` ``` +------------------+ |(text, id)| +------------------+ | 4| +------------------+ ``` ## How was this patch tested? Manually built the doc and checked the output. Author: hyukjinkwon Closes #20237 from HyukjinKwon/SPARK-22980. --- python/pyspark/sql/functions.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 733e32bd825b0..e1ad6590554cf 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -2184,6 +2184,11 @@ def pandas_udf(f=None, returnType=None, functionType=None): | 8| JOHN DOE| 22| +----------+--------------+------------+ + .. note:: The length of `pandas.Series` within a scalar UDF is not that of the whole input + column, but is the length of an internal batch used for each call to the function. + Therefore, this can be used, for example, to ensure the length of each returned + `pandas.Series`, and can not be used as the column length. + 2. GROUP_MAP A group map UDF defines transformation: A `pandas.DataFrame` -> A `pandas.DataFrame` From 628a1ca5a4d14397a90e9e96a7e03e8f63531b20 Mon Sep 17 00:00:00 2001 From: shimamoto Date: Sat, 13 Jan 2018 09:40:00 -0600 Subject: [PATCH 329/356] [SPARK-23043][BUILD] Upgrade json4s to 3.5.3 ## What changes were proposed in this pull request? Spark still use a few years old version 3.2.11. This change is to upgrade json4s to 3.5.3. Note that this change does not include the Jackson update because the Jackson version referenced in json4s 3.5.3 is 2.8.4, which has a security vulnerability ([see](https://issues.apache.org/jira/browse/SPARK-20433)). ## How was this patch tested? Existing unit tests and build. Author: shimamoto Closes #20233 from shimamoto/upgrade-json4s. 
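The test updates below navigate top-level JSON arrays explicitly through `.children` (or switch from `\` to `\\`) where a bare `\` was used with 3.2.11. A minimal illustrative sketch of that access pattern with json4s 3.5.3, assuming `json4s-jackson` is on the classpath (this sketch is not part of the patch):
```scala
import org.json4s._
import org.json4s.jackson.JsonMethods.parse

object Json4sAccessSketch {
  implicit val formats: Formats = DefaultFormats

  def main(args: Array[String]): Unit = {
    // A top-level array shaped like the /api/v1/applications payload the suites inspect.
    val apps = parse("""[{"name": "app-1", "attempts": [{"completed": true}]}]""")

    // Take the array element explicitly via `children`, then navigate with `\` as before.
    val firstApp = apps.children.head
    val completed = ((firstApp \ "attempts").children.head \ "completed").extract[Boolean]
    println(completed) // true
  }
}
```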
--- .../deploy/history/HistoryServerSuite.scala | 2 +- .../org/apache/spark/ui/UISeleniumSuite.scala | 19 ++++++++++--------- dev/deps/spark-deps-hadoop-2.6 | 8 ++++---- dev/deps/spark-deps-hadoop-2.7 | 8 ++++---- pom.xml | 13 +++++++------ 5 files changed, 26 insertions(+), 24 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala index 3738f85da5831..87778dda0e2c8 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/HistoryServerSuite.scala @@ -486,7 +486,7 @@ class HistoryServerSuite extends SparkFunSuite with BeforeAndAfter with Matchers json match { case JNothing => Seq() case apps: JArray => - apps.filter(app => { + apps.children.filter(app => { (app \ "attempts") match { case attempts: JArray => val state = (attempts.children.head \ "completed").asInstanceOf[JBool] diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala index 326546787ab6c..ed51fc445fdfb 100644 --- a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala +++ b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala @@ -131,7 +131,8 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B val storageJson = getJson(ui, "storage/rdd") storageJson.children.length should be (1) - (storageJson \ "storageLevel").extract[String] should be (StorageLevels.DISK_ONLY.description) + (storageJson.children.head \ "storageLevel").extract[String] should be ( + StorageLevels.DISK_ONLY.description) val rddJson = getJson(ui, "storage/rdd/0") (rddJson \ "storageLevel").extract[String] should be (StorageLevels.DISK_ONLY.description) @@ -150,7 +151,7 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B val updatedStorageJson = getJson(ui, "storage/rdd") updatedStorageJson.children.length should be (1) - (updatedStorageJson \ "storageLevel").extract[String] should be ( + (updatedStorageJson.children.head \ "storageLevel").extract[String] should be ( StorageLevels.MEMORY_ONLY.description) val updatedRddJson = getJson(ui, "storage/rdd/0") (updatedRddJson \ "storageLevel").extract[String] should be ( @@ -204,7 +205,7 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B } val stageJson = getJson(sc.ui.get, "stages") stageJson.children.length should be (1) - (stageJson \ "status").extract[String] should be (StageStatus.FAILED.name()) + (stageJson.children.head \ "status").extract[String] should be (StageStatus.FAILED.name()) // Regression test for SPARK-2105 class NotSerializable @@ -325,11 +326,11 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B find(cssSelector(".progress-cell .progress")).get.text should be ("2/2 (1 failed)") } val jobJson = getJson(sc.ui.get, "jobs") - (jobJson \ "numTasks").extract[Int]should be (2) - (jobJson \ "numCompletedTasks").extract[Int] should be (3) - (jobJson \ "numFailedTasks").extract[Int] should be (1) - (jobJson \ "numCompletedStages").extract[Int] should be (2) - (jobJson \ "numFailedStages").extract[Int] should be (1) + (jobJson \\ "numTasks").extract[Int]should be (2) + (jobJson \\ "numCompletedTasks").extract[Int] should be (3) + (jobJson \\ "numFailedTasks").extract[Int] should be (1) + (jobJson \\ "numCompletedStages").extract[Int] should be (2) + (jobJson \\ 
"numFailedStages").extract[Int] should be (1) val stageJson = getJson(sc.ui.get, "stages") for { @@ -656,7 +657,7 @@ class UISeleniumSuite extends SparkFunSuite with WebBrowser with Matchers with B sc.ui.get.webUrl + "/api/v1/applications")) val appListJsonAst = JsonMethods.parse(appListRawJson) appListJsonAst.children.length should be (1) - val attempts = (appListJsonAst \ "attempts").children + val attempts = (appListJsonAst.children.head \ "attempts").children attempts.size should be (1) (attempts(0) \ "completed").extract[Boolean] should be (false) parseDate(attempts(0) \ "startTime") should be (sc.startTime) diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index a7fce2ede0ea5..2a298769be44c 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -122,9 +122,10 @@ jline-2.12.1.jar joda-time-2.9.3.jar jodd-core-3.5.2.jar jpam-1.1.jar -json4s-ast_2.11-3.2.11.jar -json4s-core_2.11-3.2.11.jar -json4s-jackson_2.11-3.2.11.jar +json4s-ast_2.11-3.5.3.jar +json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11-3.5.3.jar jsr305-1.3.9.jar jta-1.1.jar jtransforms-2.4.0.jar @@ -167,7 +168,6 @@ scala-library-2.11.8.jar scala-parser-combinators_2.11-1.0.4.jar scala-reflect-2.11.8.jar scala-xml_2.11-1.0.5.jar -scalap-2.11.8.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 94b2e98d85e74..abee326f283ab 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -122,9 +122,10 @@ jline-2.12.1.jar joda-time-2.9.3.jar jodd-core-3.5.2.jar jpam-1.1.jar -json4s-ast_2.11-3.2.11.jar -json4s-core_2.11-3.2.11.jar -json4s-jackson_2.11-3.2.11.jar +json4s-ast_2.11-3.5.3.jar +json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11-3.5.3.jar jsp-api-2.1.jar jsr305-1.3.9.jar jta-1.1.jar @@ -168,7 +169,6 @@ scala-library-2.11.8.jar scala-parser-combinators_2.11-1.0.4.jar scala-reflect-2.11.8.jar scala-xml_2.11-1.0.5.jar -scalap-2.11.8.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar diff --git a/pom.xml b/pom.xml index d14594aa4ccb0..666d5d7169a15 100644 --- a/pom.xml +++ b/pom.xml @@ -705,7 +705,13 @@ org.json4s json4s-jackson_${scala.binary.version} - 3.2.11 + 3.5.3 + + + com.fasterxml.jackson.core + * + + org.scala-lang @@ -732,11 +738,6 @@ scala-parser-combinators_${scala.binary.version} 1.0.4 - - org.scala-lang - scalap - ${scala.version} - jline From fc6fe8a1d0f161c4788f3db94de49a8669ba3bcc Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sat, 13 Jan 2018 10:01:44 -0600 Subject: [PATCH 330/356] [SPARK-22870][CORE] Dynamic allocation should allow 0 idle time ## What changes were proposed in this pull request? This pr to make `0` as a valid value for `spark.dynamicAllocation.executorIdleTimeout`. For details, see the jira description: https://issues.apache.org/jira/browse/SPARK-22870. ## How was this patch tested? N/A Author: Yuming Wang Author: Yuming Wang Closes #20080 from wangyum/SPARK-22870. 
--- .../scala/org/apache/spark/ExecutorAllocationManager.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala index 2e00dc8b49dd5..6c59038f2a6c1 100644 --- a/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala +++ b/core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala @@ -195,8 +195,11 @@ private[spark] class ExecutorAllocationManager( throw new SparkException( "spark.dynamicAllocation.sustainedSchedulerBacklogTimeout must be > 0!") } - if (executorIdleTimeoutS <= 0) { - throw new SparkException("spark.dynamicAllocation.executorIdleTimeout must be > 0!") + if (executorIdleTimeoutS < 0) { + throw new SparkException("spark.dynamicAllocation.executorIdleTimeout must be >= 0!") + } + if (cachedExecutorIdleTimeoutS < 0) { + throw new SparkException("spark.dynamicAllocation.cachedExecutorIdleTimeout must be >= 0!") } // Require external shuffle service for dynamic allocation // Otherwise, we may lose shuffle files when killing executors From bd4a21b4820c4ebaf750131574a6b2eeea36907e Mon Sep 17 00:00:00 2001 From: xubo245 <601450868@qq.com> Date: Sun, 14 Jan 2018 02:28:57 +0800 Subject: [PATCH 331/356] [SPARK-23036][SQL][TEST] Add withGlobalTempView for testing ## What changes were proposed in this pull request? Add withGlobalTempView when create global temp view, like withTempView and withView. And correct some improper usage. Please see jira. There are other similar place like that. I will fix it if community need. Please confirm it. ## How was this patch tested? no new test. Author: xubo245 <601450868@qq.com> Closes #20228 from xubo245/DropTempView. --- .../sql/execution/GlobalTempViewSuite.scala | 55 ++++++++----------- .../spark/sql/execution/SQLViewSuite.scala | 34 +++++++----- .../apache/spark/sql/test/SQLTestUtils.scala | 21 +++++-- 3 files changed, 59 insertions(+), 51 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala index cc943e0356f2a..dcc6fa6403f31 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/GlobalTempViewSuite.scala @@ -36,7 +36,7 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { test("basic semantic") { val expectedErrorMsg = "not found" - try { + withGlobalTempView("src") { sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1, 'a'") // If there is no database in table name, we should try local temp view first, if not found, @@ -79,19 +79,15 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { // We can also use Dataset API to replace global temp view Seq(2 -> "b").toDF("i", "j").createOrReplaceGlobalTempView("src") checkAnswer(spark.table(s"$globalTempDB.src"), Row(2, "b")) - } finally { - spark.catalog.dropGlobalTempView("src") } } test("global temp view is shared among all sessions") { - try { + withGlobalTempView("src") { sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1, 2") checkAnswer(spark.table(s"$globalTempDB.src"), Row(1, 2)) val newSession = spark.newSession() checkAnswer(newSession.table(s"$globalTempDB.src"), Row(1, 2)) - } finally { - spark.catalog.dropGlobalTempView("src") } } @@ -105,27 +101,25 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { test("CREATE GLOBAL TEMP VIEW 
USING") { withTempPath { path => - try { + withGlobalTempView("src") { Seq(1 -> "a").toDF("i", "j").write.parquet(path.getAbsolutePath) sql(s"CREATE GLOBAL TEMP VIEW src USING parquet OPTIONS (PATH '${path.toURI}')") checkAnswer(spark.table(s"$globalTempDB.src"), Row(1, "a")) sql(s"INSERT INTO $globalTempDB.src SELECT 2, 'b'") checkAnswer(spark.table(s"$globalTempDB.src"), Row(1, "a") :: Row(2, "b") :: Nil) - } finally { - spark.catalog.dropGlobalTempView("src") } } } test("CREATE TABLE LIKE should work for global temp view") { - try { - sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1 AS a, '2' AS b") - sql(s"CREATE TABLE cloned LIKE $globalTempDB.src") - val tableMeta = spark.sessionState.catalog.getTableMetadata(TableIdentifier("cloned")) - assert(tableMeta.schema == new StructType().add("a", "int", false).add("b", "string", false)) - } finally { - spark.catalog.dropGlobalTempView("src") - sql("DROP TABLE default.cloned") + withTable("cloned") { + withGlobalTempView("src") { + sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1 AS a, '2' AS b") + sql(s"CREATE TABLE cloned LIKE $globalTempDB.src") + val tableMeta = spark.sessionState.catalog.getTableMetadata(TableIdentifier("cloned")) + assert(tableMeta.schema == new StructType() + .add("a", "int", false).add("b", "string", false)) + } } } @@ -146,26 +140,25 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { } test("should lookup global temp view if and only if global temp db is specified") { - try { - sql("CREATE GLOBAL TEMP VIEW same_name AS SELECT 3, 4") - sql("CREATE TEMP VIEW same_name AS SELECT 1, 2") + withTempView("same_name") { + withGlobalTempView("same_name") { + sql("CREATE GLOBAL TEMP VIEW same_name AS SELECT 3, 4") + sql("CREATE TEMP VIEW same_name AS SELECT 1, 2") - checkAnswer(sql("SELECT * FROM same_name"), Row(1, 2)) + checkAnswer(sql("SELECT * FROM same_name"), Row(1, 2)) - // we never lookup global temp views if database is not specified in table name - spark.catalog.dropTempView("same_name") - intercept[AnalysisException](sql("SELECT * FROM same_name")) + // we never lookup global temp views if database is not specified in table name + spark.catalog.dropTempView("same_name") + intercept[AnalysisException](sql("SELECT * FROM same_name")) - // Use qualified name to lookup a global temp view. - checkAnswer(sql(s"SELECT * FROM $globalTempDB.same_name"), Row(3, 4)) - } finally { - spark.catalog.dropTempView("same_name") - spark.catalog.dropGlobalTempView("same_name") + // Use qualified name to lookup a global temp view. 
+ checkAnswer(sql(s"SELECT * FROM $globalTempDB.same_name"), Row(3, 4)) + } } } test("public Catalog should recognize global temp view") { - try { + withGlobalTempView("src") { sql("CREATE GLOBAL TEMP VIEW src AS SELECT 1, 2") assert(spark.catalog.tableExists(globalTempDB, "src")) @@ -175,8 +168,6 @@ class GlobalTempViewSuite extends QueryTest with SharedSQLContext { description = null, tableType = "TEMPORARY", isTemporary = true).toString) - } finally { - spark.catalog.dropGlobalTempView("src") } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 08a4a21b20f61..8c55758cfe38d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -69,21 +69,25 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { } test("create a permanent view on a temp view") { - withView("jtv1", "temp_jtv1", "global_temp_jtv1") { - sql("CREATE TEMPORARY VIEW temp_jtv1 AS SELECT * FROM jt WHERE id > 3") - var e = intercept[AnalysisException] { - sql("CREATE VIEW jtv1 AS SELECT * FROM temp_jtv1 WHERE id < 6") - }.getMessage - assert(e.contains("Not allowed to create a permanent view `jtv1` by " + - "referencing a temporary view `temp_jtv1`")) - - val globalTempDB = spark.sharedState.globalTempViewManager.database - sql("CREATE GLOBAL TEMP VIEW global_temp_jtv1 AS SELECT * FROM jt WHERE id > 0") - e = intercept[AnalysisException] { - sql(s"CREATE VIEW jtv1 AS SELECT * FROM $globalTempDB.global_temp_jtv1 WHERE id < 6") - }.getMessage - assert(e.contains(s"Not allowed to create a permanent view `jtv1` by referencing " + - s"a temporary view `global_temp`.`global_temp_jtv1`")) + withView("jtv1") { + withTempView("temp_jtv1") { + withGlobalTempView("global_temp_jtv1") { + sql("CREATE TEMPORARY VIEW temp_jtv1 AS SELECT * FROM jt WHERE id > 3") + var e = intercept[AnalysisException] { + sql("CREATE VIEW jtv1 AS SELECT * FROM temp_jtv1 WHERE id < 6") + }.getMessage + assert(e.contains("Not allowed to create a permanent view `jtv1` by " + + "referencing a temporary view `temp_jtv1`")) + + val globalTempDB = spark.sharedState.globalTempViewManager.database + sql("CREATE GLOBAL TEMP VIEW global_temp_jtv1 AS SELECT * FROM jt WHERE id > 0") + e = intercept[AnalysisException] { + sql(s"CREATE VIEW jtv1 AS SELECT * FROM $globalTempDB.global_temp_jtv1 WHERE id < 6") + }.getMessage + assert(e.contains(s"Not allowed to create a permanent view `jtv1` by referencing " + + s"a temporary view `global_temp`.`global_temp_jtv1`")) + } + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 904f9f2ad0b22..bc4a120f7042f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -254,13 +254,26 @@ private[sql] trait SQLTestUtilsBase } /** - * Drops temporary table `tableName` after calling `f`. + * Drops temporary view `viewNames` after calling `f`. */ - protected def withTempView(tableNames: String*)(f: => Unit): Unit = { + protected def withTempView(viewNames: String*)(f: => Unit): Unit = { try f finally { // If the test failed part way, we don't want to mask the failure by failing to remove - // temp tables that never got created. 
- try tableNames.foreach(spark.catalog.dropTempView) catch { + // temp views that never got created. + try viewNames.foreach(spark.catalog.dropTempView) catch { + case _: NoSuchTableException => + } + } + } + + /** + * Drops global temporary view `viewNames` after calling `f`. + */ + protected def withGlobalTempView(viewNames: String*)(f: => Unit): Unit = { + try f finally { + // If the test failed part way, we don't want to mask the failure by failing to remove + // global temp views that never got created. + try viewNames.foreach(spark.catalog.dropGlobalTempView) catch { case _: NoSuchTableException => } } From ba891ec993c616dc4249fc786c56ea82ed04a827 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sun, 14 Jan 2018 02:36:32 +0800 Subject: [PATCH 332/356] [SPARK-22790][SQL] add a configurable factor to describe HadoopFsRelation's size ## What changes were proposed in this pull request? as per discussion in https://github.com/apache/spark/pull/19864#discussion_r156847927 the current HadoopFsRelation is purely based on the underlying file size which is not accurate and makes the execution vulnerable to errors like OOM Users can enable CBO with the functionalities in https://github.com/apache/spark/pull/19864 to avoid this issue This JIRA proposes to add a configurable factor to sizeInBytes method in HadoopFsRelation class so that users can mitigate this problem without CBO ## How was this patch tested? Existing tests Author: CodingCat Author: Nan Zhu Closes #20072 from CodingCat/SPARK-22790. --- .../apache/spark/sql/internal/SQLConf.scala | 13 +++++- .../datasources/HadoopFsRelation.scala | 6 ++- .../datasources/HadoopFsRelationSuite.scala | 41 +++++++++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 36e802a9faa6f..6746fbcaf2483 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -249,7 +249,7 @@ object SQLConf { val CONSTRAINT_PROPAGATION_ENABLED = buildConf("spark.sql.constraintPropagation.enabled") .internal() .doc("When true, the query optimizer will infer and propagate data constraints in the query " + - "plan to optimize them. Constraint propagation can sometimes be computationally expensive" + + "plan to optimize them. 
Constraint propagation can sometimes be computationally expensive " + "for certain kinds of query plans (such as those with a large number of predicates and " + "aliases) which might negatively impact overall runtime.") .booleanConf @@ -263,6 +263,15 @@ object SQLConf { .booleanConf .createWithDefault(false) + val FILE_COMRESSION_FACTOR = buildConf("spark.sql.sources.fileCompressionFactor") + .internal() + .doc("When estimating the output data size of a table scan, multiply the file size with this " + + "factor as the estimated data size, in case the data is compressed in the file and lead to" + + " a heavily underestimated result.") + .doubleConf + .checkValue(_ > 0, "the value of fileDataSizeFactor must be larger than 0") + .createWithDefault(1.0) + val PARQUET_SCHEMA_MERGING_ENABLED = buildConf("spark.sql.parquet.mergeSchema") .doc("When true, the Parquet data source merges schemas collected from all data files, " + "otherwise the schema is picked from the summary file or a random data file " + @@ -1255,6 +1264,8 @@ class SQLConf extends Serializable with Logging { def escapedStringLiterals: Boolean = getConf(ESCAPED_STRING_LITERALS) + def fileCompressionFactor: Double = getConf(FILE_COMRESSION_FACTOR) + def stringRedationPattern: Option[Regex] = SQL_STRING_REDACTION_PATTERN.readFrom(reader) /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala index 89d8a85a9cbd2..6b34638529770 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala @@ -82,7 +82,11 @@ case class HadoopFsRelation( } } - override def sizeInBytes: Long = location.sizeInBytes + override def sizeInBytes: Long = { + val compressionFactor = sqlContext.conf.fileCompressionFactor + (location.sizeInBytes * compressionFactor).toLong + } + override def inputFiles: Array[String] = location.inputFiles } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala index caf03885e3873..c1f2c18d1417d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelationSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources import java.io.{File, FilenameFilter} import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec} import org.apache.spark.sql.test.SharedSQLContext class HadoopFsRelationSuite extends QueryTest with SharedSQLContext { @@ -39,4 +40,44 @@ class HadoopFsRelationSuite extends QueryTest with SharedSQLContext { assert(df.queryExecution.logical.stats.sizeInBytes === BigInt(totalSize)) } } + + test("SPARK-22790: spark.sql.sources.compressionFactor takes effect") { + import testImplicits._ + Seq(1.0, 0.5).foreach { compressionFactor => + withSQLConf("spark.sql.sources.fileCompressionFactor" -> compressionFactor.toString, + "spark.sql.autoBroadcastJoinThreshold" -> "400") { + withTempPath { workDir => + // the file size is 740 bytes + val workDirPath = workDir.getAbsolutePath + val data1 = Seq(100, 200, 300, 400).toDF("count") + data1.write.parquet(workDirPath + "/data1") + val 
df1FromFile = spark.read.parquet(workDirPath + "/data1") + val data2 = Seq(100, 200, 300, 400).toDF("count") + data2.write.parquet(workDirPath + "/data2") + val df2FromFile = spark.read.parquet(workDirPath + "/data2") + val joinedDF = df1FromFile.join(df2FromFile, Seq("count")) + if (compressionFactor == 0.5) { + val bJoinExec = joinedDF.queryExecution.executedPlan.collect { + case bJoin: BroadcastHashJoinExec => bJoin + } + assert(bJoinExec.nonEmpty) + val smJoinExec = joinedDF.queryExecution.executedPlan.collect { + case smJoin: SortMergeJoinExec => smJoin + } + assert(smJoinExec.isEmpty) + } else { + // compressionFactor is 1.0 + val bJoinExec = joinedDF.queryExecution.executedPlan.collect { + case bJoin: BroadcastHashJoinExec => bJoin + } + assert(bJoinExec.isEmpty) + val smJoinExec = joinedDF.queryExecution.executedPlan.collect { + case smJoin: SortMergeJoinExec => smJoin + } + assert(smJoinExec.nonEmpty) + } + } + } + } + } } From 0066d6f6fa604817468471832968d4339f71c5cb Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Sun, 14 Jan 2018 05:39:38 +0800 Subject: [PATCH 333/356] [SPARK-21213][SQL][FOLLOWUP] Use compatible types for comparisons in compareAndGetNewStats ## What changes were proposed in this pull request? This pr fixed code to compare values in `compareAndGetNewStats`. The test below fails in the current master; ``` val oldStats2 = CatalogStatistics(sizeInBytes = BigInt(Long.MaxValue) * 2) val newStats5 = CommandUtils.compareAndGetNewStats( Some(oldStats2), newTotalSize = BigInt(Long.MaxValue) * 2, None) assert(newStats5.isEmpty) ``` ## How was this patch tested? Added some tests in `CommandUtilsSuite`. Author: Takeshi Yamamuro Closes #20245 from maropu/SPARK-21213-FOLLOWUP. --- .../sql/execution/command/CommandUtils.scala | 4 +- .../execution/command/CommandUtilsSuite.scala | 56 +++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala index 1a0d67fc71fbc..c27048626c8eb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala @@ -116,8 +116,8 @@ object CommandUtils extends Logging { oldStats: Option[CatalogStatistics], newTotalSize: BigInt, newRowCount: Option[BigInt]): Option[CatalogStatistics] = { - val oldTotalSize = oldStats.map(_.sizeInBytes.toLong).getOrElse(-1L) - val oldRowCount = oldStats.flatMap(_.rowCount.map(_.toLong)).getOrElse(-1L) + val oldTotalSize = oldStats.map(_.sizeInBytes).getOrElse(BigInt(-1)) + val oldRowCount = oldStats.flatMap(_.rowCount).getOrElse(BigInt(-1)) var newStats: Option[CatalogStatistics] = None if (newTotalSize >= 0 && newTotalSize != oldTotalSize) { newStats = Some(CatalogStatistics(sizeInBytes = newTotalSize)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala new file mode 100644 index 0000000000000..f3e15189a6418 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/CommandUtilsSuite.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.command + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.catalog.CatalogStatistics + +class CommandUtilsSuite extends SparkFunSuite { + + test("Check if compareAndGetNewStats returns correct results") { + val oldStats1 = CatalogStatistics(sizeInBytes = 10, rowCount = Some(100)) + val newStats1 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = 10, newRowCount = Some(100)) + assert(newStats1.isEmpty) + val newStats2 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = -1, newRowCount = None) + assert(newStats2.isEmpty) + val newStats3 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = 20, newRowCount = Some(-1)) + assert(newStats3.isDefined) + newStats3.foreach { stat => + assert(stat.sizeInBytes === 20) + assert(stat.rowCount.isEmpty) + } + val newStats4 = CommandUtils.compareAndGetNewStats( + Some(oldStats1), newTotalSize = -1, newRowCount = Some(200)) + assert(newStats4.isDefined) + newStats4.foreach { stat => + assert(stat.sizeInBytes === 10) + assert(stat.rowCount.isDefined && stat.rowCount.get === 200) + } + } + + test("Check if compareAndGetNewStats can handle large values") { + // Tests for large values + val oldStats2 = CatalogStatistics(sizeInBytes = BigInt(Long.MaxValue) * 2) + val newStats5 = CommandUtils.compareAndGetNewStats( + Some(oldStats2), newTotalSize = BigInt(Long.MaxValue) * 2, None) + assert(newStats5.isEmpty) + } +} From afae8f2bc82597593595af68d1aa2d802210ea8b Mon Sep 17 00:00:00 2001 From: hyukjinkwon Date: Sun, 14 Jan 2018 11:26:49 +0900 Subject: [PATCH 334/356] [SPARK-22959][PYTHON] Configuration to select the modules for daemon and worker in PySpark ## What changes were proposed in this pull request? We are now forced to use `pyspark/daemon.py` and `pyspark/worker.py` in PySpark. This doesn't allow a custom modification for it (well, maybe we can still do this in a super hacky way though, for example, setting Python executable that has the custom modification). Because of this, for example, it's sometimes hard to debug what happens inside Python worker processes. This is actually related with [SPARK-7721](https://issues.apache.org/jira/browse/SPARK-7721) too as somehow Coverage is unable to detect the coverage from `os.fork`. If we have some custom fixes to force the coverage, it works fine. This is also related with [SPARK-20368](https://issues.apache.org/jira/browse/SPARK-20368). This JIRA describes Sentry support which (roughly) needs some changes within worker side. With this configuration advanced users will be able to do a lot of pluggable workarounds and we can meet such potential needs in the future. 
As an example, let's say if I configure the module `coverage_daemon` and had `coverage_daemon.py` in the python path: ```python import os from pyspark import daemon if "COVERAGE_PROCESS_START" in os.environ: from pyspark.worker import main def _cov_wrapped(*args, **kwargs): import coverage cov = coverage.coverage( config_file=os.environ["COVERAGE_PROCESS_START"]) cov.start() try: main(*args, **kwargs) finally: cov.stop() cov.save() daemon.worker_main = _cov_wrapped if __name__ == '__main__': daemon.manager() ``` I can track the coverages in worker side too. More importantly, we can leave the main code intact but allow some workarounds. ## How was this patch tested? Manually tested. Author: hyukjinkwon Closes #20151 from HyukjinKwon/configuration-daemon-worker. --- .../api/python/PythonWorkerFactory.scala | 41 +++++++++++++++---- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala index f53c6178047f5..30976ac752a8a 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala @@ -34,10 +34,10 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String import PythonWorkerFactory._ - // Because forking processes from Java is expensive, we prefer to launch a single Python daemon - // (pyspark/daemon.py) and tell it to fork new workers for our tasks. This daemon currently - // only works on UNIX-based systems now because it uses signals for child management, so we can - // also fall back to launching workers (pyspark/worker.py) directly. + // Because forking processes from Java is expensive, we prefer to launch a single Python daemon, + // pyspark/daemon.py (by default) and tell it to fork new workers for our tasks. This daemon + // currently only works on UNIX-based systems now because it uses signals for child management, + // so we can also fall back to launching workers, pyspark/worker.py (by default) directly. val useDaemon = { val useDaemonEnabled = SparkEnv.get.conf.getBoolean("spark.python.use.daemon", true) @@ -45,6 +45,28 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String !System.getProperty("os.name").startsWith("Windows") && useDaemonEnabled } + // WARN: Both configurations, 'spark.python.daemon.module' and 'spark.python.worker.module' are + // for very advanced users and they are experimental. This should be considered + // as expert-only option, and shouldn't be used before knowing what it means exactly. + + // This configuration indicates the module to run the daemon to execute its Python workers. + val daemonModule = SparkEnv.get.conf.getOption("spark.python.daemon.module").map { value => + logInfo( + s"Python daemon module in PySpark is set to [$value] in 'spark.python.daemon.module', " + + "using this to start the daemon up. Note that this configuration only has an effect when " + + "'spark.python.use.daemon' is enabled and the platform is not Windows.") + value + }.getOrElse("pyspark.daemon") + + // This configuration indicates the module to run each Python worker. + val workerModule = SparkEnv.get.conf.getOption("spark.python.worker.module").map { value => + logInfo( + s"Python worker module in PySpark is set to [$value] in 'spark.python.worker.module', " + + "using this to start the worker up. 
Note that this configuration only has an effect when " + + "'spark.python.use.daemon' is disabled or the platform is Windows.") + value + }.getOrElse("pyspark.worker") + var daemon: Process = null val daemonHost = InetAddress.getByAddress(Array(127, 0, 0, 1)) var daemonPort: Int = 0 @@ -74,8 +96,9 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String } /** - * Connect to a worker launched through pyspark/daemon.py, which forks python processes itself - * to avoid the high cost of forking from Java. This currently only works on UNIX-based systems. + * Connect to a worker launched through pyspark/daemon.py (by default), which forks python + * processes itself to avoid the high cost of forking from Java. This currently only works + * on UNIX-based systems. */ private def createThroughDaemon(): Socket = { @@ -108,7 +131,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String } /** - * Launch a worker by executing worker.py directly and telling it to connect to us. + * Launch a worker by executing worker.py (by default) directly and telling it to connect to us. */ private def createSimpleWorker(): Socket = { var serverSocket: ServerSocket = null @@ -116,7 +139,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String serverSocket = new ServerSocket(0, 1, InetAddress.getByAddress(Array(127, 0, 0, 1))) // Create and start the worker - val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", "pyspark.worker")) + val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", workerModule)) val workerEnv = pb.environment() workerEnv.putAll(envVars.asJava) workerEnv.put("PYTHONPATH", pythonPath) @@ -159,7 +182,7 @@ private[spark] class PythonWorkerFactory(pythonExec: String, envVars: Map[String try { // Create and start the daemon - val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", "pyspark.daemon")) + val pb = new ProcessBuilder(Arrays.asList(pythonExec, "-m", daemonModule)) val workerEnv = pb.environment() workerEnv.putAll(envVars.asJava) workerEnv.put("PYTHONPATH", pythonPath) From c3548d11c3c57e8f2c6ebd9d2d6a3924ddcd3cba Mon Sep 17 00:00:00 2001 From: foxish Date: Sat, 13 Jan 2018 21:34:28 -0800 Subject: [PATCH 335/356] [SPARK-23063][K8S] K8s changes for publishing scripts (and a couple of other misses) ## What changes were proposed in this pull request? Including the `-Pkubernetes` flag in a few places it was missed. ## How was this patch tested? checkstyle, mima through manual tests. Author: foxish Closes #20256 from foxish/SPARK-23063. 
--- dev/create-release/release-build.sh | 4 ++-- dev/create-release/releaseutils.py | 2 ++ dev/deps/spark-deps-hadoop-2.6 | 11 +++++++++++ dev/deps/spark-deps-hadoop-2.7 | 11 +++++++++++ dev/lint-java | 2 +- dev/mima | 2 +- dev/scalastyle | 1 + dev/test-dependencies.sh | 2 +- 8 files changed, 30 insertions(+), 5 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index c71137468054f..a3579f21fc539 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -92,9 +92,9 @@ MVN="build/mvn --force" # Hive-specific profiles for some builds HIVE_PROFILES="-Phive -Phive-thriftserver" # Profiles for publishing snapshots and release to Maven Central -PUBLISH_PROFILES="-Pmesos -Pyarn -Pflume $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" +PUBLISH_PROFILES="-Pmesos -Pyarn -Pkubernetes -Pflume $HIVE_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" # Profiles for building binary releases -BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Pflume -Psparkr" +BASE_RELEASE_PROFILES="-Pmesos -Pyarn -Pkubernetes -Pflume -Psparkr" # Scala 2.11 only profiles for some builds SCALA_2_11_PROFILES="-Pkafka-0-8" # Scala 2.12 only profiles for some builds diff --git a/dev/create-release/releaseutils.py b/dev/create-release/releaseutils.py index 730138195e5fe..32f6cbb29f0be 100755 --- a/dev/create-release/releaseutils.py +++ b/dev/create-release/releaseutils.py @@ -185,6 +185,8 @@ def get_commits(tag): "graphx": "GraphX", "input/output": CORE_COMPONENT, "java api": "Java API", + "k8s": "Kubernetes", + "kubernetes": "Kubernetes", "mesos": "Mesos", "ml": "MLlib", "mllib": "MLlib", diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 2a298769be44c..48e54568e6fc6 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -17,6 +17,7 @@ arpack_combined_all-0.1.jar arrow-format-0.8.0.jar arrow-memory-0.8.0.jar arrow-vector-0.8.0.jar +automaton-1.11-8.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -60,6 +61,7 @@ datanucleus-rdbms-3.2.9.jar derby-10.12.1.1.jar eigenbase-properties-1.1.5.jar flatbuffers-1.2.0-3f79e055.jar +generex-1.0.1.jar gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar @@ -91,8 +93,10 @@ jackson-annotations-2.6.7.jar jackson-core-2.6.7.jar jackson-core-asl-1.9.13.jar jackson-databind-2.6.7.1.jar +jackson-dataformat-yaml-2.6.7.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar +jackson-module-jaxb-annotations-2.6.7.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar @@ -131,10 +135,13 @@ jta-1.1.jar jtransforms-2.4.0.jar jul-to-slf4j-1.7.16.jar kryo-shaded-3.0.3.jar +kubernetes-client-3.0.0.jar +kubernetes-model-2.0.0.jar leveldbjni-all-1.8.jar libfb303-0.9.3.jar libthrift-0.9.3.jar log4j-1.2.17.jar +logging-interceptor-3.8.1.jar lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar @@ -147,6 +154,8 @@ minlog-1.3.0.jar netty-3.9.9.Final.jar netty-all-4.1.17.Final.jar objenesis-2.1.jar +okhttp-3.8.1.jar +okio-1.13.0.jar opencsv-2.3.jar orc-core-1.4.1-nohive.jar orc-mapreduce-1.4.1-nohive.jar @@ -171,6 +180,7 @@ scala-xml_2.11-1.0.5.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar +snakeyaml-1.15.jar snappy-0.2.jar snappy-java-1.1.2.6.jar spire-macros_2.11-0.13.0.jar @@ -186,5 +196,6 @@ xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar xmlenc-0.52.jar xz-1.0.jar +zjsonpatch-0.3.0.jar zookeeper-3.4.6.jar zstd-jni-1.3.2-2.jar diff --git a/dev/deps/spark-deps-hadoop-2.7 
b/dev/deps/spark-deps-hadoop-2.7 index abee326f283ab..1807a77900e52 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -17,6 +17,7 @@ arpack_combined_all-0.1.jar arrow-format-0.8.0.jar arrow-memory-0.8.0.jar arrow-vector-0.8.0.jar +automaton-1.11-8.jar avro-1.7.7.jar avro-ipc-1.7.7.jar avro-mapred-1.7.7-hadoop2.jar @@ -60,6 +61,7 @@ datanucleus-rdbms-3.2.9.jar derby-10.12.1.1.jar eigenbase-properties-1.1.5.jar flatbuffers-1.2.0-3f79e055.jar +generex-1.0.1.jar gson-2.2.4.jar guava-14.0.1.jar guice-3.0.jar @@ -91,8 +93,10 @@ jackson-annotations-2.6.7.jar jackson-core-2.6.7.jar jackson-core-asl-1.9.13.jar jackson-databind-2.6.7.1.jar +jackson-dataformat-yaml-2.6.7.jar jackson-jaxrs-1.9.13.jar jackson-mapper-asl-1.9.13.jar +jackson-module-jaxb-annotations-2.6.7.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar @@ -132,10 +136,13 @@ jta-1.1.jar jtransforms-2.4.0.jar jul-to-slf4j-1.7.16.jar kryo-shaded-3.0.3.jar +kubernetes-client-3.0.0.jar +kubernetes-model-2.0.0.jar leveldbjni-all-1.8.jar libfb303-0.9.3.jar libthrift-0.9.3.jar log4j-1.2.17.jar +logging-interceptor-3.8.1.jar lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar @@ -148,6 +155,8 @@ minlog-1.3.0.jar netty-3.9.9.Final.jar netty-all-4.1.17.Final.jar objenesis-2.1.jar +okhttp-3.8.1.jar +okio-1.13.0.jar opencsv-2.3.jar orc-core-1.4.1-nohive.jar orc-mapreduce-1.4.1-nohive.jar @@ -172,6 +181,7 @@ scala-xml_2.11-1.0.5.jar shapeless_2.11-2.3.2.jar slf4j-api-1.7.16.jar slf4j-log4j12-1.7.16.jar +snakeyaml-1.15.jar snappy-0.2.jar snappy-java-1.1.2.6.jar spire-macros_2.11-0.13.0.jar @@ -187,5 +197,6 @@ xbean-asm5-shaded-4.4.jar xercesImpl-2.9.1.jar xmlenc-0.52.jar xz-1.0.jar +zjsonpatch-0.3.0.jar zookeeper-3.4.6.jar zstd-jni-1.3.2-2.jar diff --git a/dev/lint-java b/dev/lint-java index c2e80538ef2a5..1f0b0c8379ed0 100755 --- a/dev/lint-java +++ b/dev/lint-java @@ -20,7 +20,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" -ERRORS=$($SCRIPT_DIR/../build/mvn -Pkinesis-asl -Pmesos -Pyarn -Phive -Phive-thriftserver checkstyle:check | grep ERROR) +ERRORS=$($SCRIPT_DIR/../build/mvn -Pkinesis-asl -Pmesos -Pkubernetes -Pyarn -Phive -Phive-thriftserver checkstyle:check | grep ERROR) if test ! 
-z "$ERRORS"; then echo -e "Checkstyle checks failed at following occurrences:\n$ERRORS" diff --git a/dev/mima b/dev/mima index 1e3ca9700bc07..cd2694ff4d3de 100755 --- a/dev/mima +++ b/dev/mima @@ -24,7 +24,7 @@ set -e FWDIR="$(cd "`dirname "$0"`"/..; pwd)" cd "$FWDIR" -SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" +SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pkubernetes -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" TOOLS_CLASSPATH="$(build/sbt -DcopyDependencies=false "export tools/fullClasspath" | tail -n1)" OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)" diff --git a/dev/scalastyle b/dev/scalastyle index 89ecc8abd6f8c..b8053df05fa2b 100755 --- a/dev/scalastyle +++ b/dev/scalastyle @@ -24,6 +24,7 @@ ERRORS=$(echo -e "q\n" \ -Pkinesis-asl \ -Pmesos \ -Pkafka-0-8 \ + -Pkubernetes \ -Pyarn \ -Pflume \ -Phive \ diff --git a/dev/test-dependencies.sh b/dev/test-dependencies.sh index 58b295d4f6e00..3bf7618e1ea96 100755 --- a/dev/test-dependencies.sh +++ b/dev/test-dependencies.sh @@ -29,7 +29,7 @@ export LC_ALL=C # TODO: This would be much nicer to do in SBT, once SBT supports Maven-style resolution. # NOTE: These should match those in the release publishing script -HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pyarn -Pflume -Phive" +HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pkubernetes -Pyarn -Pflume -Phive" MVN="build/mvn" HADOOP_PROFILES=( hadoop-2.6 From 7a3d0aad2b89aef54f7dd580397302e9ff984d9d Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 13 Jan 2018 23:26:12 -0800 Subject: [PATCH 336/356] [SPARK-23038][TEST] Update docker/spark-test (JDK/OS) ## What changes were proposed in this pull request? This PR aims to update the followings in `docker/spark-test`. - JDK7 -> JDK8 Spark 2.2+ supports JDK8 only. - Ubuntu 12.04.5 LTS(precise) -> Ubuntu 16.04.3 LTS(xeniel) The end of life of `precise` was April 28, 2017. ## How was this patch tested? Manual. * Master ``` $ cd external/docker $ ./build $ export SPARK_HOME=... $ docker run -v $SPARK_HOME:/opt/spark spark-test-master CONTAINER_IP=172.17.0.3 ... 18/01/11 06:50:25 INFO MasterWebUI: Bound MasterWebUI to 172.17.0.3, and started at http://172.17.0.3:8080 18/01/11 06:50:25 INFO Utils: Successfully started service on port 6066. 18/01/11 06:50:25 INFO StandaloneRestServer: Started REST server for submitting applications on port 6066 18/01/11 06:50:25 INFO Master: I have been elected leader! New state: ALIVE ``` * Slave ``` $ docker run -v $SPARK_HOME:/opt/spark spark-test-worker spark://172.17.0.3:7077 CONTAINER_IP=172.17.0.4 ... 18/01/11 06:51:54 INFO Worker: Successfully registered with master spark://172.17.0.3:7077 ``` After slave starts, master will show ``` 18/01/11 06:51:54 INFO Master: Registering worker 172.17.0.4:8888 with 4 cores, 1024.0 MB RAM ``` Author: Dongjoon Hyun Closes #20230 from dongjoon-hyun/SPARK-23038. --- external/docker/spark-test/base/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/external/docker/spark-test/base/Dockerfile b/external/docker/spark-test/base/Dockerfile index 5a95a9387c310..c70cd71367679 100644 --- a/external/docker/spark-test/base/Dockerfile +++ b/external/docker/spark-test/base/Dockerfile @@ -15,14 +15,14 @@ # limitations under the License. 
# -FROM ubuntu:precise +FROM ubuntu:xenial # Upgrade package index -# install a few other useful packages plus Open Jdk 7 +# install a few other useful packages plus Open Jdk 8 # Remove unneeded /var/lib/apt/lists/* after install to reduce the # docker image size (by ~30MB) RUN apt-get update && \ - apt-get install -y less openjdk-7-jre-headless net-tools vim-tiny sudo openssh-server && \ + apt-get install -y less openjdk-8-jre-headless iproute2 vim-tiny sudo openssh-server && \ rm -rf /var/lib/apt/lists/* ENV SCALA_VERSION 2.11.8 From 66738d29c59871b29d26fc3756772b95ef536248 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 14 Jan 2018 19:43:10 +0900 Subject: [PATCH 337/356] [SPARK-23069][DOCS][SPARKR] fix R doc for describe missing text ## What changes were proposed in this pull request? fix doc truncated ## How was this patch tested? manually Author: Felix Cheung Closes #20263 from felixcheung/r23docfix. --- R/pkg/R/DataFrame.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 9956f7eda91e6..6caa125e1e14a 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3054,10 +3054,10 @@ setMethod("describe", #' \item stddev #' \item min #' \item max -#' \item arbitrary approximate percentiles specified as a percentage (eg, "75%") +#' \item arbitrary approximate percentiles specified as a percentage (eg, "75\%") #' } #' If no statistics are given, this function computes count, mean, stddev, min, -#' approximate quartiles (percentiles at 25%, 50%, and 75%), and max. +#' approximate quartiles (percentiles at 25\%, 50\%, and 75\%), and max. #' This function is meant for exploratory data analysis, as we make no guarantee about the #' backward compatibility of the schema of the resulting Dataset. If you want to #' programmatically compute summary statistics, use the \code{agg} function instead. @@ -4019,9 +4019,9 @@ setMethod("broadcast", #' #' Spark will use this watermark for several purposes: #' \itemize{ -#' \item{-} To know when a given time window aggregation can be finalized and thus can be emitted +#' \item To know when a given time window aggregation can be finalized and thus can be emitted #' when using output modes that do not allow updates. -#' \item{-} To minimize the amount of state that we need to keep for on-going aggregations. +#' \item To minimize the amount of state that we need to keep for on-going aggregations. #' } #' The current watermark is computed by looking at the \code{MAX(eventTime)} seen across #' all of the partitions in the query minus a user specified \code{delayThreshold}. Due to the cost From 990f05c80347c6eec2ee06823cff587c9ea90b49 Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Sun, 14 Jan 2018 22:26:21 +0800 Subject: [PATCH 338/356] [SPARK-23021][SQL] AnalysisBarrier should override innerChildren to print correct explain output ## What changes were proposed in this pull request? 
`AnalysisBarrier` in the current master cuts off explain results for parsed logical plans; ``` scala> Seq((1, 1)).toDF("a", "b").groupBy("a").count().sample(0.1).explain(true) == Parsed Logical Plan == Sample 0.0, 0.1, false, -7661439431999668039 +- AnalysisBarrier Aggregate [a#5], [a#5, count(1) AS count#14L] ``` To fix this, `AnalysisBarrier` needs to override `innerChildren` and this pr changed the output to; ``` == Parsed Logical Plan == Sample 0.0, 0.1, false, -5086223488015741426 +- AnalysisBarrier +- Aggregate [a#5], [a#5, count(1) AS count#14L] +- Project [_1#2 AS a#5, _2#3 AS b#6] +- LocalRelation [_1#2, _2#3] ``` ## How was this patch tested? Added tests in `DataFrameSuite`. Author: Takeshi Yamamuro Closes #20247 from maropu/SPARK-23021-2. --- .../plans/logical/basicLogicalOperators.scala | 1 + .../sql/hive/execution/HiveExplainSuite.scala | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala index 95e099c340af1..a4fca790dd086 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala @@ -903,6 +903,7 @@ case class Deduplicate( * This analysis barrier will be removed at the end of analysis stage. */ case class AnalysisBarrier(child: LogicalPlan) extends LeafNode { + override protected def innerChildren: Seq[LogicalPlan] = Seq(child) override def output: Seq[Attribute] = child.output override def isStreaming: Boolean = child.isStreaming override def doCanonicalize(): LogicalPlan = child.canonicalized diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala index dfabf1ec2a22a..a4273de5fe260 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala @@ -171,4 +171,21 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto sql("EXPLAIN EXTENDED CODEGEN SELECT 1") } } + + test("SPARK-23021 AnalysisBarrier should not cut off explain output for parsed logical plans") { + val df = Seq((1, 1)).toDF("a", "b").groupBy("a").count().limit(1) + val outputStream = new java.io.ByteArrayOutputStream() + Console.withOut(outputStream) { + df.explain(true) + } + assert(outputStream.toString.replaceAll("""#\d+""", "#0").contains( + s"""== Parsed Logical Plan == + |GlobalLimit 1 + |+- LocalLimit 1 + | +- AnalysisBarrier + | +- Aggregate [a#0], [a#0, count(1) AS count#0L] + | +- Project [_1#0 AS a#0, _2#0 AS b#0] + | +- LocalRelation [_1#0, _2#0] + |""".stripMargin)) + } } From 60eeecd7760aee6ce2fd207c83ae40054eadaf83 Mon Sep 17 00:00:00 2001 From: Sandor Murakozi Date: Sun, 14 Jan 2018 08:32:35 -0600 Subject: [PATCH 339/356] [SPARK-23051][CORE] Fix for broken job description in Spark UI ## What changes were proposed in this pull request? In 2.2, Spark UI displayed the stage description if the job description was not set. This functionality was broken, the GUI has shown no description in this case. In addition, the code uses jobName and jobDescription instead of stageName and stageDescription when JobTableRowData is created. 
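A minimal way to see the symptom (a hypothetical repro sketch in the spirit of the bug report, assuming a plain spark-shell session where `sc` is the active SparkContext): run any job without setting a job description and open the Jobs page; before this fix the Description column stayed empty instead of falling back to the stage description.
```scala
// No job description is set explicitly, so the Jobs page should
// fall back to the stage description for this job.
sc.parallelize(1 to 100000, 10).map(_ * 2).count()
```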
In this PR the logic producing values for the job rows was modified to find the latest stage attempt for the job and use it as a fallback when the job description is missing. The stage name and stage description are also set from the stage, and the job name/description is used only as a fallback. ## How was this patch tested? Manual testing of the UI, using the code in the bug report. Author: Sandor Murakozi Closes #20251 from smurakozi/SPARK-23051. --- .../apache/spark/ui/jobs/AllJobsPage.scala | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index 37e3b3b304a63..ff916bb6a5759 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -65,12 +65,10 @@ private[ui] class AllJobsPage(parent: JobsTab, store: AppStatusStore) extends We }.map { job => val jobId = job.jobId val status = job.status - val displayJobDescription = - if (job.description.isEmpty) { - job.name - } else { - UIUtils.makeDescription(job.description.get, "", plainText = true).text - } + val jobDescription = store.lastStageAttempt(job.stageIds.max).description + val displayJobDescription = jobDescription + .map(UIUtils.makeDescription(_, "", plainText = true).text) + .getOrElse("") val submissionTime = job.submissionTime.get.getTime() val completionTime = job.completionTime.map(_.getTime()).getOrElse(System.currentTimeMillis()) val classNameByStatus = status match { @@ -429,20 +427,23 @@ private[ui] class JobDataSource( val formattedDuration = duration.map(d => UIUtils.formatDuration(d)).getOrElse("Unknown") val submissionTime = jobData.submissionTime val formattedSubmissionTime = submissionTime.map(UIUtils.formatDate).getOrElse("Unknown") - val jobDescription = UIUtils.makeDescription(jobData.description.getOrElse(""), - basePath, plainText = false) + val lastStageAttempt = store.lastStageAttempt(jobData.stageIds.max) + val lastStageDescription = lastStageAttempt.description.getOrElse("") + + val formattedJobDescription = + UIUtils.makeDescription(lastStageDescription, basePath, plainText = false) val detailUrl = "%s/jobs/job?id=%s".format(basePath, jobData.jobId) new JobTableRowData( jobData, - jobData.name, - jobData.description.getOrElse(jobData.name), + lastStageAttempt.name, + lastStageDescription, duration.getOrElse(-1), formattedDuration, submissionTime.map(_.getTime()).getOrElse(-1L), formattedSubmissionTime, - jobDescription, + formattedJobDescription, detailUrl ) } From 42a1a15d739890bdfbb367ef94198b19e98ffcb7 Mon Sep 17 00:00:00 2001 From: guoxiaolong Date: Mon, 15 Jan 2018 02:02:49 +0800 Subject: [PATCH 340/356] [SPARK-22999][SQL] 'show databases like' command can remove the like keyword ## What changes were proposed in this pull request? The grammar for this command is `SHOW DATABASES (LIKE pattern = STRING)?`. With this change the LIKE keyword can be omitted, matching the SHOW TABLES command: just as SHOW TABLES 'test *' and SHOW TABLES like 'test *' can both be used, SHOW DATABASES 'test *' and SHOW DATABASES like 'test *' can now both be used. ## How was this patch tested? Unit tests and manual tests. Author: guoxiaolong Closes #20194 from guoxiaolongzte/SPARK-22999.
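A short, hypothetical spark-shell illustration of the behaviour the new DDLSuite assertion covers (assuming an active SparkSession `spark`); before this change the form without LIKE did not parse:
```scala
spark.sql("CREATE DATABASE IF NOT EXISTS showdb1a")

// Both forms are accepted after this change.
spark.sql("SHOW DATABASES LIKE '*db1a'").show()
spark.sql("SHOW DATABASES '*db1a'").show()
```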
--- .../antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 | 2 +- .../org/apache/spark/sql/execution/command/DDLSuite.scala | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 6daf01d98426c..39d5e4ed56628 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -141,7 +141,7 @@ statement (LIKE? pattern=STRING)? #showTables | SHOW TABLE EXTENDED ((FROM | IN) db=identifier)? LIKE pattern=STRING partitionSpec? #showTable - | SHOW DATABASES (LIKE pattern=STRING)? #showDatabases + | SHOW DATABASES (LIKE? pattern=STRING)? #showDatabases | SHOW TBLPROPERTIES table=tableIdentifier ('(' key=tablePropertyKey ')')? #showTblProperties | SHOW COLUMNS (FROM | IN) tableIdentifier diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 591510c1d8283..2b4b7c137428a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -991,6 +991,10 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { sql("SHOW DATABASES LIKE '*db1A'"), Row("showdb1a") :: Nil) + checkAnswer( + sql("SHOW DATABASES '*db1A'"), + Row("showdb1a") :: Nil) + checkAnswer( sql("SHOW DATABASES LIKE 'showdb1A'"), Row("showdb1a") :: Nil) From b98ffa4d6dabaf787177d3f14b200fc4b118c7ce Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Mon, 15 Jan 2018 10:55:21 +0800 Subject: [PATCH 341/356] [SPARK-23054][SQL] Fix incorrect results of casting UserDefinedType to String ## What changes were proposed in this pull request? This PR fixes incorrect results when casting `UserDefinedType`s to strings: ``` >>> from pyspark.ml.classification import MultilayerPerceptronClassifier >>> from pyspark.ml.linalg import Vectors >>> df = spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])), (1.0, Vectors.dense([0.0, 1.0]))], ["label", "features"]) >>> df.selectExpr("CAST(features AS STRING)").show(truncate = False) +-------------------------------------------+ |features | +-------------------------------------------+ |[6,1,0,0,2800000020,2,0,0,0] | |[6,1,0,0,2800000020,2,0,0,3ff0000000000000]| +-------------------------------------------+ ``` The root cause is that `Cast` handles the input data as `UserDefinedType.sqlType` (the underlying storage type), so the data should first be passed into `UserDefinedType.deserialize` and then converted with `toString`. This PR changes the result to: ``` +---------+ |features | +---------+ |[0.0,0.0]| |[0.0,1.0]| +---------+ ``` ## How was this patch tested? Added tests in `UserDefinedTypeSuite`. Author: Takeshi Yamamuro Closes #20246 from maropu/SPARK-23054.
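To make the mechanics concrete, here is a minimal sketch built on the `UDT.MyDenseVector` test classes touched below (illustrative only, not the production code path in `Cast`):
```
// A UDT value is stored in its sqlType (underlying storage) form, so producing a
// readable string requires deserializing back to the user class before calling toString.
val udt = new UDT.MyDenseVectorUDT()
val vector = new UDT.MyDenseVector(Array(1.0, 3.0, 5.0, 7.0, 9.0))
val stored = udt.serialize(vector)               // sqlType representation (ArrayData)
val readable = udt.deserialize(stored).toString  // "(1.0, 3.0, 5.0, 7.0, 9.0)"
```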
--- .../spark/sql/catalyst/expressions/Cast.scala | 7 +++++++ .../apache/spark/sql/UserDefinedTypeSuite.scala | 15 +++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index f21aa1e9e3135..a95ebe301b9d1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -282,6 +282,8 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String builder.append("]") builder.build() }) + case udt: UserDefinedType[_] => + buildCast[Any](_, o => UTF8String.fromString(udt.deserialize(o).toString)) case _ => buildCast[Any](_, o => UTF8String.fromString(o.toString)) } @@ -836,6 +838,11 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String |$evPrim = $buffer.build(); """.stripMargin } + case udt: UserDefinedType[_] => + val udtRef = ctx.addReferenceObj("udt", udt) + (c, evPrim, evNull) => { + s"$evPrim = UTF8String.fromString($udtRef.deserialize($c).toString());" + } case _ => (c, evPrim, evNull) => s"$evPrim = UTF8String.fromString(String.valueOf($c));" } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index a08433ba794d9..cc8b600efa46a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -21,7 +21,7 @@ import scala.beans.{BeanInfo, BeanProperty} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.catalyst.expressions.{Cast, ExpressionEvalHelper, GenericInternalRow, Literal} import org.apache.spark.sql.catalyst.util.{ArrayData, GenericArrayData} import org.apache.spark.sql.execution.datasources.parquet.ParquetTest import org.apache.spark.sql.functions._ @@ -44,6 +44,8 @@ object UDT { case v: MyDenseVector => java.util.Arrays.equals(this.data, v.data) case _ => false } + + override def toString: String = data.mkString("(", ", ", ")") } private[sql] class MyDenseVectorUDT extends UserDefinedType[MyDenseVector] { @@ -143,7 +145,8 @@ private[spark] class ExampleSubTypeUDT extends UserDefinedType[IExampleSubType] override def userClass: Class[IExampleSubType] = classOf[IExampleSubType] } -class UserDefinedTypeSuite extends QueryTest with SharedSQLContext with ParquetTest { +class UserDefinedTypeSuite extends QueryTest with SharedSQLContext with ParquetTest + with ExpressionEvalHelper { import testImplicits._ private lazy val pointsRDD = Seq( @@ -304,4 +307,12 @@ class UserDefinedTypeSuite extends QueryTest with SharedSQLContext with ParquetT pointsRDD.except(pointsRDD2), Seq(Row(0.0, new UDT.MyDenseVector(Array(0.2, 2.0))))) } + + test("SPARK-23054 Cast UserDefinedType to string") { + val udt = new UDT.MyDenseVectorUDT() + val vector = new UDT.MyDenseVector(Array(1.0, 3.0, 5.0, 7.0, 9.0)) + val data = udt.serialize(vector) + val ret = Cast(Literal(data, udt), StringType, None) + checkEvaluation(ret, "(1.0, 3.0, 5.0, 7.0, 9.0)") + } } From 9a96bfc8bf021cb4b6c62fac6ce1bcf87affcd43 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 15 Jan 2018 12:06:56 +0800 Subject: [PATCH 
342/356] [SPARK-23049][SQL] `spark.sql.files.ignoreCorruptFiles` should work for ORC files ## What changes were proposed in this pull request? When `spark.sql.files.ignoreCorruptFiles=true`, we should ignore corrupted ORC files. ## How was this patch tested? Pass the Jenkins with a newly added test case. Author: Dongjoon Hyun Closes #20240 from dongjoon-hyun/SPARK-23049. --- .../execution/datasources/orc/OrcUtils.scala | 29 ++++++++---- .../datasources/orc/OrcQuerySuite.scala | 47 +++++++++++++++++++ .../parquet/ParquetQuerySuite.scala | 23 +++++++-- .../spark/sql/hive/orc/OrcFileFormat.scala | 8 +++- .../spark/sql/hive/orc/OrcFileOperator.scala | 28 +++++++++-- 5 files changed, 117 insertions(+), 18 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala index 13a23996f4ade..460194ba61c8b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala @@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.orc.{OrcFile, Reader, TypeDescription} +import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession @@ -50,23 +51,35 @@ object OrcUtils extends Logging { paths } - def readSchema(file: Path, conf: Configuration): Option[TypeDescription] = { + def readSchema(file: Path, conf: Configuration, ignoreCorruptFiles: Boolean) + : Option[TypeDescription] = { val fs = file.getFileSystem(conf) val readerOptions = OrcFile.readerOptions(conf).filesystem(fs) - val reader = OrcFile.createReader(file, readerOptions) - val schema = reader.getSchema - if (schema.getFieldNames.size == 0) { - None - } else { - Some(schema) + try { + val reader = OrcFile.createReader(file, readerOptions) + val schema = reader.getSchema + if (schema.getFieldNames.size == 0) { + None + } else { + Some(schema) + } + } catch { + case e: org.apache.orc.FileFormatException => + if (ignoreCorruptFiles) { + logWarning(s"Skipped the footer in the corrupted file: $file", e) + None + } else { + throw new SparkException(s"Could not read footer for file: $file", e) + } } } def readSchema(sparkSession: SparkSession, files: Seq[FileStatus]) : Option[StructType] = { + val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles val conf = sparkSession.sessionState.newHadoopConf() // TODO: We need to support merge schema. Please see SPARK-11412. 
- files.map(_.getPath).flatMap(readSchema(_, conf)).headOption.map { schema => + files.map(_.getPath).flatMap(readSchema(_, conf, ignoreCorruptFiles)).headOption.map { schema => logDebug(s"Reading schema from file $files, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema.toString).asInstanceOf[StructType] } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala index e00e057a18cc6..f58c331f33ca8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/orc/OrcQuerySuite.scala @@ -31,6 +31,7 @@ import org.apache.orc.OrcConf.COMPRESS import org.apache.orc.mapred.OrcStruct import org.apache.orc.mapreduce.OrcInputFormat +import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, RecordReaderIterator} @@ -531,6 +532,52 @@ abstract class OrcQueryTest extends OrcTest { val df = spark.read.orc(path1.getCanonicalPath, path2.getCanonicalPath) assert(df.count() == 20) } + + test("Enabling/disabling ignoreCorruptFiles") { + def testIgnoreCorruptFiles(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(1).toDF("a").write.orc(new Path(basePath, "first").toString) + spark.range(1, 2).toDF("a").write.orc(new Path(basePath, "second").toString) + spark.range(2, 3).toDF("a").write.json(new Path(basePath, "third").toString) + val df = spark.read.orc( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + checkAnswer(df, Seq(Row(0), Row(1))) + } + } + + def testIgnoreCorruptFilesWithoutSchemaInfer(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(1).toDF("a").write.orc(new Path(basePath, "first").toString) + spark.range(1, 2).toDF("a").write.orc(new Path(basePath, "second").toString) + spark.range(2, 3).toDF("a").write.json(new Path(basePath, "third").toString) + val df = spark.read.schema("a long").orc( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + checkAnswer(df, Seq(Row(0), Row(1))) + } + } + + withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") { + testIgnoreCorruptFiles() + testIgnoreCorruptFilesWithoutSchemaInfer() + } + + withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") { + val m1 = intercept[SparkException] { + testIgnoreCorruptFiles() + }.getMessage + assert(m1.contains("Could not read footer for file")) + val m2 = intercept[SparkException] { + testIgnoreCorruptFilesWithoutSchemaInfer() + }.getMessage + assert(m2.contains("Malformed ORC file")) + } + } } class OrcQuerySuite extends OrcQueryTest with SharedSQLContext { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 4c8c9ef6e0432..6ad88ed997ce7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -320,14 +320,27 @@ class ParquetQuerySuite extends QueryTest with 
ParquetTest with SharedSQLContext new Path(basePath, "first").toString, new Path(basePath, "second").toString, new Path(basePath, "third").toString) - checkAnswer( - df, - Seq(Row(0), Row(1))) + checkAnswer(df, Seq(Row(0), Row(1))) + } + } + + def testIgnoreCorruptFilesWithoutSchemaInfer(): Unit = { + withTempDir { dir => + val basePath = dir.getCanonicalPath + spark.range(1).toDF("a").write.parquet(new Path(basePath, "first").toString) + spark.range(1, 2).toDF("a").write.parquet(new Path(basePath, "second").toString) + spark.range(2, 3).toDF("a").write.json(new Path(basePath, "third").toString) + val df = spark.read.schema("a long").parquet( + new Path(basePath, "first").toString, + new Path(basePath, "second").toString, + new Path(basePath, "third").toString) + checkAnswer(df, Seq(Row(0), Row(1))) } } withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "true") { testIgnoreCorruptFiles() + testIgnoreCorruptFilesWithoutSchemaInfer() } withSQLConf(SQLConf.IGNORE_CORRUPT_FILES.key -> "false") { @@ -335,6 +348,10 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext testIgnoreCorruptFiles() } assert(exception.getMessage().contains("is not a Parquet file")) + val exception2 = intercept[SparkException] { + testIgnoreCorruptFilesWithoutSchemaInfer() + } + assert(exception2.getMessage().contains("is not a Parquet file")) } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index 95741c7b30289..237ed9bc05988 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -59,9 +59,11 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { + val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles OrcFileOperator.readSchema( files.map(_.getPath.toString), - Some(sparkSession.sessionState.newHadoopConf()) + Some(sparkSession.sessionState.newHadoopConf()), + ignoreCorruptFiles ) } @@ -129,6 +131,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable val broadcastedHadoopConf = sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf)) + val ignoreCorruptFiles = sparkSession.sessionState.conf.ignoreCorruptFiles (file: PartitionedFile) => { val conf = broadcastedHadoopConf.value.value @@ -138,7 +141,8 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable // SPARK-8501: Empty ORC files always have an empty schema stored in their footer. In this // case, `OrcFileOperator.readSchema` returns `None`, and we can't read the underlying file // using the given physical schema. Instead, we simply return an empty iterator. 
- val isEmptyFile = OrcFileOperator.readSchema(Seq(filePath.toString), Some(conf)).isEmpty + val isEmptyFile = + OrcFileOperator.readSchema(Seq(filePath.toString), Some(conf), ignoreCorruptFiles).isEmpty if (isEmptyFile) { Iterator.empty } else { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala index 5a3fcd7a759c0..80e44ca504356 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileOperator.scala @@ -17,11 +17,14 @@ package org.apache.spark.sql.hive.orc +import java.io.IOException + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector +import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser @@ -46,7 +49,10 @@ private[hive] object OrcFileOperator extends Logging { * create the result reader from that file. If no such file is found, it returns `None`. * @todo Needs to consider all files when schema evolution is taken into account. */ - def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { + def getFileReader(basePath: String, + config: Option[Configuration] = None, + ignoreCorruptFiles: Boolean = false) + : Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => @@ -65,16 +71,28 @@ private[hive] object OrcFileOperator extends Logging { } listOrcFiles(basePath, conf).iterator.map { path => - path -> OrcFile.createReader(fs, path) + val reader = try { + Some(OrcFile.createReader(fs, path)) + } catch { + case e: IOException => + if (ignoreCorruptFiles) { + logWarning(s"Skipped the footer in the corrupted file: $path", e) + None + } else { + throw new SparkException(s"Could not read footer for file: $path", e) + } + } + path -> reader }.collectFirst { - case (path, reader) if isWithNonEmptySchema(path, reader) => reader + case (path, Some(reader)) if isWithNonEmptySchema(path, reader) => reader } } - def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { + def readSchema(paths: Seq[String], conf: Option[Configuration], ignoreCorruptFiles: Boolean) + : Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. 
- paths.flatMap(getFileReader(_, conf)).headOption.map { reader => + paths.flatMap(getFileReader(_, conf, ignoreCorruptFiles)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") From 2b9fdfe7c5fdfe42f7fdc7141e5317fb214c4439 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 15 Jan 2018 07:45:39 +0000 Subject: [PATCH 343/356] Consistent versions --- dists/hadoop-palantir/pom.xml | 2 +- resource-managers/kubernetes/docker-minimal-bundle/pom.xml | 2 +- .../kubernetes/integration-tests-spark-jobs-helpers/pom.xml | 2 +- .../kubernetes/integration-tests-spark-jobs/pom.xml | 2 +- resource-managers/kubernetes/integration-tests/pom.xml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dists/hadoop-palantir/pom.xml b/dists/hadoop-palantir/pom.xml index 8fb0b874664b7..c3174e4191eae 100644 --- a/dists/hadoop-palantir/pom.xml +++ b/dists/hadoop-palantir/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../pom.xml diff --git a/resource-managers/kubernetes/docker-minimal-bundle/pom.xml b/resource-managers/kubernetes/docker-minimal-bundle/pom.xml index 85827a17a963f..80951d691c644 100644 --- a/resource-managers/kubernetes/docker-minimal-bundle/pom.xml +++ b/resource-managers/kubernetes/docker-minimal-bundle/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests-spark-jobs-helpers/pom.xml b/resource-managers/kubernetes/integration-tests-spark-jobs-helpers/pom.xml index c4fe21604fb44..91a7482546419 100644 --- a/resource-managers/kubernetes/integration-tests-spark-jobs-helpers/pom.xml +++ b/resource-managers/kubernetes/integration-tests-spark-jobs-helpers/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests-spark-jobs/pom.xml b/resource-managers/kubernetes/integration-tests-spark-jobs/pom.xml index db5ec86e0dfd8..eab95c5806eb8 100644 --- a/resource-managers/kubernetes/integration-tests-spark-jobs/pom.xml +++ b/resource-managers/kubernetes/integration-tests-spark-jobs/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../../pom.xml diff --git a/resource-managers/kubernetes/integration-tests/pom.xml b/resource-managers/kubernetes/integration-tests/pom.xml index 2f1587e0d6bb4..58b9dd00e5294 100644 --- a/resource-managers/kubernetes/integration-tests/pom.xml +++ b/resource-managers/kubernetes/integration-tests/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent_2.11 - 2.3.0-SNAPSHOT + 2.4.0-SNAPSHOT ../../../pom.xml From 1f2962c7cdccded9b87fa9983e07689c1f38f6e6 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 15 Jan 2018 07:48:04 +0000 Subject: [PATCH 344/356] one bouncycastle --- pom.xml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index 8f452abdfb4b8..950d9abf7a7ca 100644 --- a/pom.xml +++ b/pom.xml @@ -136,7 +136,7 @@ 2.7.5 1.13.0 2.3.0 - 1.54 + 1.58 9.4.8.v20171121 1.4.1 nohive @@ -411,6 +411,7 @@ org.bouncycastle bcprov-jdk15on + ${bouncycastle.version} - 1.58 - org.apache.hadoop hadoop-yarn-api From 91a6f1f767cc0937265370757e9fc9a8c7983d71 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 15 Jan 2018 
07:59:59 +0000 Subject: [PATCH 345/356] versions --- dev/deps/spark-deps-hadoop-palantir | 38 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-palantir b/dev/deps/spark-deps-hadoop-palantir index fee1fbb36ff63..452d82dbd937a 100644 --- a/dev/deps/spark-deps-hadoop-palantir +++ b/dev/deps/spark-deps-hadoop-palantir @@ -14,9 +14,9 @@ apacheds-kerberos-codec-2.0.0-M15.jar api-asn1-api-1.0.0-M20.jar api-util-1.0.0-M20.jar arpack_combined_all-0.1.jar -arrow-format-0.4.0.jar -arrow-memory-0.4.0.jar -arrow-vector-0.4.0.jar +arrow-format-0.8.0.jar +arrow-memory-0.8.0.jar +arrow-vector-0.8.0.jar automaton-1.11-8.jar avro-1.8.1.jar avro-ipc-1.8.1.jar @@ -25,8 +25,8 @@ aws-java-sdk-core-1.11.45.jar aws-java-sdk-kms-1.11.45.jar aws-java-sdk-s3-1.11.45.jar base64-2.3.8.jar -bcpkix-jdk15on-1.54.jar -bcprov-jdk15on-1.54.jar +bcpkix-jdk15on-1.58.jar +bcprov-jdk15on-1.58.jar bonecp-0.8.0.RELEASE.jar breeze-macros_2.11-0.13.2.jar breeze_2.11-0.13.2.jar @@ -41,7 +41,7 @@ commons-beanutils-core-1.8.0.jar commons-cli-1.2.jar commons-codec-1.10.jar commons-collections-3.2.2.jar -commons-compiler-3.0.0.jar +commons-compiler-3.0.8.jar commons-compress-1.8.1.jar commons-configuration-1.6.jar commons-crypto-1.0.0.jar @@ -101,10 +101,10 @@ hibernate-validator-5.2.2.Final.jar hk2-api-2.5.0-b32.jar hk2-locator-2.5.0-b32.jar hk2-utils-2.5.0-b32.jar -hppc-0.7.1.jar +hppc-0.7.2.jar htrace-core4-4.0.1-incubating.jar -httpclient-4.5.3.jar -httpcore-4.4.6.jar +httpclient-4.5.4.jar +httpcore-4.4.8.jar ion-java-1.0.1.jar ivy-2.4.0.jar jackson-annotations-2.6.7.jar @@ -125,8 +125,8 @@ jackson-module-jaxb-annotations-2.6.7.jar jackson-module-paranamer-2.7.9.jar jackson-module-scala_2.11-2.6.7.1.jar jackson-xc-1.9.13.jar -janino-3.0.0.jar -java-xmlbuilder-1.0.jar +janino-3.0.8.jar +java-xmlbuilder-1.1.jar javassist-3.20.0-GA.jar javax.annotation-api-1.2.jar javax.el-3.0.0.jar @@ -148,7 +148,7 @@ jersey-guava-2.25.1.jar jersey-media-jaxb-2.25.1.jar jersey-media-multipart-2.25.1.jar jersey-server-2.25.1.jar -jets3t-0.9.3.jar +jets3t-0.9.4.jar jetty-6.1.26.jar jetty-sslengine-6.1.26.jar jetty-util-6.1.26.jar @@ -165,17 +165,16 @@ jta-1.1.jar jtransforms-2.4.0.jar jul-to-slf4j-1.7.25.jar kryo-shaded-3.0.3.jar -kubernetes-client-2.2.13.jar -kubernetes-model-1.0.67.jar +kubernetes-client-3.0.0.jar +kubernetes-model-2.0.0.jar leveldbjni-all-1.8.jar libfb303-0.9.3.jar libthrift-0.9.3.jar log4j-1.2.17.jar -logging-interceptor-3.6.0.jar +logging-interceptor-3.8.1.jar lz4-java-1.4.0.jar machinist_2.11-0.6.1.jar macro-compat_2.11-1.1.1.jar -mail-1.4.7.jar metrics-core-3.2.5.jar metrics-graphite-3.2.5.jar metrics-influxdb-1.1.8.jar @@ -183,13 +182,12 @@ metrics-json-3.2.5.jar metrics-jvm-3.2.5.jar mimepull-1.9.6.jar minlog-1.3.0.jar -mx4j-3.0.2.jar netty-3.10.6.Final.jar -netty-all-4.0.50.Final.jar +netty-all-4.1.17.Final.jar nimbus-jose-jwt-3.9.jar objenesis-2.5.1.jar okhttp-2.7.5.jar -okhttp-3.9.0.jar +okhttp-3.9.1.jar okio-1.13.0.jar opencsv-2.3.jar orc-core-1.4.1-nohive.jar @@ -226,7 +224,7 @@ stax-api-1.0-2.jar stax-api-1.0.1.jar stream-2.7.0.jar stringtemplate-3.2.1.jar -univocity-parsers-2.5.4.jar +univocity-parsers-2.5.9.jar validation-api-1.1.0.Final.jar xbean-asm5-shaded-4.4.jar xmlenc-0.52.jar From b59808385cfe24ce768e5b3098b9034e64b99a5a Mon Sep 17 00:00:00 2001 From: Takeshi Yamamuro Date: Mon, 15 Jan 2018 16:26:52 +0800 Subject: [PATCH 346/356] [SPARK-23023][SQL] Cast field data to strings in showString ## What changes were proposed in this pull request? 
The current `Datset.showString` prints rows thru `RowEncoder` deserializers like; ``` scala> Seq(Seq(Seq(1, 2), Seq(3), Seq(4, 5, 6))).toDF("a").show(false) +------------------------------------------------------------+ |a | +------------------------------------------------------------+ |[WrappedArray(1, 2), WrappedArray(3), WrappedArray(4, 5, 6)]| +------------------------------------------------------------+ ``` This result is incorrect because the correct one is; ``` scala> Seq(Seq(Seq(1, 2), Seq(3), Seq(4, 5, 6))).toDF("a").show(false) +------------------------+ |a | +------------------------+ |[[1, 2], [3], [4, 5, 6]]| +------------------------+ ``` So, this pr fixed code in `showString` to cast field data to strings before printing. ## How was this patch tested? Added tests in `DataFrameSuite`. Author: Takeshi Yamamuro Closes #20214 from maropu/SPARK-23023. --- python/pyspark/sql/functions.py | 32 +++++++++---------- .../scala/org/apache/spark/sql/Dataset.scala | 21 ++++++------ .../org/apache/spark/sql/DataFrameSuite.scala | 28 ++++++++++++++++ .../org/apache/spark/sql/DatasetSuite.scala | 12 +++---- 4 files changed, 61 insertions(+), 32 deletions(-) diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index e1ad6590554cf..f7b3f29764040 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -1849,14 +1849,14 @@ def explode_outer(col): +---+----------+----+-----+ >>> df.select("id", "a_map", explode_outer("an_array")).show() - +---+-------------+----+ - | id| a_map| col| - +---+-------------+----+ - | 1|Map(x -> 1.0)| foo| - | 1|Map(x -> 1.0)| bar| - | 2| Map()|null| - | 3| null|null| - +---+-------------+----+ + +---+----------+----+ + | id| a_map| col| + +---+----------+----+ + | 1|[x -> 1.0]| foo| + | 1|[x -> 1.0]| bar| + | 2| []|null| + | 3| null|null| + +---+----------+----+ """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.explode_outer(_to_java_column(col)) @@ -1881,14 +1881,14 @@ def posexplode_outer(col): | 3| null|null|null| null| +---+----------+----+----+-----+ >>> df.select("id", "a_map", posexplode_outer("an_array")).show() - +---+-------------+----+----+ - | id| a_map| pos| col| - +---+-------------+----+----+ - | 1|Map(x -> 1.0)| 0| foo| - | 1|Map(x -> 1.0)| 1| bar| - | 2| Map()|null|null| - | 3| null|null|null| - +---+-------------+----+----+ + +---+----------+----+----+ + | id| a_map| pos| col| + +---+----------+----+----+ + | 1|[x -> 1.0]| 0| foo| + | 1|[x -> 1.0]| 1| bar| + | 2| []|null|null| + | 3| null|null|null| + +---+----------+----+----+ """ sc = SparkContext._active_spark_context jc = sc._jvm.functions.posexplode_outer(_to_java_column(col)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 77e571272920a..34f0ab5aa6699 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -237,13 +237,20 @@ class Dataset[T] private[sql]( private[sql] def showString( _numRows: Int, truncate: Int = 20, vertical: Boolean = false): String = { val numRows = _numRows.max(0).min(Int.MaxValue - 1) - val takeResult = toDF().take(numRows + 1) + val newDf = toDF() + val castCols = newDf.logicalPlan.output.map { col => + // Since binary types in top-level schema fields have a specific format to print, + // so we do not cast them to strings here. 
+ if (col.dataType == BinaryType) { + Column(col) + } else { + Column(col).cast(StringType) + } + } + val takeResult = newDf.select(castCols: _*).take(numRows + 1) val hasMoreData = takeResult.length > numRows val data = takeResult.take(numRows) - lazy val timeZone = - DateTimeUtils.getTimeZone(sparkSession.sessionState.conf.sessionLocalTimeZone) - // For array values, replace Seq and Array with square brackets // For cells that are beyond `truncate` characters, replace it with the // first `truncate-3` and "..." @@ -252,12 +259,6 @@ class Dataset[T] private[sql]( val str = cell match { case null => "null" case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]") - case array: Array[_] => array.mkString("[", ", ", "]") - case seq: Seq[_] => seq.mkString("[", ", ", "]") - case d: Date => - DateTimeUtils.dateToString(DateTimeUtils.fromJavaDate(d)) - case ts: Timestamp => - DateTimeUtils.timestampToString(DateTimeUtils.fromJavaTimestamp(ts), timeZone) case _ => cell.toString } if (truncate > 0 && str.length > truncate) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 5e4c1a6a484fb..33707080c1301 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1255,6 +1255,34 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { assert(testData.select($"*").showString(1, vertical = true) === expectedAnswer) } + test("SPARK-23023 Cast rows to strings in showString") { + val df1 = Seq(Seq(1, 2, 3, 4)).toDF("a") + assert(df1.showString(10) === + s"""+------------+ + || a| + |+------------+ + ||[1, 2, 3, 4]| + |+------------+ + |""".stripMargin) + val df2 = Seq(Map(1 -> "a", 2 -> "b")).toDF("a") + assert(df2.showString(10) === + s"""+----------------+ + || a| + |+----------------+ + ||[1 -> a, 2 -> b]| + |+----------------+ + |""".stripMargin) + val df3 = Seq(((1, "a"), 0), ((2, "b"), 0)).toDF("a", "b") + assert(df3.showString(10) === + s"""+------+---+ + || a| b| + |+------+---+ + ||[1, a]| 0| + ||[2, b]| 0| + |+------+---+ + |""".stripMargin) + } + test("SPARK-7327 show with empty dataFrame") { val expectedAnswer = """+---+-----+ ||key|value| diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 54893c184642b..49c59cf695dc1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -958,12 +958,12 @@ class DatasetSuite extends QueryTest with SharedSQLContext { ).toDS() val expected = - """+-------+ - || f| - |+-------+ - ||[foo,1]| - ||[bar,2]| - |+-------+ + """+--------+ + || f| + |+--------+ + ||[foo, 1]| + ||[bar, 2]| + |+--------+ |""".stripMargin checkShowString(ds, expected) From a38c887ac093d7cf343d807515147d87ca931ce7 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 15 Jan 2018 07:49:34 -0600 Subject: [PATCH 347/356] [SPARK-19550][BUILD][FOLLOW-UP] Remove MaxPermSize for sql module ## What changes were proposed in this pull request? Remove `MaxPermSize` for `sql` module ## How was this patch tested? Manually tested. Author: Yuming Wang Closes #20268 from wangyum/SPARK-19550-MaxPermSize. 
--- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/pom.xml b/sql/catalyst/pom.xml index 839b929abd3cb..7d23637e28342 100644 --- a/sql/catalyst/pom.xml +++ b/sql/catalyst/pom.xml @@ -134,7 +134,7 @@ org.scalatest scalatest-maven-plugin - -ea -Xmx4g -Xss4m -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m + -ea -Xmx4g -Xss4m -XX:ReservedCodeCacheSize=${CodeCacheSize} diff --git a/sql/core/pom.xml b/sql/core/pom.xml index 744daa6079779..ef41837f89d68 100644 --- a/sql/core/pom.xml +++ b/sql/core/pom.xml @@ -195,7 +195,7 @@ org.scalatest scalatest-maven-plugin - -ea -Xmx4g -Xss4m -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=512m + -ea -Xmx4g -Xss4m -XX:ReservedCodeCacheSize=${CodeCacheSize} From bd08a9e7af4137bddca638e627ad2ae531bce20f Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 15 Jan 2018 22:32:38 +0800 Subject: [PATCH 348/356] [SPARK-23070] Bump previousSparkVersion in MimaBuild.scala to be 2.2.0 ## What changes were proposed in this pull request? Bump previousSparkVersion in MimaBuild.scala to be 2.2.0 and add the missing exclusions to `v23excludes` in `MimaExcludes`. No item can be un-excluded in `v23excludes`. ## How was this patch tested? The existing tests. Author: gatorsmile Closes #20264 from gatorsmile/bump22. --- project/MimaBuild.scala | 2 +- project/MimaExcludes.scala | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/project/MimaBuild.scala b/project/MimaBuild.scala index 2ef0e7b40d940..adde213e361f0 100644 --- a/project/MimaBuild.scala +++ b/project/MimaBuild.scala @@ -88,7 +88,7 @@ object MimaBuild { def mimaSettings(sparkHome: File, projectRef: ProjectRef) = { val organization = "org.apache.spark" - val previousSparkVersion = "2.0.0" + val previousSparkVersion = "2.2.0" val project = projectRef.project val fullId = "spark-" + project + "_2.11" mimaDefaultSettings ++ diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 32eb31f495979..d35c50e1d00fe 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -102,7 +102,40 @@ object MimaExcludes { // [SPARK-21087] CrossValidator, TrainValidationSplit expose sub models after fitting: Scala ProblemFilters.exclude[FinalClassProblem]("org.apache.spark.ml.tuning.CrossValidatorModel$CrossValidatorModelWriter"), - ProblemFilters.exclude[FinalClassProblem]("org.apache.spark.ml.tuning.TrainValidationSplitModel$TrainValidationSplitModelWriter") + ProblemFilters.exclude[FinalClassProblem]("org.apache.spark.ml.tuning.TrainValidationSplitModel$TrainValidationSplitModelWriter"), + + // [SPARK-21728][CORE] Allow SparkSubmit to use Logging + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkSubmit.downloadFileList"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkSubmit.downloadFile"), + + // [SPARK-21714][CORE][YARN] Avoiding re-uploading remote resources in yarn client mode + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment"), + + // [SPARK-22324][SQL][PYTHON] Upgrade Arrow to 0.8.0 + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.network.util.AbstractFileRegion.transfered"), + + // [SPARK-20643][CORE] Add listener implementation to collect app state + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.TaskData.$default$5"), + + // [SPARK-20648][CORE] Port JobsTab and 
StageTab to the new UI backend + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.status.api.v1.TaskData.$default$12"), + + // [SPARK-21462][SS] Added batchId to StreamingQueryProgress.json + // [SPARK-21409][SS] Expose state store memory usage in SQL metrics and progress updates + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.sql.streaming.StateOperatorProgress.this"), + + // [SPARK-22278][SS] Expose current event time watermark and current processing time in GroupState + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.streaming.GroupState.getCurrentWatermarkMs"), + ProblemFilters.exclude[ReversedMissingMethodProblem]("org.apache.spark.sql.streaming.GroupState.getCurrentProcessingTimeMs"), + + // [SPARK-20542][ML][SQL] Add an API to Bucketizer that can bin multiple columns + ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasOutputCols.org$apache$spark$ml$param$shared$HasOutputCols$_setter_$outputCols_="), + + // [SPARK-18619][ML] Make QuantileDiscretizer/Bucketizer/StringIndexer/RFormula inherit from HasHandleInvalid + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.Bucketizer.getHandleInvalid"), + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.StringIndexer.getHandleInvalid"), + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.QuantileDiscretizer.getHandleInvalid"), + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.ml.feature.StringIndexerModel.getHandleInvalid") ) // Exclude rules for 2.2.x From 6c81fe227a6233f5d9665d2efadf8a1cf09f700d Mon Sep 17 00:00:00 2001 From: xubo245 <601450868@qq.com> Date: Mon, 15 Jan 2018 23:13:15 +0800 Subject: [PATCH 349/356] [SPARK-23035][SQL] Fix improper information of TempTableAlreadyExistsException ## What changes were proposed in this pull request? Problem: it throw TempTableAlreadyExistsException and output "Temporary table '$table' already exists" when we create temp view by using org.apache.spark.sql.catalyst.catalog.GlobalTempViewManager#create, it's improper. So fix improper information about TempTableAlreadyExistsException when create temp view: change "Temporary table" to "Temporary view" ## How was this patch tested? test("rename temporary view - destination table already exists, with: CREATE TEMPORARY view") test("rename temporary view - destination table with database name,with:CREATE TEMPORARY view") Author: xubo245 <601450868@qq.com> Closes #20227 from xubo245/fixDeprecated. 
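A quick way to see the changed message (a sketch assuming a fresh session; the statements and expected texts mirror the DDLSuite test below):
```
spark.sql("CREATE TEMPORARY VIEW t_temp AS SELECT 1, 2")
// Registering the same temp view name again (without OR REPLACE) fails; the
// exception previously read "Temporary table 't_temp' already exists" and now
// reads "Temporary view 't_temp' already exists".
spark.sql("CREATE TEMPORARY VIEW t_temp (c3 int, c4 string) USING JSON")
```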
--- .../analysis/AlreadyExistException.scala | 2 +- .../catalog/SessionCatalogSuite.scala | 6 +- .../spark/sql/execution/SQLViewSuite.scala | 2 +- .../sql/execution/command/DDLSuite.scala | 75 ++++++++++++++++++- 4 files changed, 78 insertions(+), 7 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala index 57f7a80bedc6c..6d587abd8fd4d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/AlreadyExistException.scala @@ -31,7 +31,7 @@ class TableAlreadyExistsException(db: String, table: String) extends AnalysisException(s"Table or view '$table' already exists in database '$db'") class TempTableAlreadyExistsException(table: String) - extends AnalysisException(s"Temporary table '$table' already exists") + extends AnalysisException(s"Temporary view '$table' already exists") class PartitionAlreadyExistsException(db: String, table: String, spec: TablePartitionSpec) extends AnalysisException( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala index 95c87ffa20cb7..6abab0073cca3 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalogSuite.scala @@ -279,7 +279,7 @@ abstract class SessionCatalogSuite extends AnalysisTest { } } - test("create temp table") { + test("create temp view") { withBasicCatalog { catalog => val tempTable1 = Range(1, 10, 1, 10) val tempTable2 = Range(1, 20, 2, 10) @@ -288,11 +288,11 @@ abstract class SessionCatalogSuite extends AnalysisTest { assert(catalog.getTempView("tbl1") == Option(tempTable1)) assert(catalog.getTempView("tbl2") == Option(tempTable2)) assert(catalog.getTempView("tbl3").isEmpty) - // Temporary table already exists + // Temporary view already exists intercept[TempTableAlreadyExistsException] { catalog.createTempView("tbl1", tempTable1, overrideIfExists = false) } - // Temporary table already exists but we override it + // Temporary view already exists but we override it catalog.createTempView("tbl1", tempTable2, overrideIfExists = true) assert(catalog.getTempView("tbl1") == Option(tempTable2)) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala index 8c55758cfe38d..14082197ba0bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala @@ -293,7 +293,7 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils { sql("CREATE TEMPORARY VIEW testView AS SELECT id FROM jt") } - assert(e.message.contains("Temporary table") && e.message.contains("already exists")) + assert(e.message.contains("Temporary view") && e.message.contains("already exists")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index 2b4b7c137428a..6ca21b5aa1595 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -835,6 +835,31 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("rename temporary view - destination table with database name,with:CREATE TEMPORARY view") { + withTempView("view1") { + sql( + """ + |CREATE TEMPORARY VIEW view1 + |USING org.apache.spark.sql.sources.DDLScanSource + |OPTIONS ( + | From '1', + | To '10', + | Table 'test1' + |) + """.stripMargin) + + val e = intercept[AnalysisException] { + sql("ALTER TABLE view1 RENAME TO default.tab2") + } + assert(e.getMessage.contains( + "RENAME TEMPORARY VIEW from '`view1`' to '`default`.`tab2`': " + + "cannot specify database name 'default' in the destination table")) + + val catalog = spark.sessionState.catalog + assert(catalog.listTables("default") == Seq(TableIdentifier("view1"))) + } + } + test("rename temporary view") { withTempView("tab1", "tab2") { spark.range(10).createOrReplaceTempView("tab1") @@ -883,6 +908,42 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } } + test("rename temporary view - destination table already exists, with: CREATE TEMPORARY view") { + withTempView("view1", "view2") { + sql( + """ + |CREATE TEMPORARY VIEW view1 + |USING org.apache.spark.sql.sources.DDLScanSource + |OPTIONS ( + | From '1', + | To '10', + | Table 'test1' + |) + """.stripMargin) + + sql( + """ + |CREATE TEMPORARY VIEW view2 + |USING org.apache.spark.sql.sources.DDLScanSource + |OPTIONS ( + | From '1', + | To '10', + | Table 'test1' + |) + """.stripMargin) + + val e = intercept[AnalysisException] { + sql("ALTER TABLE view1 RENAME TO view2") + } + assert(e.getMessage.contains( + "RENAME TEMPORARY VIEW from '`view1`' to '`view2`': destination table already exists")) + + val catalog = spark.sessionState.catalog + assert(catalog.listTables("default") == + Seq(TableIdentifier("view1"), TableIdentifier("view2"))) + } + } + test("alter table: bucketing is not supported") { val catalog = spark.sessionState.catalog val tableIdent = TableIdentifier("tab1", Some("dbx")) @@ -1728,12 +1789,22 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils { } test("block creating duplicate temp table") { - withView("t_temp") { + withTempView("t_temp") { sql("CREATE TEMPORARY VIEW t_temp AS SELECT 1, 2") val e = intercept[TempTableAlreadyExistsException] { sql("CREATE TEMPORARY TABLE t_temp (c3 int, c4 string) USING JSON") }.getMessage - assert(e.contains("Temporary table 't_temp' already exists")) + assert(e.contains("Temporary view 't_temp' already exists")) + } + } + + test("block creating duplicate temp view") { + withTempView("t_temp") { + sql("CREATE TEMPORARY VIEW t_temp AS SELECT 1, 2") + val e = intercept[TempTableAlreadyExistsException] { + sql("CREATE TEMPORARY VIEW t_temp (c3 int, c4 string) USING JSON") + }.getMessage + assert(e.contains("Temporary view 't_temp' already exists")) } } From b8380fdaa7b63d33c8cfd75e9e727698aac489c1 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 15 Jan 2018 14:48:43 -0500 Subject: [PATCH 350/356] Fixes --- circle.yml | 4 +- .../org/apache/spark/deploy/SparkSubmit.scala | 4 +- dev/deps/spark-deps-hadoop-palantir | 234 +++++++++++++++++ dev/mima | 2 +- project/MimaExcludes.scala | 7 + project/SparkBuild.scala | 4 +- .../org/apache/spark/deploy/k8s/Config.scala | 219 ---------------- .../apache/spark/deploy/k8s/Constants.scala | 88 ------- .../deploy/k8s/InitContainerBootstrap.scala | 120 --------- .../spark/deploy/k8s/KubernetesUtils.scala | 114 -------- 
.../deploy/k8s/MountSecretsBootstrap.scala | 72 ----- .../deploy/k8s/SparkPodInitContainer.scala | 116 --------- .../k8s/submit/DriverConfigOrchestrator.scala | 170 ------------ .../submit/KubernetesClientApplication.scala | 238 ----------------- .../k8s/submit/KubernetesDriverSpec.scala | 47 ---- .../k8s/submit/LoggingPodStatusWatcher.scala | 6 +- .../steps/BasicDriverConfigurationStep.scala | 161 ------------ .../steps/DependencyResolutionStep.scala | 65 ----- .../steps/DriverConfigurationStep.scala | 30 --- .../DriverInitContainerBootstrapStep.scala | 95 ------- .../DriverKubernetesCredentialsStep.scala | 245 ------------------ .../submit/steps/DriverMountSecretsStep.scala | 38 --- .../steps/DriverServiceBootstrapStep.scala | 104 -------- .../BasicInitContainerConfigurationStep.scala | 67 ----- .../InitContainerConfigOrchestrator.scala | 79 ------ .../InitContainerConfigurationStep.scala | 25 -- .../InitContainerMountSecretsStep.scala | 36 --- .../initcontainer/InitContainerSpec.scala | 37 --- .../KubernetesClusterSchedulerBackend.scala | 3 +- .../spark/deploy/k8s/SecretVolumeUtils.scala | 36 --- .../k8s/SparkPodInitContainerSuite.scala | 86 ------ .../DriverConfigOrchestratorSuite.scala | 127 --------- .../BasicDriverConfigurationStepSuite.scala | 118 --------- .../steps/DependencyResolutionStepSuite.scala | 81 ------ ...riverInitContainerBootstrapStepSuite.scala | 160 ------------ ...DriverKubernetesCredentialsStepSuite.scala | 153 ----------- .../steps/DriverMountSecretsStepSuite.scala | 49 ---- .../DriverServiceBootstrapStepSuite.scala | 180 ------------- ...cInitContainerConfigurationStepSuite.scala | 95 ------- ...InitContainerConfigOrchestratorSuite.scala | 80 ------ .../InitContainerMountSecretsStepSuite.scala | 52 ---- .../src/main/dockerfiles/spark/Dockerfile | 52 ---- .../src/main/dockerfiles/spark/entrypoint.sh | 97 ------- 43 files changed, 252 insertions(+), 3544 deletions(-) create mode 100644 dev/deps/spark-deps-hadoop-palantir delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverSpec.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala delete mode 100644 
resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStep.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStep.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStep.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestrator.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigurationStep.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala delete mode 100644 resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerSpec.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStepSuite.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStepSuite.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStepSuite.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStepSuite.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala delete mode 100644 resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala delete mode 100644 resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile delete mode 100755 resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh diff --git a/circle.yml b/circle.yml index 58bb67f0c11aa..00142165d6f8f 100644 --- a/circle.yml +++ b/circle.yml @@ -43,13 +43,13 @@ dependencies: # Copy contents into current build directory rsync --info=stats2,misc1,flist0 -a 
build_classes/ . fi - - ./build/mvn -DskipTests -Phadoop-cloud -Phadoop-palantir -Pkinesis-asl -Pkubernetes -Pyarn -Phive -Psparkr install + - set -o pipefail && ./build/mvn -DskipTests -Phadoop-cloud -Phadoop-palantir -Pkinesis-asl -Pkubernetes -Pyarn -Phive -Psparkr install | tee -a "$CIRCLE_ARTIFACTS/mvn-install.log" # Copy all of */target/scala_2.11/classes to build_classes/ - > rsync --info=stats2,misc1,flist0 -a --delete-excluded --prune-empty-dirs --exclude build_classes/ --exclude 'target/streams' --exclude 'assembly/target' --exclude 'common/network-yarn/target' --exclude 'examples/target' --exclude '***/*.jar' --include 'target/***' --include '**/' --exclude '*' . build_classes/ - | # Make sbt fetch all the external deps to ~/.ivy2 so it gets cached - ./build/sbt -Phadoop-cloud -Phadoop-palantir -Pkinesis-asl -Pkubernetes -Pyarn -Phive -Psparkr externalDependencyClasspath + set -o pipefail && ./build/sbt -Phadoop-cloud -Phadoop-palantir -Pkinesis-asl -Pkubernetes -Pyarn -Phive -Psparkr externalDependencyClasspath | tee -a "$CIRCLE_ARTIFACTS/sbt-classpath.log" cache_directories: - "build_classes" - "build" diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 33932a4833906..890da7ff9b23c 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -99,7 +99,7 @@ object SparkSubmit extends CommandLineUtils with Logging { private[deploy] val REST_CLUSTER_SUBMIT_CLASS = classOf[RestSubmissionClientApp].getName() private[deploy] val STANDALONE_CLUSTER_SUBMIT_CLASS = classOf[ClientApp].getName() private[deploy] val KUBERNETES_CLUSTER_SUBMIT_CLASS = - "org.apache.spark.deploy.k8s.submit.KubernetesClientApplication" + "org.apache.spark.deploy.k8s.submit.Client" // scalastyle:off println private[spark] def printVersionAndExit(): Unit = { @@ -324,8 +324,6 @@ object SparkSubmit extends CommandLineUtils with Logging { printErrorAndExit("Python applications are currently not supported for Kubernetes.") case (KUBERNETES, _) if args.isR => printErrorAndExit("R applications are currently not supported for Kubernetes.") - case (KUBERNETES, CLIENT) => - printErrorAndExit("Client mode is currently not supported for Kubernetes.") case (LOCAL, CLUSTER) => printErrorAndExit("Cluster deploy mode is not compatible with master \"local\"") case (_, CLUSTER) if isShell(args.primaryResource) => diff --git a/dev/deps/spark-deps-hadoop-palantir b/dev/deps/spark-deps-hadoop-palantir new file mode 100644 index 0000000000000..6d2b0a65589ff --- /dev/null +++ b/dev/deps/spark-deps-hadoop-palantir @@ -0,0 +1,234 @@ +JavaEWAH-0.3.2.jar +RoaringBitmap-0.6.43.jar +ST4-4.0.4.jar +activation-1.1.1.jar +aircompressor-0.8.jar +antlr-2.7.7.jar +antlr-runtime-3.4.jar +antlr4-runtime-4.7.jar +aopalliance-1.0.jar +aopalliance-repackaged-2.5.0-b32.jar +apache-log4j-extras-1.2.17.jar +apacheds-i18n-2.0.0-M15.jar +apacheds-kerberos-codec-2.0.0-M15.jar +api-asn1-api-1.0.0-M20.jar +api-util-1.0.0-M20.jar +arpack_combined_all-0.1.jar +arrow-format-0.8.0.jar +arrow-memory-0.8.0.jar +arrow-vector-0.8.0.jar +automaton-1.11-8.jar +avro-1.8.1.jar +avro-ipc-1.8.1.jar +avro-mapred-1.8.1-hadoop2.jar +aws-java-sdk-core-1.11.45.jar +aws-java-sdk-kms-1.11.45.jar +aws-java-sdk-s3-1.11.45.jar +base64-2.3.8.jar +bcpkix-jdk15on-1.58.jar +bcprov-jdk15on-1.58.jar +bonecp-0.8.0.RELEASE.jar +breeze-macros_2.11-0.13.2.jar +breeze_2.11-0.13.2.jar 
+calcite-avatica-1.2.0-incubating.jar +calcite-core-1.2.0-incubating.jar +calcite-linq4j-1.2.0-incubating.jar +chill-java-0.8.4.jar +chill_2.11-0.8.4.jar +classmate-1.1.0.jar +commons-beanutils-1.9.3.jar +commons-beanutils-core-1.8.0.jar +commons-cli-1.2.jar +commons-codec-1.10.jar +commons-collections-3.2.2.jar +commons-compiler-3.0.8.jar +commons-compress-1.8.1.jar +commons-configuration-1.6.jar +commons-crypto-1.0.0.jar +commons-dbcp-1.4.jar +commons-digester-1.8.jar +commons-httpclient-3.1.jar +commons-io-2.4.jar +commons-lang-2.6.jar +commons-lang3-3.5.jar +commons-logging-1.2.jar +commons-math3-3.4.1.jar +commons-net-2.2.jar +commons-pool-1.6.jar +compress-lzf-1.0.3.jar +converter-jackson-2.3.0.jar +converter-scalars-2.3.0.jar +core-1.1.2.jar +curator-client-2.7.1.jar +curator-framework-2.7.1.jar +curator-recipes-2.7.1.jar +datanucleus-api-jdo-3.2.6.jar +datanucleus-core-3.2.10.jar +datanucleus-rdbms-3.2.9.jar +derby-10.12.1.1.jar +dropwizard-jackson-0.9.2.jar +dropwizard-lifecycle-0.9.2.jar +dropwizard-metrics-0.9.2.jar +dropwizard-metrics-influxdb-1.1.8.jar +dropwizard-util-0.9.2.jar +dropwizard-validation-0.9.2.jar +eigenbase-properties-1.1.5.jar +fastutil-7.0.13.jar +flatbuffers-1.2.0-3f79e055.jar +generex-1.0.1.jar +gson-2.2.4.jar +guava-14.0.1.jar +guice-3.0.jar +guice-servlet-3.0.jar +hadoop-annotations-2.8.2-palantir.2.jar +hadoop-auth-2.8.2-palantir.2.jar +hadoop-aws-2.8.2-palantir.2.jar +hadoop-client-2.8.2-palantir.2.jar +hadoop-common-2.8.2-palantir.2.jar +hadoop-hdfs-client-2.8.2-palantir.2.jar +hadoop-mapreduce-client-app-2.8.2-palantir.2.jar +hadoop-mapreduce-client-common-2.8.2-palantir.2.jar +hadoop-mapreduce-client-core-2.8.2-palantir.2.jar +hadoop-mapreduce-client-jobclient-2.8.2-palantir.2.jar +hadoop-mapreduce-client-shuffle-2.8.2-palantir.2.jar +hadoop-openstack-2.8.2-palantir.2.jar +hadoop-yarn-api-2.8.2-palantir.2.jar +hadoop-yarn-client-2.8.2-palantir.2.jar +hadoop-yarn-common-2.8.2-palantir.2.jar +hadoop-yarn-server-common-2.8.2-palantir.2.jar +hadoop-yarn-server-web-proxy-2.8.2-palantir.2.jar +hibernate-validator-5.2.2.Final.jar +hk2-api-2.5.0-b32.jar +hk2-locator-2.5.0-b32.jar +hk2-utils-2.5.0-b32.jar +hppc-0.7.2.jar +htrace-core4-4.0.1-incubating.jar +httpclient-4.5.4.jar +httpcore-4.4.8.jar +ion-java-1.0.1.jar +ivy-2.4.0.jar +jackson-annotations-2.6.7.jar +jackson-core-2.6.7.jar +jackson-core-asl-1.9.13.jar +jackson-databind-2.6.7.1.jar +jackson-dataformat-cbor-2.6.7.jar +jackson-dataformat-yaml-2.6.7.jar +jackson-datatype-guava-2.6.7.jar +jackson-datatype-jdk7-2.6.7.jar +jackson-datatype-joda-2.6.7.jar +jackson-jaxrs-1.9.13.jar +jackson-jaxrs-base-2.6.7.jar +jackson-jaxrs-json-provider-2.6.7.jar +jackson-mapper-asl-1.9.13.jar +jackson-module-afterburner-2.6.7.jar +jackson-module-jaxb-annotations-2.6.7.jar +jackson-module-paranamer-2.7.9.jar +jackson-module-scala_2.11-2.6.7.1.jar +jackson-xc-1.9.13.jar +janino-3.0.8.jar +java-xmlbuilder-1.1.jar +javassist-3.20.0-GA.jar +javax.annotation-api-1.2.jar +javax.el-3.0.0.jar +javax.inject-1.jar +javax.inject-2.5.0-b32.jar +javax.servlet-api-3.1.0.jar +javax.ws.rs-api-2.0.1.jar +javolution-5.5.1.jar +jaxb-api-2.2.2.jar +jboss-logging-3.2.1.Final.jar +jcip-annotations-1.0.jar +jcl-over-slf4j-1.7.25.jar +jdo-api-3.0.1.jar +jersey-client-2.25.1.jar +jersey-common-2.25.1.jar +jersey-container-servlet-2.25.1.jar +jersey-container-servlet-core-2.25.1.jar +jersey-guava-2.25.1.jar +jersey-media-jaxb-2.25.1.jar +jersey-media-multipart-2.25.1.jar +jersey-server-2.25.1.jar +jets3t-0.9.4.jar +jetty-6.1.26.jar 
+jetty-sslengine-6.1.26.jar +jetty-util-6.1.26.jar +jmespath-java-1.0.jar +joda-time-2.9.9.jar +jodd-core-3.5.2.jar +json-smart-1.1.1.jar +json4s-ast_2.11-3.5.3.jar +json4s-core_2.11-3.5.3.jar +json4s-jackson_2.11-3.5.3.jar +json4s-scalap_2.11-3.5.3.jar +jsp-api-2.1.jar +jsr305-3.0.1.jar +jta-1.1.jar +jtransforms-2.4.0.jar +jul-to-slf4j-1.7.25.jar +kryo-shaded-3.0.3.jar +kubernetes-client-3.0.0.jar +kubernetes-model-2.0.0.jar +leveldbjni-all-1.8.jar +libfb303-0.9.3.jar +libthrift-0.9.3.jar +log4j-1.2.17.jar +logging-interceptor-3.8.1.jar +lz4-java-1.4.0.jar +machinist_2.11-0.6.1.jar +macro-compat_2.11-1.1.1.jar +metrics-core-3.2.5.jar +metrics-graphite-3.2.5.jar +metrics-influxdb-1.1.8.jar +metrics-json-3.2.5.jar +metrics-jvm-3.2.5.jar +mimepull-1.9.6.jar +minlog-1.3.0.jar +netty-3.10.6.Final.jar +netty-all-4.1.17.Final.jar +nimbus-jose-jwt-3.9.jar +objenesis-2.5.1.jar +okhttp-2.7.5.jar +okhttp-3.9.1.jar +okio-1.13.0.jar +opencsv-2.3.jar +orc-core-1.4.1-nohive.jar +orc-mapreduce-1.4.1-nohive.jar +oro-2.0.8.jar +osgi-resource-locator-1.0.1.jar +paranamer-2.8.jar +parquet-column-1.9.1-palantir3.jar +parquet-common-1.9.1-palantir3.jar +parquet-encoding-1.9.1-palantir3.jar +parquet-format-2.3.1.jar +parquet-hadoop-1.9.1-palantir3.jar +parquet-hadoop-bundle-1.6.0.jar +parquet-jackson-1.9.1-palantir3.jar +protobuf-java-2.5.0.jar +py4j-0.10.6.jar +pyrolite-4.13.jar +retrofit-2.3.0.jar +scala-compiler-2.11.8.jar +scala-library-2.11.8.jar +scala-parser-combinators_2.11-1.0.4.jar +scala-reflect-2.11.8.jar +scala-xml_2.11-1.0.5.jar +shapeless_2.11-2.3.2.jar +slf4j-api-1.7.25.jar +slf4j-log4j12-1.7.25.jar +snakeyaml-1.15.jar +snappy-0.2.jar +snappy-java-1.1.2.6.jar +spire-macros_2.11-0.13.0.jar +spire_2.11-0.13.0.jar +stax-api-1.0-2.jar +stax-api-1.0.1.jar +stream-2.7.0.jar +stringtemplate-3.2.1.jar +univocity-parsers-2.5.9.jar +validation-api-1.1.0.Final.jar +xbean-asm5-shaded-4.4.jar +xmlenc-0.52.jar +xz-1.5.jar +zjsonpatch-0.3.0.jar +zookeeper-3.4.6.jar +zstd-jni-1.3.2-2.jar diff --git a/dev/mima b/dev/mima index cd2694ff4d3de..e6402fb3771f6 100755 --- a/dev/mima +++ b/dev/mima @@ -24,7 +24,7 @@ set -e FWDIR="$(cd "`dirname "$0"`"/..; pwd)" cd "$FWDIR" -SPARK_PROFILES="-Pmesos -Pkafka-0-8 -Pkubernetes -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" +SPARK_PROFILES="-Phadoop-palantir -Pmesos -Phadoop-cloud -Pkafka-0-8 -Pkubernetes -Pyarn -Pflume -Pspark-ganglia-lgpl -Pkinesis-asl -Phive-thriftserver -Phive" TOOLS_CLASSPATH="$(build/sbt -DcopyDependencies=false "export tools/fullClasspath" | tail -n1)" OLD_DEPS_CLASSPATH="$(build/sbt -DcopyDependencies=false $SPARK_PROFILES "export oldDeps/fullClasspath" | tail -n1)" diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 112e8f5e51cf2..7d75ef8ddb7a8 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -36,6 +36,13 @@ object MimaExcludes { // Exclude rules for 2.4.x lazy val v24excludes = v23excludes ++ Seq( + // Converted from case object to case class + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages#RetrieveSparkAppConfig.productElement"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages#RetrieveSparkAppConfig.productArity"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages#RetrieveSparkAppConfig.canEqual"), + 
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages#RetrieveSparkAppConfig.productIterator"), + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages#RetrieveSparkAppConfig.productPrefix"), + ProblemFilters.exclude[FinalMethodProblem]("org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages#RetrieveSparkAppConfig.toString") ) // Exclude rules for 2.3.x diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index e70c4cdf29dd1..a0ae82639034e 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -57,12 +57,12 @@ object BuildCommons { val optionallyEnabledProjects@Seq(kubernetes, mesos, yarn, streamingFlumeSink, streamingFlume, streamingKafka, sparkGangliaLgpl, streamingKinesisAsl, - dockerIntegrationTests, hadoopCloud) = + dockerIntegrationTests, hadoopCloud, _*) = Seq("kubernetes", "mesos", "yarn", "streaming-flume-sink", "streaming-flume", "streaming-kafka-0-8", "ganglia-lgpl", "streaming-kinesis-asl", "docker-integration-tests", "hadoop-cloud", - "kubernetes", "kubernetes-integration-tests", + "kubernetes-integration-tests", "kubernetes-integration-tests-spark-jobs", "kubernetes-integration-tests-spark-jobs-helpers", "kubernetes-docker-minimal-bundle" ).map(ProjectRef(buildLocation, _)) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala deleted file mode 100644 index 471196ac0e3f6..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s - -import java.util.concurrent.TimeUnit - -import org.apache.spark.internal.Logging -import org.apache.spark.internal.config.ConfigBuilder - -private[spark] object Config extends Logging { - - val KUBERNETES_NAMESPACE = - ConfigBuilder("spark.kubernetes.namespace") - .doc("The namespace that will be used for running the driver and executor pods.") - .stringConf - .createWithDefault("default") - - val CONTAINER_IMAGE = - ConfigBuilder("spark.kubernetes.container.image") - .doc("Container image to use for Spark containers. Individual container types " + - "(e.g. 
driver or executor) can also be configured to use different images if desired, " + - "by setting the container type-specific image name.") - .stringConf - .createOptional - - val DRIVER_CONTAINER_IMAGE = - ConfigBuilder("spark.kubernetes.driver.container.image") - .doc("Container image to use for the driver.") - .fallbackConf(CONTAINER_IMAGE) - - val EXECUTOR_CONTAINER_IMAGE = - ConfigBuilder("spark.kubernetes.executor.container.image") - .doc("Container image to use for the executors.") - .fallbackConf(CONTAINER_IMAGE) - - val CONTAINER_IMAGE_PULL_POLICY = - ConfigBuilder("spark.kubernetes.container.image.pullPolicy") - .doc("Kubernetes image pull policy. Valid values are Always, Never, and IfNotPresent.") - .stringConf - .checkValues(Set("Always", "Never", "IfNotPresent")) - .createWithDefault("IfNotPresent") - - val KUBERNETES_AUTH_DRIVER_CONF_PREFIX = - "spark.kubernetes.authenticate.driver" - val KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX = - "spark.kubernetes.authenticate.driver.mounted" - val OAUTH_TOKEN_CONF_SUFFIX = "oauthToken" - val OAUTH_TOKEN_FILE_CONF_SUFFIX = "oauthTokenFile" - val CLIENT_KEY_FILE_CONF_SUFFIX = "clientKeyFile" - val CLIENT_CERT_FILE_CONF_SUFFIX = "clientCertFile" - val CA_CERT_FILE_CONF_SUFFIX = "caCertFile" - - val KUBERNETES_SERVICE_ACCOUNT_NAME = - ConfigBuilder(s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.serviceAccountName") - .doc("Service account that is used when running the driver pod. The driver pod uses " + - "this service account when requesting executor pods from the API server. If specific " + - "credentials are given for the driver pod to use, the driver will favor " + - "using those credentials instead.") - .stringConf - .createOptional - - val KUBERNETES_DRIVER_LIMIT_CORES = - ConfigBuilder("spark.kubernetes.driver.limit.cores") - .doc("Specify the hard cpu limit for the driver pod") - .stringConf - .createOptional - - val KUBERNETES_EXECUTOR_LIMIT_CORES = - ConfigBuilder("spark.kubernetes.executor.limit.cores") - .doc("Specify the hard cpu limit for each executor pod") - .stringConf - .createOptional - - val KUBERNETES_DRIVER_POD_NAME = - ConfigBuilder("spark.kubernetes.driver.pod.name") - .doc("Name of the driver pod.") - .stringConf - .createOptional - - val KUBERNETES_EXECUTOR_POD_NAME_PREFIX = - ConfigBuilder("spark.kubernetes.executor.podNamePrefix") - .doc("Prefix to use in front of the executor pod names.") - .internal() - .stringConf - .createWithDefault("spark") - - val KUBERNETES_ALLOCATION_BATCH_SIZE = - ConfigBuilder("spark.kubernetes.allocation.batch.size") - .doc("Number of pods to launch at once in each round of executor allocation.") - .intConf - .checkValue(value => value > 0, "Allocation batch size should be a positive integer") - .createWithDefault(5) - - val KUBERNETES_ALLOCATION_BATCH_DELAY = - ConfigBuilder("spark.kubernetes.allocation.batch.delay") - .doc("Time to wait between each round of executor allocation.") - .timeConf(TimeUnit.MILLISECONDS) - .checkValue(value => value > 0, "Allocation batch delay must be a positive time value.") - .createWithDefaultString("1s") - - val KUBERNETES_EXECUTOR_LOST_REASON_CHECK_MAX_ATTEMPTS = - ConfigBuilder("spark.kubernetes.executor.lostCheck.maxAttempts") - .doc("Maximum number of attempts allowed for checking the reason of an executor loss " + - "before it is assumed that the executor failed.") - .intConf - .checkValue(value => value > 0, "Maximum attempts of checks of executor lost reason " + - "must be a positive integer") - .createWithDefault(10) - - val WAIT_FOR_APP_COMPLETION = - 
ConfigBuilder("spark.kubernetes.submission.waitAppCompletion") - .doc("In cluster mode, whether to wait for the application to finish before exiting the " + - "launcher process.") - .booleanConf - .createWithDefault(true) - - val REPORT_INTERVAL = - ConfigBuilder("spark.kubernetes.report.interval") - .doc("Interval between reports of the current app status in cluster mode.") - .timeConf(TimeUnit.MILLISECONDS) - .checkValue(interval => interval > 0, s"Logging interval must be a positive time value.") - .createWithDefaultString("1s") - - val JARS_DOWNLOAD_LOCATION = - ConfigBuilder("spark.kubernetes.mountDependencies.jarsDownloadDir") - .doc("Location to download jars to in the driver and executors. When using " + - "spark-submit, this directory must be empty and will be mounted as an empty directory " + - "volume on the driver and executor pod.") - .stringConf - .createWithDefault("/var/spark-data/spark-jars") - - val FILES_DOWNLOAD_LOCATION = - ConfigBuilder("spark.kubernetes.mountDependencies.filesDownloadDir") - .doc("Location to download files to in the driver and executors. When using " + - "spark-submit, this directory must be empty and will be mounted as an empty directory " + - "volume on the driver and executor pods.") - .stringConf - .createWithDefault("/var/spark-data/spark-files") - - val INIT_CONTAINER_IMAGE = - ConfigBuilder("spark.kubernetes.initContainer.image") - .doc("Image for the driver and executor's init-container for downloading dependencies.") - .fallbackConf(CONTAINER_IMAGE) - - val INIT_CONTAINER_MOUNT_TIMEOUT = - ConfigBuilder("spark.kubernetes.mountDependencies.timeout") - .doc("Timeout before aborting the attempt to download and unpack dependencies from remote " + - "locations into the driver and executor pods.") - .timeConf(TimeUnit.SECONDS) - .createWithDefault(300) - - val INIT_CONTAINER_MAX_THREAD_POOL_SIZE = - ConfigBuilder("spark.kubernetes.mountDependencies.maxSimultaneousDownloads") - .doc("Maximum number of remote dependencies to download simultaneously in a driver or " + - "executor pod.") - .intConf - .createWithDefault(5) - - val INIT_CONTAINER_REMOTE_JARS = - ConfigBuilder("spark.kubernetes.initContainer.remoteJars") - .doc("Comma-separated list of jar URIs to download in the init-container. This is " + - "calculated from spark.jars.") - .internal() - .stringConf - .createOptional - - val INIT_CONTAINER_REMOTE_FILES = - ConfigBuilder("spark.kubernetes.initContainer.remoteFiles") - .doc("Comma-separated list of file URIs to download in the init-container. This is " + - "calculated from spark.files.") - .internal() - .stringConf - .createOptional - - val INIT_CONTAINER_CONFIG_MAP_NAME = - ConfigBuilder("spark.kubernetes.initContainer.configMapName") - .doc("Name of the config map to use in the init-container that retrieves submitted files " + - "for the executor.") - .internal() - .stringConf - .createOptional - - val INIT_CONTAINER_CONFIG_MAP_KEY_CONF = - ConfigBuilder("spark.kubernetes.initContainer.configMapKey") - .doc("Key for the entry in the init container config map for submitted files that " + - "corresponds to the properties for this init-container.") - .internal() - .stringConf - .createOptional - - val KUBERNETES_AUTH_SUBMISSION_CONF_PREFIX = - "spark.kubernetes.authenticate.submission" - - val KUBERNETES_NODE_SELECTOR_PREFIX = "spark.kubernetes.node.selector." - - val KUBERNETES_DRIVER_LABEL_PREFIX = "spark.kubernetes.driver.label." - val KUBERNETES_DRIVER_ANNOTATION_PREFIX = "spark.kubernetes.driver.annotation." 
- val KUBERNETES_DRIVER_SECRETS_PREFIX = "spark.kubernetes.driver.secrets." - - val KUBERNETES_EXECUTOR_LABEL_PREFIX = "spark.kubernetes.executor.label." - val KUBERNETES_EXECUTOR_ANNOTATION_PREFIX = "spark.kubernetes.executor.annotation." - val KUBERNETES_EXECUTOR_SECRETS_PREFIX = "spark.kubernetes.executor.secrets." - - val KUBERNETES_DRIVER_ENV_KEY = "spark.kubernetes.driverEnv." -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala deleted file mode 100644 index 9411956996843..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Constants.scala +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s - -private[spark] object Constants { - - // Labels - val SPARK_APP_ID_LABEL = "spark-app-selector" - val SPARK_EXECUTOR_ID_LABEL = "spark-exec-id" - val SPARK_ROLE_LABEL = "spark-role" - val SPARK_POD_DRIVER_ROLE = "driver" - val SPARK_POD_EXECUTOR_ROLE = "executor" - - // Annotations - val SPARK_APP_NAME_ANNOTATION = "spark-app-name" - - // Credentials secrets - val DRIVER_CREDENTIALS_SECRETS_BASE_DIR = - "/mnt/secrets/spark-kubernetes-credentials" - val DRIVER_CREDENTIALS_CA_CERT_SECRET_NAME = "ca-cert" - val DRIVER_CREDENTIALS_CA_CERT_PATH = - s"$DRIVER_CREDENTIALS_SECRETS_BASE_DIR/$DRIVER_CREDENTIALS_CA_CERT_SECRET_NAME" - val DRIVER_CREDENTIALS_CLIENT_KEY_SECRET_NAME = "client-key" - val DRIVER_CREDENTIALS_CLIENT_KEY_PATH = - s"$DRIVER_CREDENTIALS_SECRETS_BASE_DIR/$DRIVER_CREDENTIALS_CLIENT_KEY_SECRET_NAME" - val DRIVER_CREDENTIALS_CLIENT_CERT_SECRET_NAME = "client-cert" - val DRIVER_CREDENTIALS_CLIENT_CERT_PATH = - s"$DRIVER_CREDENTIALS_SECRETS_BASE_DIR/$DRIVER_CREDENTIALS_CLIENT_CERT_SECRET_NAME" - val DRIVER_CREDENTIALS_OAUTH_TOKEN_SECRET_NAME = "oauth-token" - val DRIVER_CREDENTIALS_OAUTH_TOKEN_PATH = - s"$DRIVER_CREDENTIALS_SECRETS_BASE_DIR/$DRIVER_CREDENTIALS_OAUTH_TOKEN_SECRET_NAME" - val DRIVER_CREDENTIALS_SECRET_VOLUME_NAME = "kubernetes-credentials" - - // Default and fixed ports - val DEFAULT_DRIVER_PORT = 7078 - val DEFAULT_BLOCKMANAGER_PORT = 7079 - val DRIVER_PORT_NAME = "driver-rpc-port" - val BLOCK_MANAGER_PORT_NAME = "blockmanager" - val EXECUTOR_PORT_NAME = "executor" - - // Environment Variables - val ENV_EXECUTOR_PORT = "SPARK_EXECUTOR_PORT" - val ENV_DRIVER_URL = "SPARK_DRIVER_URL" - val ENV_EXECUTOR_CORES = "SPARK_EXECUTOR_CORES" - val ENV_EXECUTOR_MEMORY = "SPARK_EXECUTOR_MEMORY" - val ENV_APPLICATION_ID = "SPARK_APPLICATION_ID" - val ENV_EXECUTOR_ID = "SPARK_EXECUTOR_ID" - val ENV_EXECUTOR_POD_IP = "SPARK_EXECUTOR_POD_IP" - val ENV_MOUNTED_CLASSPATH = 
"SPARK_MOUNTED_CLASSPATH" - val ENV_JAVA_OPT_PREFIX = "SPARK_JAVA_OPT_" - val ENV_CLASSPATH = "SPARK_CLASSPATH" - val ENV_DRIVER_MAIN_CLASS = "SPARK_DRIVER_CLASS" - val ENV_DRIVER_ARGS = "SPARK_DRIVER_ARGS" - val ENV_DRIVER_JAVA_OPTS = "SPARK_DRIVER_JAVA_OPTS" - val ENV_DRIVER_BIND_ADDRESS = "SPARK_DRIVER_BIND_ADDRESS" - val ENV_DRIVER_MEMORY = "SPARK_DRIVER_MEMORY" - val ENV_MOUNTED_FILES_DIR = "SPARK_MOUNTED_FILES_DIR" - - // Bootstrapping dependencies with the init-container - val INIT_CONTAINER_DOWNLOAD_JARS_VOLUME_NAME = "download-jars-volume" - val INIT_CONTAINER_DOWNLOAD_FILES_VOLUME_NAME = "download-files-volume" - val INIT_CONTAINER_PROPERTIES_FILE_VOLUME = "spark-init-properties" - val INIT_CONTAINER_PROPERTIES_FILE_DIR = "/etc/spark-init" - val INIT_CONTAINER_PROPERTIES_FILE_NAME = "spark-init.properties" - val INIT_CONTAINER_PROPERTIES_FILE_PATH = - s"$INIT_CONTAINER_PROPERTIES_FILE_DIR/$INIT_CONTAINER_PROPERTIES_FILE_NAME" - val INIT_CONTAINER_SECRET_VOLUME_NAME = "spark-init-secret" - - // Miscellaneous - val KUBERNETES_MASTER_INTERNAL_URL = "https://kubernetes.default.svc" - val DRIVER_CONTAINER_NAME = "spark-kubernetes-driver" - val MEMORY_OVERHEAD_FACTOR = 0.10 - val MEMORY_OVERHEAD_MIN_MIB = 384L -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala deleted file mode 100644 index f6a57dfe00171..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/InitContainerBootstrap.scala +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s - -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model.{ContainerBuilder, EmptyDirVolumeSource, EnvVarBuilder, PodBuilder, VolumeMount, VolumeMountBuilder} - -import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ - -/** - * Bootstraps an init-container for downloading remote dependencies. This is separated out from - * the init-container steps API because this component can be used to bootstrap init-containers - * for both the driver and executors. - */ -private[spark] class InitContainerBootstrap( - initContainerImage: String, - imagePullPolicy: String, - jarsDownloadPath: String, - filesDownloadPath: String, - configMapName: String, - configMapKey: String, - sparkRole: String, - sparkConf: SparkConf) { - - /** - * Bootstraps an init-container that downloads dependencies to be used by a main container. 
- */ - def bootstrapInitContainer( - original: PodWithDetachedInitContainer): PodWithDetachedInitContainer = { - val sharedVolumeMounts = Seq[VolumeMount]( - new VolumeMountBuilder() - .withName(INIT_CONTAINER_DOWNLOAD_JARS_VOLUME_NAME) - .withMountPath(jarsDownloadPath) - .build(), - new VolumeMountBuilder() - .withName(INIT_CONTAINER_DOWNLOAD_FILES_VOLUME_NAME) - .withMountPath(filesDownloadPath) - .build()) - - val customEnvVarKeyPrefix = sparkRole match { - case SPARK_POD_DRIVER_ROLE => KUBERNETES_DRIVER_ENV_KEY - case SPARK_POD_EXECUTOR_ROLE => "spark.executorEnv." - case _ => throw new SparkException(s"$sparkRole is not a valid Spark pod role") - } - val customEnvVars = sparkConf.getAllWithPrefix(customEnvVarKeyPrefix).toSeq.map { - case (key, value) => - new EnvVarBuilder() - .withName(key) - .withValue(value) - .build() - } - - val initContainer = new ContainerBuilder(original.initContainer) - .withName("spark-init") - .withImage(initContainerImage) - .withImagePullPolicy(imagePullPolicy) - .addAllToEnv(customEnvVars.asJava) - .addNewVolumeMount() - .withName(INIT_CONTAINER_PROPERTIES_FILE_VOLUME) - .withMountPath(INIT_CONTAINER_PROPERTIES_FILE_DIR) - .endVolumeMount() - .addToVolumeMounts(sharedVolumeMounts: _*) - .addToArgs("init") - .addToArgs(INIT_CONTAINER_PROPERTIES_FILE_PATH) - .build() - - val podWithBasicVolumes = new PodBuilder(original.pod) - .editSpec() - .addNewVolume() - .withName(INIT_CONTAINER_PROPERTIES_FILE_VOLUME) - .withNewConfigMap() - .withName(configMapName) - .addNewItem() - .withKey(configMapKey) - .withPath(INIT_CONTAINER_PROPERTIES_FILE_NAME) - .endItem() - .endConfigMap() - .endVolume() - .addNewVolume() - .withName(INIT_CONTAINER_DOWNLOAD_JARS_VOLUME_NAME) - .withEmptyDir(new EmptyDirVolumeSource()) - .endVolume() - .addNewVolume() - .withName(INIT_CONTAINER_DOWNLOAD_FILES_VOLUME_NAME) - .withEmptyDir(new EmptyDirVolumeSource()) - .endVolume() - .endSpec() - .build() - - val mainContainer = new ContainerBuilder(original.mainContainer) - .addToVolumeMounts(sharedVolumeMounts: _*) - .addNewEnv() - .withName(ENV_MOUNTED_FILES_DIR) - .withValue(filesDownloadPath) - .endEnv() - .build() - - PodWithDetachedInitContainer( - podWithBasicVolumes, - initContainer, - mainContainer) - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala deleted file mode 100644 index 37331d8bbf9b7..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.deploy.k8s - -import java.io.File - -import io.fabric8.kubernetes.api.model.{Container, Pod, PodBuilder} - -import org.apache.spark.SparkConf -import org.apache.spark.util.Utils - -private[spark] object KubernetesUtils { - - /** - * Extract and parse Spark configuration properties with a given name prefix and - * return the result as a Map. Keys must not have more than one value. - * - * @param sparkConf Spark configuration - * @param prefix the given property name prefix - * @return a Map storing the configuration property keys and values - */ - def parsePrefixedKeyValuePairs( - sparkConf: SparkConf, - prefix: String): Map[String, String] = { - sparkConf.getAllWithPrefix(prefix).toMap - } - - def requireNandDefined(opt1: Option[_], opt2: Option[_], errMessage: String): Unit = { - opt1.foreach { _ => require(opt2.isEmpty, errMessage) } - } - - /** - * Append the given init-container to a pod's list of init-containers. - * - * @param originalPodSpec original specification of the pod - * @param initContainer the init-container to add to the pod - * @return the pod with the init-container added to the list of InitContainers - */ - def appendInitContainer(originalPodSpec: Pod, initContainer: Container): Pod = { - new PodBuilder(originalPodSpec) - .editOrNewSpec() - .addToInitContainers(initContainer) - .endSpec() - .build() - } - - /** - * For the given collection of file URIs, resolves them as follows: - * - File URIs with scheme file:// are resolved to the given download path. - * - File URIs with scheme local:// resolve to just the path of the URI. - * - Otherwise, the URIs are returned as-is. - */ - def resolveFileUris( - fileUris: Iterable[String], - fileDownloadPath: String): Iterable[String] = { - fileUris.map { uri => - resolveFileUri(uri, fileDownloadPath, false) - } - } - - /** - * If any file uri has any scheme other than local:// it is mapped as if the file - * was downloaded to the file download path. Otherwise, it is mapped to the path - * part of the URI. - */ - def resolveFilePaths(fileUris: Iterable[String], fileDownloadPath: String): Iterable[String] = { - fileUris.map { uri => - resolveFileUri(uri, fileDownloadPath, true) - } - } - - /** - * Get from a given collection of file URIs the ones that represent remote files. - */ - def getOnlyRemoteFiles(uris: Iterable[String]): Iterable[String] = { - uris.filter { uri => - val scheme = Utils.resolveURI(uri).getScheme - scheme != "file" && scheme != "local" - } - } - - private def resolveFileUri( - uri: String, - fileDownloadPath: String, - assumesDownloaded: Boolean): String = { - val fileUri = Utils.resolveURI(uri) - val fileScheme = Option(fileUri.getScheme).getOrElse("file") - fileScheme match { - case "local" => - fileUri.getPath - case _ => - if (assumesDownloaded || fileScheme == "file") { - val fileName = new File(fileUri.getPath).getName - s"$fileDownloadPath/$fileName" - } else { - uri - } - } - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala deleted file mode 100644 index c35e7db51d407..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/MountSecretsBootstrap.scala +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s - -import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, Pod, PodBuilder} - -/** - * Bootstraps a driver or executor container or an init-container with needed secrets mounted. - */ -private[spark] class MountSecretsBootstrap(secretNamesToMountPaths: Map[String, String]) { - - /** - * Add new secret volumes for the secrets specified in secretNamesToMountPaths into the given pod. - * - * @param pod the pod into which the secret volumes are being added. - * @return the updated pod with the secret volumes added. - */ - def addSecretVolumes(pod: Pod): Pod = { - var podBuilder = new PodBuilder(pod) - secretNamesToMountPaths.keys.foreach { name => - podBuilder = podBuilder - .editOrNewSpec() - .addNewVolume() - .withName(secretVolumeName(name)) - .withNewSecret() - .withSecretName(name) - .endSecret() - .endVolume() - .endSpec() - } - - podBuilder.build() - } - - /** - * Mounts Kubernetes secret volumes of the secrets specified in secretNamesToMountPaths into the - * given container. - * - * @param container the container into which the secret volumes are being mounted. - * @return the updated container with the secrets mounted. - */ - def mountSecrets(container: Container): Container = { - var containerBuilder = new ContainerBuilder(container) - secretNamesToMountPaths.foreach { case (name, path) => - containerBuilder = containerBuilder - .addNewVolumeMount() - .withName(secretVolumeName(name)) - .withMountPath(path) - .endVolumeMount() - } - - containerBuilder.build() - } - - private def secretVolumeName(secretName: String): String = { - secretName + "-volume" - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala deleted file mode 100644 index c0f08786b76a1..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/SparkPodInitContainer.scala +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s - -import java.io.File -import java.util.concurrent.TimeUnit - -import scala.concurrent.{ExecutionContext, Future} - -import org.apache.spark.{SecurityManager => SparkSecurityManager, SparkConf} -import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.internal.Logging -import org.apache.spark.util.{ThreadUtils, Utils} - -/** - * Process that fetches files from a resource staging server and/or arbitrary remote locations. - * - * The init-container can handle fetching files from any of those sources, but not all of the - * sources need to be specified. This allows for composing multiple instances of this container - * with different configurations for different download sources, or using the same container to - * download everything at once. - */ -private[spark] class SparkPodInitContainer( - sparkConf: SparkConf, - fileFetcher: FileFetcher) extends Logging { - - private val maxThreadPoolSize = sparkConf.get(INIT_CONTAINER_MAX_THREAD_POOL_SIZE) - private implicit val downloadExecutor = ExecutionContext.fromExecutorService( - ThreadUtils.newDaemonCachedThreadPool("download-executor", maxThreadPoolSize)) - - private val jarsDownloadDir = new File(sparkConf.get(JARS_DOWNLOAD_LOCATION)) - private val filesDownloadDir = new File(sparkConf.get(FILES_DOWNLOAD_LOCATION)) - - private val remoteJars = sparkConf.get(INIT_CONTAINER_REMOTE_JARS) - private val remoteFiles = sparkConf.get(INIT_CONTAINER_REMOTE_FILES) - - private val downloadTimeoutMinutes = sparkConf.get(INIT_CONTAINER_MOUNT_TIMEOUT) - - def run(): Unit = { - logInfo(s"Downloading remote jars: $remoteJars") - downloadFiles( - remoteJars, - jarsDownloadDir, - s"Remote jars download directory specified at $jarsDownloadDir does not exist " + - "or is not a directory.") - - logInfo(s"Downloading remote files: $remoteFiles") - downloadFiles( - remoteFiles, - filesDownloadDir, - s"Remote files download directory specified at $filesDownloadDir does not exist " + - "or is not a directory.") - - downloadExecutor.shutdown() - downloadExecutor.awaitTermination(downloadTimeoutMinutes, TimeUnit.MINUTES) - } - - private def downloadFiles( - filesCommaSeparated: Option[String], - downloadDir: File, - errMessage: String): Unit = { - filesCommaSeparated.foreach { files => - require(downloadDir.isDirectory, errMessage) - Utils.stringToSeq(files).foreach { file => - Future[Unit] { - fileFetcher.fetchFile(file, downloadDir) - } - } - } - } -} - -private class FileFetcher(sparkConf: SparkConf, securityManager: SparkSecurityManager) { - - def fetchFile(uri: String, targetDir: File): Unit = { - Utils.fetchFile( - url = uri, - targetDir = targetDir, - conf = sparkConf, - securityMgr = securityManager, - hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf), - timestamp = System.currentTimeMillis(), - useCache = false) - } -} - -object SparkPodInitContainer extends Logging { - - def main(args: Array[String]): Unit = { - logInfo("Starting init-container to download Spark application dependencies.") - val sparkConf = new SparkConf(true) - if (args.nonEmpty) { - Utils.loadDefaultSparkProperties(sparkConf, args(0)) - } - - val securityManager = new SparkSecurityManager(sparkConf) - val fileFetcher = new FileFetcher(sparkConf, securityManager) - new SparkPodInitContainer(sparkConf, fileFetcher).run() - logInfo("Finished downloading application 
dependencies.") - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala deleted file mode 100644 index c9cc300d65569..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestrator.scala +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit - -import java.util.UUID - -import com.google.common.primitives.Longs - -import org.apache.spark.SparkConf -import org.apache.spark.deploy.k8s.{KubernetesUtils, MountSecretsBootstrap} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.steps._ -import org.apache.spark.deploy.k8s.submit.steps.initcontainer.InitContainerConfigOrchestrator -import org.apache.spark.launcher.SparkLauncher -import org.apache.spark.util.SystemClock -import org.apache.spark.util.Utils - -/** - * Figures out and returns the complete ordered list of needed DriverConfigurationSteps to - * configure the Spark driver pod. The returned steps will be applied one by one in the given - * order to produce a final KubernetesDriverSpec that is used in KubernetesClientApplication - * to construct and create the driver pod. It uses the InitContainerConfigOrchestrator to - * configure the driver init-container if one is needed, i.e., when there are remote dependencies - * to localize. - */ -private[spark] class DriverConfigOrchestrator( - kubernetesAppId: String, - launchTime: Long, - mainAppResource: Option[MainAppResource], - appName: String, - mainClass: String, - appArgs: Array[String], - sparkConf: SparkConf) { - - // The resource name prefix is derived from the Spark application name, making it easy to connect - // the names of the Kubernetes resources from e.g. kubectl or the Kubernetes dashboard to the - // application the user submitted. 
- private val kubernetesResourceNamePrefix = { - val uuid = UUID.nameUUIDFromBytes(Longs.toByteArray(launchTime)).toString.replaceAll("-", "") - s"$appName-$uuid".toLowerCase.replaceAll("\\.", "-") - } - - private val imagePullPolicy = sparkConf.get(CONTAINER_IMAGE_PULL_POLICY) - private val initContainerConfigMapName = s"$kubernetesResourceNamePrefix-init-config" - private val jarsDownloadPath = sparkConf.get(JARS_DOWNLOAD_LOCATION) - private val filesDownloadPath = sparkConf.get(FILES_DOWNLOAD_LOCATION) - - def getAllConfigurationSteps: Seq[DriverConfigurationStep] = { - val driverCustomLabels = KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, - KUBERNETES_DRIVER_LABEL_PREFIX) - require(!driverCustomLabels.contains(SPARK_APP_ID_LABEL), "Label with key " + - s"$SPARK_APP_ID_LABEL is not allowed as it is reserved for Spark bookkeeping " + - "operations.") - require(!driverCustomLabels.contains(SPARK_ROLE_LABEL), "Label with key " + - s"$SPARK_ROLE_LABEL is not allowed as it is reserved for Spark bookkeeping " + - "operations.") - - val secretNamesToMountPaths = KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, - KUBERNETES_DRIVER_SECRETS_PREFIX) - - val allDriverLabels = driverCustomLabels ++ Map( - SPARK_APP_ID_LABEL -> kubernetesAppId, - SPARK_ROLE_LABEL -> SPARK_POD_DRIVER_ROLE) - - val initialSubmissionStep = new BasicDriverConfigurationStep( - kubernetesAppId, - kubernetesResourceNamePrefix, - allDriverLabels, - imagePullPolicy, - appName, - mainClass, - appArgs, - sparkConf) - - val serviceBootstrapStep = new DriverServiceBootstrapStep( - kubernetesResourceNamePrefix, - allDriverLabels, - sparkConf, - new SystemClock) - - val kubernetesCredentialsStep = new DriverKubernetesCredentialsStep( - sparkConf, kubernetesResourceNamePrefix) - - val additionalMainAppJar = if (mainAppResource.nonEmpty) { - val mayBeResource = mainAppResource.get match { - case JavaMainAppResource(resource) if resource != SparkLauncher.NO_RESOURCE => - Some(resource) - case _ => None - } - mayBeResource - } else { - None - } - - val sparkJars = sparkConf.getOption("spark.jars") - .map(_.split(",")) - .getOrElse(Array.empty[String]) ++ - additionalMainAppJar.toSeq - val sparkFiles = sparkConf.getOption("spark.files") - .map(_.split(",")) - .getOrElse(Array.empty[String]) - - val dependencyResolutionStep = if (sparkJars.nonEmpty || sparkFiles.nonEmpty) { - Seq(new DependencyResolutionStep( - sparkJars, - sparkFiles, - jarsDownloadPath, - filesDownloadPath)) - } else { - Nil - } - - val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { - Seq(new DriverMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) - } else { - Nil - } - - val initContainerBootstrapStep = if (existNonContainerLocalFiles(sparkJars ++ sparkFiles)) { - val orchestrator = new InitContainerConfigOrchestrator( - sparkJars, - sparkFiles, - jarsDownloadPath, - filesDownloadPath, - imagePullPolicy, - initContainerConfigMapName, - INIT_CONTAINER_PROPERTIES_FILE_NAME, - sparkConf) - val bootstrapStep = new DriverInitContainerBootstrapStep( - orchestrator.getAllConfigurationSteps, - initContainerConfigMapName, - INIT_CONTAINER_PROPERTIES_FILE_NAME) - - Seq(bootstrapStep) - } else { - Nil - } - - Seq( - initialSubmissionStep, - serviceBootstrapStep, - kubernetesCredentialsStep) ++ - dependencyResolutionStep ++ - mountSecretsStep ++ - initContainerBootstrapStep - } - - private def existNonContainerLocalFiles(files: Seq[String]): Boolean = { - files.exists { uri => - Utils.resolveURI(uri).getScheme != "local" - } - } 
-} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala deleted file mode 100644 index 5884348cb3e41..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesClientApplication.scala +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit - -import java.util.{Collections, UUID} - -import scala.collection.JavaConverters._ -import scala.collection.mutable -import scala.util.control.NonFatal - -import io.fabric8.kubernetes.api.model._ -import io.fabric8.kubernetes.client.KubernetesClient - -import org.apache.spark.SparkConf -import org.apache.spark.deploy.SparkApplication -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.SparkKubernetesClientFactory -import org.apache.spark.deploy.k8s.submit.steps.DriverConfigurationStep -import org.apache.spark.internal.Logging -import org.apache.spark.util.Utils - -/** - * Encapsulates arguments to the submission client. - * - * @param mainAppResource the main application resource if any - * @param mainClass the main class of the application to run - * @param driverArgs arguments to the driver - */ -private[spark] case class ClientArguments( - mainAppResource: Option[MainAppResource], - mainClass: String, - driverArgs: Array[String]) - -private[spark] object ClientArguments { - - def fromCommandLineArgs(args: Array[String]): ClientArguments = { - var mainAppResource: Option[MainAppResource] = None - var mainClass: Option[String] = None - val driverArgs = mutable.ArrayBuffer.empty[String] - - args.sliding(2, 2).toList.foreach { - case Array("--primary-java-resource", primaryJavaResource: String) => - mainAppResource = Some(JavaMainAppResource(primaryJavaResource)) - case Array("--main-class", clazz: String) => - mainClass = Some(clazz) - case Array("--arg", arg: String) => - driverArgs += arg - case other => - val invalid = other.mkString(" ") - throw new RuntimeException(s"Unknown arguments: $invalid") - } - - require(mainClass.isDefined, "Main class must be specified via --main-class") - - ClientArguments( - mainAppResource, - mainClass.get, - driverArgs.toArray) - } -} - -/** - * Submits a Spark application to run on Kubernetes by creating the driver pod and starting a - * watcher that monitors and logs the application status. Waits for the application to terminate if - * spark.kubernetes.submission.waitAppCompletion is true. 
- * - * @param submissionSteps steps that collectively configure the driver - * @param sparkConf the submission client Spark configuration - * @param kubernetesClient the client to talk to the Kubernetes API server - * @param waitForAppCompletion a flag indicating whether the client should wait for the application - * to complete - * @param appName the application name - * @param watcher a watcher that monitors and logs the application status - */ -private[spark] class Client( - submissionSteps: Seq[DriverConfigurationStep], - sparkConf: SparkConf, - kubernetesClient: KubernetesClient, - waitForAppCompletion: Boolean, - appName: String, - watcher: LoggingPodStatusWatcher) extends Logging { - - private val driverJavaOptions = sparkConf.get( - org.apache.spark.internal.config.DRIVER_JAVA_OPTIONS) - - /** - * Run command that initializes a DriverSpec that will be updated after each - * DriverConfigurationStep in the sequence that is passed in. The final KubernetesDriverSpec - * will be used to build the Driver Container, Driver Pod, and Kubernetes Resources - */ - def run(): Unit = { - var currentDriverSpec = KubernetesDriverSpec.initialSpec(sparkConf) - // submissionSteps contain steps necessary to take, to resolve varying - // client arguments that are passed in, created by orchestrator - for (nextStep <- submissionSteps) { - currentDriverSpec = nextStep.configureDriver(currentDriverSpec) - } - - val resolvedDriverJavaOpts = currentDriverSpec - .driverSparkConf - // Remove this as the options are instead extracted and set individually below using - // environment variables with prefix SPARK_JAVA_OPT_. - .remove(org.apache.spark.internal.config.DRIVER_JAVA_OPTIONS) - .getAll - .map { - case (confKey, confValue) => s"-D$confKey=$confValue" - } ++ driverJavaOptions.map(Utils.splitCommandString).getOrElse(Seq.empty) - val driverJavaOptsEnvs: Seq[EnvVar] = resolvedDriverJavaOpts.zipWithIndex.map { - case (option, index) => - new EnvVarBuilder() - .withName(s"$ENV_JAVA_OPT_PREFIX$index") - .withValue(option) - .build() - } - - val resolvedDriverContainer = new ContainerBuilder(currentDriverSpec.driverContainer) - .addAllToEnv(driverJavaOptsEnvs.asJava) - .build() - val resolvedDriverPod = new PodBuilder(currentDriverSpec.driverPod) - .editSpec() - .addToContainers(resolvedDriverContainer) - .endSpec() - .build() - - Utils.tryWithResource( - kubernetesClient - .pods() - .withName(resolvedDriverPod.getMetadata.getName) - .watch(watcher)) { _ => - val createdDriverPod = kubernetesClient.pods().create(resolvedDriverPod) - try { - if (currentDriverSpec.otherKubernetesResources.nonEmpty) { - val otherKubernetesResources = currentDriverSpec.otherKubernetesResources - addDriverOwnerReference(createdDriverPod, otherKubernetesResources) - kubernetesClient.resourceList(otherKubernetesResources: _*).createOrReplace() - } - } catch { - case NonFatal(e) => - kubernetesClient.pods().delete(createdDriverPod) - throw e - } - - if (waitForAppCompletion) { - logInfo(s"Waiting for application $appName to finish...") - watcher.awaitCompletion() - logInfo(s"Application $appName finished.") - } else { - logInfo(s"Deployed Spark application $appName into Kubernetes.") - } - } - } - - // Add a OwnerReference to the given resources making the driver pod an owner of them so when - // the driver pod is deleted, the resources are garbage collected. 
- private def addDriverOwnerReference(driverPod: Pod, resources: Seq[HasMetadata]): Unit = { - val driverPodOwnerReference = new OwnerReferenceBuilder() - .withName(driverPod.getMetadata.getName) - .withApiVersion(driverPod.getApiVersion) - .withUid(driverPod.getMetadata.getUid) - .withKind(driverPod.getKind) - .withController(true) - .build() - resources.foreach { resource => - val originalMetadata = resource.getMetadata - originalMetadata.setOwnerReferences(Collections.singletonList(driverPodOwnerReference)) - } - } -} - -/** - * Main class and entry point of application submission in KUBERNETES mode. - */ -private[spark] class KubernetesClientApplication extends SparkApplication { - - override def start(args: Array[String], conf: SparkConf): Unit = { - val parsedArguments = ClientArguments.fromCommandLineArgs(args) - run(parsedArguments, conf) - } - - private def run(clientArguments: ClientArguments, sparkConf: SparkConf): Unit = { - val namespace = sparkConf.get(KUBERNETES_NAMESPACE) - // For constructing the app ID, we can't use the Spark application name, as the app ID is going - // to be added as a label to group resources belonging to the same application. Label values are - // considerably restrictive, e.g. must be no longer than 63 characters in length. So we generate - // a unique app ID (captured by spark.app.id) in the format below. - val kubernetesAppId = s"spark-${UUID.randomUUID().toString.replaceAll("-", "")}" - val launchTime = System.currentTimeMillis() - val waitForAppCompletion = sparkConf.get(WAIT_FOR_APP_COMPLETION) - val appName = sparkConf.getOption("spark.app.name").getOrElse("spark") - // The master URL has been checked for validity already in SparkSubmit. - // We just need to get rid of the "k8s://" prefix here. - val master = sparkConf.get("spark.master").substring("k8s://".length) - val loggingInterval = if (waitForAppCompletion) Some(sparkConf.get(REPORT_INTERVAL)) else None - - val watcher = new LoggingPodStatusWatcherImpl(kubernetesAppId, loggingInterval) - - val orchestrator = new DriverConfigOrchestrator( - kubernetesAppId, - launchTime, - clientArguments.mainAppResource, - appName, - clientArguments.mainClass, - clientArguments.driverArgs, - sparkConf) - - Utils.tryWithResource(SparkKubernetesClientFactory.createKubernetesClient( - master, - Some(namespace), - KUBERNETES_AUTH_SUBMISSION_CONF_PREFIX, - sparkConf, - None, - None)) { kubernetesClient => - val client = new Client( - orchestrator.getAllConfigurationSteps, - sparkConf, - kubernetesClient, - waitForAppCompletion, - appName, - watcher) - client.run() - } - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverSpec.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverSpec.scala deleted file mode 100644 index db13f09387ef9..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/KubernetesDriverSpec.scala +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit - -import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, HasMetadata, Pod, PodBuilder} - -import org.apache.spark.SparkConf - -/** - * Represents the components and characteristics of a Spark driver. The driver can be considered - * as being comprised of the driver pod itself, any other Kubernetes resources that the driver - * pod depends on, and the SparkConf that should be supplied to the Spark application. The driver - * container should be operated on via the specific field of this case class as opposed to trying - * to edit the container directly on the pod. The driver container should be attached at the - * end of executing all submission steps. - */ -private[spark] case class KubernetesDriverSpec( - driverPod: Pod, - driverContainer: Container, - otherKubernetesResources: Seq[HasMetadata], - driverSparkConf: SparkConf) - -private[spark] object KubernetesDriverSpec { - def initialSpec(initialSparkConf: SparkConf): KubernetesDriverSpec = { - KubernetesDriverSpec( - // Set new metadata and a new spec so that submission steps can use - // PodBuilder#editMetadata() and/or PodBuilder#editSpec() safely. - new PodBuilder().withNewMetadata().endMetadata().withNewSpec().endSpec().build(), - new ContainerBuilder().build(), - Seq.empty[HasMetadata], - initialSparkConf.clone()) - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala index c645b008d736d..fdc6e0a61f73a 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/LoggingPodStatusWatcher.scala @@ -109,7 +109,7 @@ private[k8s] class LoggingPodStatusWatcherImpl( ("namespace", pod.getMetadata.getNamespace()), ("labels", pod.getMetadata.getLabels().asScala.mkString(", ")), ("pod uid", pod.getMetadata.getUid), - ("creation time", pod.getMetadata.getCreationTimestamp()), + ("creation time", pod.getMetadata.getCreationTimestamp.getTime), // spec details ("service account name", pod.getSpec.getServiceAccountName()), @@ -117,7 +117,7 @@ private[k8s] class LoggingPodStatusWatcherImpl( ("node name", pod.getSpec.getNodeName()), // status - ("start time", pod.getStatus.getStartTime), + ("start time", pod.getStatus.getStartTime.getTime), ("container images", pod.getStatus.getContainerStatuses() .asScala @@ -162,7 +162,7 @@ private[k8s] class LoggingPodStatusWatcherImpl( case running: ContainerStateRunning => Seq( ("Container state", "Running"), - ("Container started at", running.getStartedAt)) + ("Container started at", running.getStartedAt.getTime)) case waiting: ContainerStateWaiting => Seq( ("Container state", "Waiting"), diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala 
b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala deleted file mode 100644 index 164e2e5594778..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStep.scala +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model.{ContainerBuilder, EnvVarBuilder, EnvVarSourceBuilder, PodBuilder, QuantityBuilder} - -import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.KubernetesUtils -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec -import org.apache.spark.internal.config.{DRIVER_CLASS_PATH, DRIVER_MEMORY, DRIVER_MEMORY_OVERHEAD} - -/** - * Performs basic configuration for the driver pod. 
- */ -private[spark] class BasicDriverConfigurationStep( - kubernetesAppId: String, - resourceNamePrefix: String, - driverLabels: Map[String, String], - imagePullPolicy: String, - appName: String, - mainClass: String, - appArgs: Array[String], - sparkConf: SparkConf) extends DriverConfigurationStep { - - private val driverPodName = sparkConf - .get(KUBERNETES_DRIVER_POD_NAME) - .getOrElse(s"$resourceNamePrefix-driver") - - private val driverExtraClasspath = sparkConf.get(DRIVER_CLASS_PATH) - - private val driverContainerImage = sparkConf - .get(DRIVER_CONTAINER_IMAGE) - .getOrElse(throw new SparkException("Must specify the driver container image")) - - // CPU settings - private val driverCpuCores = sparkConf.getOption("spark.driver.cores").getOrElse("1") - private val driverLimitCores = sparkConf.get(KUBERNETES_DRIVER_LIMIT_CORES) - - // Memory settings - private val driverMemoryMiB = sparkConf.get(DRIVER_MEMORY) - private val driverMemoryString = sparkConf.get( - DRIVER_MEMORY.key, DRIVER_MEMORY.defaultValueString) - private val memoryOverheadMiB = sparkConf - .get(DRIVER_MEMORY_OVERHEAD) - .getOrElse(math.max((MEMORY_OVERHEAD_FACTOR * driverMemoryMiB).toInt, MEMORY_OVERHEAD_MIN_MIB)) - private val driverMemoryWithOverheadMiB = driverMemoryMiB + memoryOverheadMiB - - override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - val driverExtraClasspathEnv = driverExtraClasspath.map { classPath => - new EnvVarBuilder() - .withName(ENV_CLASSPATH) - .withValue(classPath) - .build() - } - - val driverCustomAnnotations = KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, KUBERNETES_DRIVER_ANNOTATION_PREFIX) - require(!driverCustomAnnotations.contains(SPARK_APP_NAME_ANNOTATION), - s"Annotation with key $SPARK_APP_NAME_ANNOTATION is not allowed as it is reserved for" + - " Spark bookkeeping operations.") - - val driverCustomEnvs = sparkConf.getAllWithPrefix(KUBERNETES_DRIVER_ENV_KEY).toSeq - .map { env => - new EnvVarBuilder() - .withName(env._1) - .withValue(env._2) - .build() - } - - val driverAnnotations = driverCustomAnnotations ++ Map(SPARK_APP_NAME_ANNOTATION -> appName) - - val nodeSelector = KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, KUBERNETES_NODE_SELECTOR_PREFIX) - - val driverCpuQuantity = new QuantityBuilder(false) - .withAmount(driverCpuCores) - .build() - val driverMemoryQuantity = new QuantityBuilder(false) - .withAmount(s"${driverMemoryMiB}Mi") - .build() - val driverMemoryLimitQuantity = new QuantityBuilder(false) - .withAmount(s"${driverMemoryWithOverheadMiB}Mi") - .build() - val maybeCpuLimitQuantity = driverLimitCores.map { limitCores => - ("cpu", new QuantityBuilder(false).withAmount(limitCores).build()) - } - - val driverContainer = new ContainerBuilder(driverSpec.driverContainer) - .withName(DRIVER_CONTAINER_NAME) - .withImage(driverContainerImage) - .withImagePullPolicy(imagePullPolicy) - .addAllToEnv(driverCustomEnvs.asJava) - .addToEnv(driverExtraClasspathEnv.toSeq: _*) - .addNewEnv() - .withName(ENV_DRIVER_MEMORY) - .withValue(driverMemoryString) - .endEnv() - .addNewEnv() - .withName(ENV_DRIVER_MAIN_CLASS) - .withValue(mainClass) - .endEnv() - .addNewEnv() - .withName(ENV_DRIVER_ARGS) - .withValue(appArgs.mkString(" ")) - .endEnv() - .addNewEnv() - .withName(ENV_DRIVER_BIND_ADDRESS) - .withValueFrom(new EnvVarSourceBuilder() - .withNewFieldRef("v1", "status.podIP") - .build()) - .endEnv() - .withNewResources() - .addToRequests("cpu", driverCpuQuantity) - .addToRequests("memory", driverMemoryQuantity) - 
.addToLimits("memory", driverMemoryLimitQuantity) - .addToLimits(maybeCpuLimitQuantity.toMap.asJava) - .endResources() - .addToArgs("driver") - .build() - - val baseDriverPod = new PodBuilder(driverSpec.driverPod) - .editOrNewMetadata() - .withName(driverPodName) - .addToLabels(driverLabels.asJava) - .addToAnnotations(driverAnnotations.asJava) - .endMetadata() - .withNewSpec() - .withRestartPolicy("Never") - .withNodeSelector(nodeSelector.asJava) - .endSpec() - .build() - - val resolvedSparkConf = driverSpec.driverSparkConf.clone() - .setIfMissing(KUBERNETES_DRIVER_POD_NAME, driverPodName) - .set("spark.app.id", kubernetesAppId) - .set(KUBERNETES_EXECUTOR_POD_NAME_PREFIX, resourceNamePrefix) - - driverSpec.copy( - driverPod = baseDriverPod, - driverSparkConf = resolvedSparkConf, - driverContainer = driverContainer) - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala deleted file mode 100644 index d4b83235b4e3b..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStep.scala +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import java.io.File - -import io.fabric8.kubernetes.api.model.ContainerBuilder - -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.KubernetesUtils -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec - -/** - * Step that configures the classpath, spark.jars, and spark.files for the driver given that the - * user may provide remote files or files with local:// schemes. 
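For reference, a simplified standalone sketch of the resolution rules this step relies on; the helpers below are hypothetical stand-ins rather than the actual `KubernetesUtils.resolveFileUris` / `resolveFilePaths` implementations, with their behaviour matched to the expectations in `DependencyResolutionStepSuite` further down in this patch.

```scala
import java.net.URI

object DependencyResolutionSketch {
  private def fileName(path: String): String = path.split("/").last

  // For spark.jars / spark.files: local:// URIs become plain container paths, file:// URIs are
  // assumed to have been staged into downloadPath, and remote URIs are left untouched.
  def resolveFileUri(uri: String, downloadPath: String): String = {
    val parsed = URI.create(uri)
    Option(parsed.getScheme) match {
      case Some("local")       => parsed.getPath
      case Some("file") | None => s"$downloadPath/${fileName(parsed.getPath)}"
      case _                   => uri
    }
  }

  // For the driver classpath: everything except local:// is expected at downloadPath.
  def resolveFilePath(uri: String, downloadPath: String): String = {
    val parsed = URI.create(uri)
    if (parsed.getScheme == "local") parsed.getPath
    else s"$downloadPath/${fileName(parsed.getPath)}"
  }

  def main(args: Array[String]): Unit = {
    val jars = Seq(
      "hdfs://localhost:9000/apps/jars/jar1.jar",
      "file:///home/user/apps/jars/jar2.jar",
      "local:///var/apps/jars/jar3.jar")
    println(jars.map(resolveFileUri(_, "/mnt/spark-data/jars")))
    // List(hdfs://localhost:9000/apps/jars/jar1.jar, /mnt/spark-data/jars/jar2.jar, /var/apps/jars/jar3.jar)
    println(jars.map(resolveFilePath(_, "/mnt/spark-data/jars")))
    // List(/mnt/spark-data/jars/jar1.jar, /mnt/spark-data/jars/jar2.jar, /var/apps/jars/jar3.jar)
  }
}
```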
- */ -private[spark] class DependencyResolutionStep( - sparkJars: Seq[String], - sparkFiles: Seq[String], - jarsDownloadPath: String, - filesDownloadPath: String) extends DriverConfigurationStep { - - override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - val resolvedSparkJars = KubernetesUtils.resolveFileUris(sparkJars, jarsDownloadPath) - val resolvedSparkFiles = KubernetesUtils.resolveFileUris(sparkFiles, filesDownloadPath) - - val sparkConf = driverSpec.driverSparkConf.clone() - if (resolvedSparkJars.nonEmpty) { - sparkConf.set("spark.jars", resolvedSparkJars.mkString(",")) - } - if (resolvedSparkFiles.nonEmpty) { - sparkConf.set("spark.files", resolvedSparkFiles.mkString(",")) - } - - val resolvedClasspath = KubernetesUtils.resolveFilePaths(sparkJars, jarsDownloadPath) - val resolvedDriverContainer = if (resolvedClasspath.nonEmpty) { - new ContainerBuilder(driverSpec.driverContainer) - .addNewEnv() - .withName(ENV_MOUNTED_CLASSPATH) - .withValue(resolvedClasspath.mkString(File.pathSeparator)) - .endEnv() - .build() - } else { - driverSpec.driverContainer - } - - driverSpec.copy( - driverContainer = resolvedDriverContainer, - driverSparkConf = sparkConf) - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala deleted file mode 100644 index 17614e040e587..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverConfigurationStep.scala +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec - -/** - * Represents a step in configuring the Spark driver pod. - */ -private[spark] trait DriverConfigurationStep { - - /** - * Apply some transformation to the previous state of the driver to add a new feature to it. - */ - def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStep.scala deleted file mode 100644 index 9fb3dafdda540..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStep.scala +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import java.io.StringWriter -import java.util.Properties - -import io.fabric8.kubernetes.api.model.{ConfigMap, ConfigMapBuilder, ContainerBuilder, HasMetadata} - -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.KubernetesUtils -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec -import org.apache.spark.deploy.k8s.submit.steps.initcontainer.{InitContainerConfigurationStep, InitContainerSpec} - -/** - * Configures the driver init-container that localizes remote dependencies into the driver pod. - * It applies the given InitContainerConfigurationSteps in the given order to produce a final - * InitContainerSpec that is then used to configure the driver pod with the init-container attached. - * It also builds a ConfigMap that will be mounted into the init-container. The ConfigMap carries - * configuration properties for the init-container. - */ -private[spark] class DriverInitContainerBootstrapStep( - steps: Seq[InitContainerConfigurationStep], - configMapName: String, - configMapKey: String) - extends DriverConfigurationStep { - - override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - var initContainerSpec = InitContainerSpec( - properties = Map.empty[String, String], - driverSparkConf = Map.empty[String, String], - initContainer = new ContainerBuilder().build(), - driverContainer = driverSpec.driverContainer, - driverPod = driverSpec.driverPod, - dependentResources = Seq.empty[HasMetadata]) - for (nextStep <- steps) { - initContainerSpec = nextStep.configureInitContainer(initContainerSpec) - } - - val configMap = buildConfigMap( - configMapName, - configMapKey, - initContainerSpec.properties) - val resolvedDriverSparkConf = driverSpec.driverSparkConf - .clone() - .set(INIT_CONTAINER_CONFIG_MAP_NAME, configMapName) - .set(INIT_CONTAINER_CONFIG_MAP_KEY_CONF, configMapKey) - .setAll(initContainerSpec.driverSparkConf) - val resolvedDriverPod = KubernetesUtils.appendInitContainer( - initContainerSpec.driverPod, initContainerSpec.initContainer) - - driverSpec.copy( - driverPod = resolvedDriverPod, - driverContainer = initContainerSpec.driverContainer, - driverSparkConf = resolvedDriverSparkConf, - otherKubernetesResources = - driverSpec.otherKubernetesResources ++ - initContainerSpec.dependentResources ++ - Seq(configMap)) - } - - private def buildConfigMap( - configMapName: String, - configMapKey: String, - config: Map[String, String]): ConfigMap = { - val properties = new Properties() - config.foreach { entry => - properties.setProperty(entry._1, entry._2) - } - val propertiesWriter = new StringWriter() - properties.store(propertiesWriter, - s"Java properties built from Kubernetes config map with name: $configMapName " + - s"and config map key: $configMapKey") - new 
ConfigMapBuilder() - .withNewMetadata() - .withName(configMapName) - .endMetadata() - .addToData(configMapKey, propertiesWriter.toString) - .build() - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStep.scala deleted file mode 100644 index ccc18908658f1..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStep.scala +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import java.io.File -import java.nio.charset.StandardCharsets - -import scala.collection.JavaConverters._ -import scala.language.implicitConversions - -import com.google.common.io.{BaseEncoding, Files} -import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder, Secret, SecretBuilder} - -import org.apache.spark.SparkConf -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec - -/** - * Mounts Kubernetes credentials into the driver pod. The driver will use such mounted credentials - * to request executors. 
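For reference, a minimal sketch of the precedence rule this step applies per credential, mirroring the `resolveSecretLocation` helper further down in the removed file; the canonical path used in the example is a placeholder, not the real constant.

```scala
object CredentialLocationSketch {
  // A credential file the user already mounted into the driver pod wins; otherwise a value
  // supplied at submission time is stored in a Secret and surfaced at a canonical in-pod path;
  // otherwise the corresponding property is simply left unset.
  def resolveSecretLocation(
      mountedUserSpecified: Option[String],
      valueMountedFromSubmitter: Option[String],
      mountedCanonicalLocation: String): Option[String] =
    mountedUserSpecified.orElse(valueMountedFromSubmitter.map(_ => mountedCanonicalLocation))

  def main(args: Array[String]): Unit = {
    val canonical = "/mnt/secrets/oauth-token"   // placeholder path, not the real constant
    println(resolveSecretLocation(Some("/etc/my-token"), Some("token-bytes"), canonical)) // Some(/etc/my-token)
    println(resolveSecretLocation(None, Some("token-bytes"), canonical))                  // Some(/mnt/secrets/oauth-token)
    println(resolveSecretLocation(None, None, canonical))                                 // None
  }
}
```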
- */ -private[spark] class DriverKubernetesCredentialsStep( - submissionSparkConf: SparkConf, - kubernetesResourceNamePrefix: String) extends DriverConfigurationStep { - - private val maybeMountedOAuthTokenFile = submissionSparkConf.getOption( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$OAUTH_TOKEN_FILE_CONF_SUFFIX") - private val maybeMountedClientKeyFile = submissionSparkConf.getOption( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX") - private val maybeMountedClientCertFile = submissionSparkConf.getOption( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX") - private val maybeMountedCaCertFile = submissionSparkConf.getOption( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX") - private val driverServiceAccount = submissionSparkConf.get(KUBERNETES_SERVICE_ACCOUNT_NAME) - - override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - val driverSparkConf = driverSpec.driverSparkConf.clone() - - val oauthTokenBase64 = submissionSparkConf - .getOption(s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$OAUTH_TOKEN_CONF_SUFFIX") - .map { token => - BaseEncoding.base64().encode(token.getBytes(StandardCharsets.UTF_8)) - } - val caCertDataBase64 = safeFileConfToBase64( - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX", - "Driver CA cert file") - val clientKeyDataBase64 = safeFileConfToBase64( - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX", - "Driver client key file") - val clientCertDataBase64 = safeFileConfToBase64( - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX", - "Driver client cert file") - - val driverSparkConfWithCredentialsLocations = setDriverPodKubernetesCredentialLocations( - driverSparkConf, - oauthTokenBase64, - caCertDataBase64, - clientKeyDataBase64, - clientCertDataBase64) - - val kubernetesCredentialsSecret = createCredentialsSecret( - oauthTokenBase64, - caCertDataBase64, - clientKeyDataBase64, - clientCertDataBase64) - - val driverPodWithMountedKubernetesCredentials = kubernetesCredentialsSecret.map { secret => - new PodBuilder(driverSpec.driverPod) - .editOrNewSpec() - .addNewVolume() - .withName(DRIVER_CREDENTIALS_SECRET_VOLUME_NAME) - .withNewSecret().withSecretName(secret.getMetadata.getName).endSecret() - .endVolume() - .endSpec() - .build() - }.getOrElse( - driverServiceAccount.map { account => - new PodBuilder(driverSpec.driverPod) - .editOrNewSpec() - .withServiceAccount(account) - .withServiceAccountName(account) - .endSpec() - .build() - }.getOrElse(driverSpec.driverPod) - ) - - val driverContainerWithMountedSecretVolume = kubernetesCredentialsSecret.map { secret => - new ContainerBuilder(driverSpec.driverContainer) - .addNewVolumeMount() - .withName(DRIVER_CREDENTIALS_SECRET_VOLUME_NAME) - .withMountPath(DRIVER_CREDENTIALS_SECRETS_BASE_DIR) - .endVolumeMount() - .build() - }.getOrElse(driverSpec.driverContainer) - - driverSpec.copy( - driverPod = driverPodWithMountedKubernetesCredentials, - otherKubernetesResources = - driverSpec.otherKubernetesResources ++ kubernetesCredentialsSecret.toSeq, - driverSparkConf = driverSparkConfWithCredentialsLocations, - driverContainer = driverContainerWithMountedSecretVolume) - } - - private def createCredentialsSecret( - driverOAuthTokenBase64: Option[String], - driverCaCertDataBase64: Option[String], - driverClientKeyDataBase64: Option[String], - driverClientCertDataBase64: Option[String]): Option[Secret] = { - val allSecretData = - resolveSecretData( - 
driverClientKeyDataBase64, - DRIVER_CREDENTIALS_CLIENT_KEY_SECRET_NAME) ++ - resolveSecretData( - driverClientCertDataBase64, - DRIVER_CREDENTIALS_CLIENT_CERT_SECRET_NAME) ++ - resolveSecretData( - driverCaCertDataBase64, - DRIVER_CREDENTIALS_CA_CERT_SECRET_NAME) ++ - resolveSecretData( - driverOAuthTokenBase64, - DRIVER_CREDENTIALS_OAUTH_TOKEN_SECRET_NAME) - - if (allSecretData.isEmpty) { - None - } else { - Some(new SecretBuilder() - .withNewMetadata() - .withName(s"$kubernetesResourceNamePrefix-kubernetes-credentials") - .endMetadata() - .withData(allSecretData.asJava) - .build()) - } - } - - private def setDriverPodKubernetesCredentialLocations( - driverSparkConf: SparkConf, - driverOauthTokenBase64: Option[String], - driverCaCertDataBase64: Option[String], - driverClientKeyDataBase64: Option[String], - driverClientCertDataBase64: Option[String]): SparkConf = { - val resolvedMountedOAuthTokenFile = resolveSecretLocation( - maybeMountedOAuthTokenFile, - driverOauthTokenBase64, - DRIVER_CREDENTIALS_OAUTH_TOKEN_PATH) - val resolvedMountedClientKeyFile = resolveSecretLocation( - maybeMountedClientKeyFile, - driverClientKeyDataBase64, - DRIVER_CREDENTIALS_CLIENT_KEY_PATH) - val resolvedMountedClientCertFile = resolveSecretLocation( - maybeMountedClientCertFile, - driverClientCertDataBase64, - DRIVER_CREDENTIALS_CLIENT_CERT_PATH) - val resolvedMountedCaCertFile = resolveSecretLocation( - maybeMountedCaCertFile, - driverCaCertDataBase64, - DRIVER_CREDENTIALS_CA_CERT_PATH) - - val sparkConfWithCredentialLocations = driverSparkConf - .setOption( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX", - resolvedMountedCaCertFile) - .setOption( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX", - resolvedMountedClientKeyFile) - .setOption( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX", - resolvedMountedClientCertFile) - .setOption( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$OAUTH_TOKEN_FILE_CONF_SUFFIX", - resolvedMountedOAuthTokenFile) - - // Redact all OAuth token values - sparkConfWithCredentialLocations - .getAll - .filter(_._1.endsWith(OAUTH_TOKEN_CONF_SUFFIX)).map(_._1) - .foreach { - sparkConfWithCredentialLocations.set(_, "") - } - sparkConfWithCredentialLocations - } - - private def safeFileConfToBase64(conf: String, fileType: String): Option[String] = { - submissionSparkConf.getOption(conf) - .map(new File(_)) - .map { file => - require(file.isFile, String.format("%s provided at %s does not exist or is not a file.", - fileType, file.getAbsolutePath)) - BaseEncoding.base64().encode(Files.toByteArray(file)) - } - } - - private def resolveSecretLocation( - mountedUserSpecified: Option[String], - valueMountedFromSubmitter: Option[String], - mountedCanonicalLocation: String): Option[String] = { - mountedUserSpecified.orElse(valueMountedFromSubmitter.map { _ => - mountedCanonicalLocation - }) - } - - /** - * Resolve a Kubernetes secret data entry from an optional client credential used by the - * driver to talk to the Kubernetes API server. - * - * @param userSpecifiedCredential the optional user-specified client credential. - * @param secretName name of the Kubernetes secret storing the client credential. - * @return a secret data entry in the form of a map from the secret name to the secret data, - * which may be empty if the user-specified credential is empty. 
- */ - private def resolveSecretData( - userSpecifiedCredential: Option[String], - secretName: String): Map[String, String] = { - userSpecifiedCredential.map { valueBase64 => - Map(secretName -> valueBase64) - }.getOrElse(Map.empty[String, String]) - } - - private implicit def augmentSparkConf(sparkConf: SparkConf): OptionSettableSparkConf = { - new OptionSettableSparkConf(sparkConf) - } -} - -private class OptionSettableSparkConf(sparkConf: SparkConf) { - def setOption(configEntry: String, option: Option[String]): SparkConf = { - option.foreach { opt => - sparkConf.set(configEntry, opt) - } - sparkConf - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala deleted file mode 100644 index 91e9a9f211335..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStep.scala +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import org.apache.spark.deploy.k8s.MountSecretsBootstrap -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec - -/** - * A driver configuration step for mounting user-specified secrets onto user-specified paths. - * - * @param bootstrap a utility actually handling mounting of the secrets. - */ -private[spark] class DriverMountSecretsStep( - bootstrap: MountSecretsBootstrap) extends DriverConfigurationStep { - - override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - val pod = bootstrap.addSecretVolumes(driverSpec.driverPod) - val container = bootstrap.mountSecrets(driverSpec.driverContainer) - driverSpec.copy( - driverPod = pod, - driverContainer = container - ) - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala deleted file mode 100644 index 34af7cde6c1a9..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStep.scala +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model.ServiceBuilder - -import org.apache.spark.SparkConf -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec -import org.apache.spark.internal.Logging -import org.apache.spark.util.Clock - -/** - * Allows the driver to be reachable by executor pods through a headless service. The service's - * ports should correspond to the ports that the executor will reach the pod at for RPC. - */ -private[spark] class DriverServiceBootstrapStep( - resourceNamePrefix: String, - driverLabels: Map[String, String], - sparkConf: SparkConf, - clock: Clock) extends DriverConfigurationStep with Logging { - - import DriverServiceBootstrapStep._ - - override def configureDriver(driverSpec: KubernetesDriverSpec): KubernetesDriverSpec = { - require(sparkConf.getOption(DRIVER_BIND_ADDRESS_KEY).isEmpty, - s"$DRIVER_BIND_ADDRESS_KEY is not supported in Kubernetes mode, as the driver's bind " + - "address is managed and set to the driver pod's IP address.") - require(sparkConf.getOption(DRIVER_HOST_KEY).isEmpty, - s"$DRIVER_HOST_KEY is not supported in Kubernetes mode, as the driver's hostname will be " + - "managed via a Kubernetes service.") - - val preferredServiceName = s"$resourceNamePrefix$DRIVER_SVC_POSTFIX" - val resolvedServiceName = if (preferredServiceName.length <= MAX_SERVICE_NAME_LENGTH) { - preferredServiceName - } else { - val randomServiceId = clock.getTimeMillis() - val shorterServiceName = s"spark-$randomServiceId$DRIVER_SVC_POSTFIX" - logWarning(s"Driver's hostname would preferably be $preferredServiceName, but this is " + - s"too long (must be <= $MAX_SERVICE_NAME_LENGTH characters). 
Falling back to use " + - s"$shorterServiceName as the driver service's name.") - shorterServiceName - } - - val driverPort = sparkConf.getInt("spark.driver.port", DEFAULT_DRIVER_PORT) - val driverBlockManagerPort = sparkConf.getInt( - org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT.key, DEFAULT_BLOCKMANAGER_PORT) - val driverService = new ServiceBuilder() - .withNewMetadata() - .withName(resolvedServiceName) - .endMetadata() - .withNewSpec() - .withClusterIP("None") - .withSelector(driverLabels.asJava) - .addNewPort() - .withName(DRIVER_PORT_NAME) - .withPort(driverPort) - .withNewTargetPort(driverPort) - .endPort() - .addNewPort() - .withName(BLOCK_MANAGER_PORT_NAME) - .withPort(driverBlockManagerPort) - .withNewTargetPort(driverBlockManagerPort) - .endPort() - .endSpec() - .build() - - val namespace = sparkConf.get(KUBERNETES_NAMESPACE) - val driverHostname = s"${driverService.getMetadata.getName}.$namespace.svc" - val resolvedSparkConf = driverSpec.driverSparkConf.clone() - .set(DRIVER_HOST_KEY, driverHostname) - .set("spark.driver.port", driverPort.toString) - .set( - org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT, driverBlockManagerPort) - - driverSpec.copy( - driverSparkConf = resolvedSparkConf, - otherKubernetesResources = driverSpec.otherKubernetesResources ++ Seq(driverService)) - } -} - -private[spark] object DriverServiceBootstrapStep { - val DRIVER_BIND_ADDRESS_KEY = org.apache.spark.internal.config.DRIVER_BIND_ADDRESS.key - val DRIVER_HOST_KEY = org.apache.spark.internal.config.DRIVER_HOST_ADDRESS.key - val DRIVER_SVC_POSTFIX = "-driver-svc" - val MAX_SERVICE_NAME_LENGTH = 63 -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStep.scala deleted file mode 100644 index 01469853dacc2..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStep.scala +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps.initcontainer - -import org.apache.spark.deploy.k8s.{InitContainerBootstrap, PodWithDetachedInitContainer} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.KubernetesUtils - -/** - * Performs basic configuration for the driver init-container with most of the work delegated to - * the given InitContainerBootstrap. 
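For reference, a standalone sketch of the remote-URI filtering this step depends on; `getOnlyRemoteFiles` here is a hypothetical stand-in for `KubernetesUtils.getOnlyRemoteFiles`, approximating "remote" (as the surrounding tests suggest) as any scheme other than `local` and `file`.

```scala
import java.net.URI

object RemoteFilesSketch {
  // Only URIs that still need to be fetched over the network are handed to the init-container
  // via INIT_CONTAINER_REMOTE_JARS / INIT_CONTAINER_REMOTE_FILES.
  def getOnlyRemoteFiles(uris: Seq[String]): Seq[String] =
    uris.filter { uri =>
      Option(URI.create(uri).getScheme).exists(s => s != "file" && s != "local")
    }

  def main(args: Array[String]): Unit = {
    val jars = Seq(
      "hdfs://localhost:9000/apps/jars/jar1.jar",
      "file:///home/user/apps/jars/jar2.jar",
      "local:///var/apps/jars/jar3.jar")
    println(getOnlyRemoteFiles(jars))   // List(hdfs://localhost:9000/apps/jars/jar1.jar)
  }
}
```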
- */ -private[spark] class BasicInitContainerConfigurationStep( - sparkJars: Seq[String], - sparkFiles: Seq[String], - jarsDownloadPath: String, - filesDownloadPath: String, - bootstrap: InitContainerBootstrap) - extends InitContainerConfigurationStep { - - override def configureInitContainer(spec: InitContainerSpec): InitContainerSpec = { - val remoteJarsToDownload = KubernetesUtils.getOnlyRemoteFiles(sparkJars) - val remoteFilesToDownload = KubernetesUtils.getOnlyRemoteFiles(sparkFiles) - val remoteJarsConf = if (remoteJarsToDownload.nonEmpty) { - Map(INIT_CONTAINER_REMOTE_JARS.key -> remoteJarsToDownload.mkString(",")) - } else { - Map() - } - val remoteFilesConf = if (remoteFilesToDownload.nonEmpty) { - Map(INIT_CONTAINER_REMOTE_FILES.key -> remoteFilesToDownload.mkString(",")) - } else { - Map() - } - - val baseInitContainerConfig = Map( - JARS_DOWNLOAD_LOCATION.key -> jarsDownloadPath, - FILES_DOWNLOAD_LOCATION.key -> filesDownloadPath) ++ - remoteJarsConf ++ - remoteFilesConf - - val bootstrapped = bootstrap.bootstrapInitContainer( - PodWithDetachedInitContainer( - spec.driverPod, - spec.initContainer, - spec.driverContainer)) - - spec.copy( - initContainer = bootstrapped.initContainer, - driverContainer = bootstrapped.mainContainer, - driverPod = bootstrapped.pod, - properties = spec.properties ++ baseInitContainerConfig) - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestrator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestrator.scala deleted file mode 100644 index f2c29c7ce1076..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestrator.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps.initcontainer - -import org.apache.spark.{SparkConf, SparkException} -import org.apache.spark.deploy.k8s.{InitContainerBootstrap, KubernetesUtils, MountSecretsBootstrap} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ - -/** - * Figures out and returns the complete ordered list of InitContainerConfigurationSteps required to - * configure the driver init-container. The returned steps will be applied in the given order to - * produce a final InitContainerSpec that is used to construct the driver init-container in - * DriverInitContainerBootstrapStep. This class is only used when an init-container is needed, i.e., - * when there are remote application dependencies to localize. 
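For reference, how the ordered step list returned by this orchestrator is consumed, mirroring the loop in `DriverInitContainerBootstrapStep` earlier in this patch; this is an illustrative sketch written against the `InitContainerConfigurationStep` and `InitContainerSpec` definitions removed below, not code from the patch itself.

```scala
// Each step refines the InitContainerSpec produced so far, so applying the whole ordered list
// is just a left fold over the steps.
def applySteps(
    initial: InitContainerSpec,
    steps: Seq[InitContainerConfigurationStep]): InitContainerSpec =
  steps.foldLeft(initial) { (spec, step) => step.configureInitContainer(spec) }
```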
- */ -private[spark] class InitContainerConfigOrchestrator( - sparkJars: Seq[String], - sparkFiles: Seq[String], - jarsDownloadPath: String, - filesDownloadPath: String, - imagePullPolicy: String, - configMapName: String, - configMapKey: String, - sparkConf: SparkConf) { - - private val initContainerImage = sparkConf - .get(INIT_CONTAINER_IMAGE) - .getOrElse(throw new SparkException( - "Must specify the init-container image when there are remote dependencies")) - - def getAllConfigurationSteps: Seq[InitContainerConfigurationStep] = { - val initContainerBootstrap = new InitContainerBootstrap( - initContainerImage, - imagePullPolicy, - jarsDownloadPath, - filesDownloadPath, - configMapName, - configMapKey, - SPARK_POD_DRIVER_ROLE, - sparkConf) - val baseStep = new BasicInitContainerConfigurationStep( - sparkJars, - sparkFiles, - jarsDownloadPath, - filesDownloadPath, - initContainerBootstrap) - - val secretNamesToMountPaths = KubernetesUtils.parsePrefixedKeyValuePairs( - sparkConf, - KUBERNETES_DRIVER_SECRETS_PREFIX) - // Mount user-specified driver secrets also into the driver's init-container. The - // init-container may need credentials in the secrets to be able to download remote - // dependencies. The driver's main container and its init-container share the secrets - // because the init-container is sort of an implementation details and this sharing - // avoids introducing a dedicated configuration property just for the init-container. - val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { - Seq(new InitContainerMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) - } else { - Nil - } - - Seq(baseStep) ++ mountSecretsStep - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigurationStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigurationStep.scala deleted file mode 100644 index 0372ad5270951..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigurationStep.scala +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps.initcontainer - -/** - * Represents a step in configuring the driver init-container. 
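For reference, a minimal illustration of this contract: a hypothetical step (not part of the patch) that only adds one property to the init-container's config map, built against the trait defined just below and the `InitContainerSpec` case class removed later in this patch.

```scala
private[spark] class ExtraPropertyStep(key: String, value: String)
  extends InitContainerConfigurationStep {

  // Leave the pod and containers untouched; contribute a single entry to the properties that
  // end up in the init-container's ConfigMap.
  override def configureInitContainer(spec: InitContainerSpec): InitContainerSpec =
    spec.copy(properties = spec.properties ++ Map(key -> value))
}
```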
- */ -private[spark] trait InitContainerConfigurationStep { - - def configureInitContainer(spec: InitContainerSpec): InitContainerSpec -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala deleted file mode 100644 index 0daa7b95e8aae..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStep.scala +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps.initcontainer - -import org.apache.spark.deploy.k8s.MountSecretsBootstrap - -/** - * An init-container configuration step for mounting user-specified secrets onto user-specified - * paths. - * - * @param bootstrap a utility actually handling mounting of the secrets - */ -private[spark] class InitContainerMountSecretsStep( - bootstrap: MountSecretsBootstrap) extends InitContainerConfigurationStep { - - override def configureInitContainer(spec: InitContainerSpec) : InitContainerSpec = { - // Mount the secret volumes given that the volumes have already been added to the driver pod - // when mounting the secrets into the main driver container. - val initContainer = bootstrap.mountSecrets(spec.initContainer) - spec.copy(initContainer = initContainer) - } -} diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerSpec.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerSpec.scala deleted file mode 100644 index b52c343f0c0ed..0000000000000 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerSpec.scala +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps.initcontainer - -import io.fabric8.kubernetes.api.model.{Container, HasMetadata, Pod} - -/** - * Represents a specification of the init-container for the driver pod. - * - * @param properties properties that should be set on the init-container - * @param driverSparkConf Spark configuration properties that will be carried back to the driver - * @param initContainer the init-container object - * @param driverContainer the driver container object - * @param driverPod the driver pod object - * @param dependentResources resources the init-container depends on to work - */ -private[spark] case class InitContainerSpec( - properties: Map[String, String], - driverSparkConf: Map[String, String], - initContainer: Container, - driverContainer: Container, - driverPod: Pod, - dependentResources: Seq[HasMetadata]) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala index 47c3a4bb5bf23..6c70c6509c208 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala @@ -18,7 +18,8 @@ package org.apache.spark.scheduler.cluster.k8s import java.io.Closeable import java.net.InetAddress -import java.util.concurrent.{ConcurrentHashMap, ExecutorService, ScheduledExecutorService, TimeUnit} +import java.util.Collections +import java.util.concurrent.{ConcurrentHashMap, ExecutorService, ScheduledExecutorService, ThreadPoolExecutor, TimeUnit} import java.util.concurrent.atomic.{AtomicInteger, AtomicLong, AtomicReference} import io.fabric8.kubernetes.api.model._ diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala deleted file mode 100644 index 16780584a674a..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SecretVolumeUtils.scala +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.deploy.k8s - -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model.{Container, Pod} - -private[spark] object SecretVolumeUtils { - - def podHasVolume(pod: Pod, volumeName: String): Boolean = { - pod.getSpec.getVolumes.asScala.exists { volume => - volume.getName == volumeName - } - } - - def containerHasVolume(container: Container, volumeName: String, mountPath: String): Boolean = { - container.getVolumeMounts.asScala.exists { volumeMount => - volumeMount.getName == volumeName && volumeMount.getMountPath == mountPath - } - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala deleted file mode 100644 index e0f29ecd0fb53..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/SparkPodInitContainerSuite.scala +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.spark.deploy.k8s - -import java.io.File -import java.util.UUID - -import com.google.common.base.Charsets -import com.google.common.io.Files -import org.mockito.Mockito -import org.scalatest.BeforeAndAfter -import org.scalatest.mockito.MockitoSugar._ - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.util.Utils - -class SparkPodInitContainerSuite extends SparkFunSuite with BeforeAndAfter { - - private val DOWNLOAD_JARS_SECRET_LOCATION = createTempFile("txt") - private val DOWNLOAD_FILES_SECRET_LOCATION = createTempFile("txt") - - private var downloadJarsDir: File = _ - private var downloadFilesDir: File = _ - private var downloadJarsSecretValue: String = _ - private var downloadFilesSecretValue: String = _ - private var fileFetcher: FileFetcher = _ - - override def beforeAll(): Unit = { - downloadJarsSecretValue = Files.toString( - new File(DOWNLOAD_JARS_SECRET_LOCATION), Charsets.UTF_8) - downloadFilesSecretValue = Files.toString( - new File(DOWNLOAD_FILES_SECRET_LOCATION), Charsets.UTF_8) - } - - before { - downloadJarsDir = Utils.createTempDir() - downloadFilesDir = Utils.createTempDir() - fileFetcher = mock[FileFetcher] - } - - after { - downloadJarsDir.delete() - downloadFilesDir.delete() - } - - test("Downloads from remote server should invoke the file fetcher") { - val sparkConf = getSparkConfForRemoteFileDownloads - val initContainerUnderTest = new SparkPodInitContainer(sparkConf, fileFetcher) - initContainerUnderTest.run() - Mockito.verify(fileFetcher).fetchFile("http://localhost:9000/jar1.jar", downloadJarsDir) - Mockito.verify(fileFetcher).fetchFile("hdfs://localhost:9000/jar2.jar", downloadJarsDir) - Mockito.verify(fileFetcher).fetchFile("http://localhost:9000/file.txt", downloadFilesDir) - } - - private def getSparkConfForRemoteFileDownloads: SparkConf = { - new SparkConf(true) - .set(INIT_CONTAINER_REMOTE_JARS, - "http://localhost:9000/jar1.jar,hdfs://localhost:9000/jar2.jar") - .set(INIT_CONTAINER_REMOTE_FILES, - "http://localhost:9000/file.txt") - .set(JARS_DOWNLOAD_LOCATION, downloadJarsDir.getAbsolutePath) - .set(FILES_DOWNLOAD_LOCATION, downloadFilesDir.getAbsolutePath) - } - - private def createTempFile(extension: String): String = { - val dir = Utils.createTempDir() - val file = new File(dir, s"${UUID.randomUUID().toString}.$extension") - Files.write(UUID.randomUUID().toString, file, Charsets.UTF_8) - file.getAbsolutePath - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala deleted file mode 100644 index 65274c6f50e01..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/DriverConfigOrchestratorSuite.scala +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.submit.steps._ - -class DriverConfigOrchestratorSuite extends SparkFunSuite { - - private val DRIVER_IMAGE = "driver-image" - private val IC_IMAGE = "init-container-image" - private val APP_ID = "spark-app-id" - private val LAUNCH_TIME = 975256L - private val APP_NAME = "spark" - private val MAIN_CLASS = "org.apache.spark.examples.SparkPi" - private val APP_ARGS = Array("arg1", "arg2") - private val SECRET_FOO = "foo" - private val SECRET_BAR = "bar" - private val SECRET_MOUNT_PATH = "/etc/secrets/driver" - - test("Base submission steps with a main app resource.") { - val sparkConf = new SparkConf(false).set(CONTAINER_IMAGE, DRIVER_IMAGE) - val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") - val orchestrator = new DriverConfigOrchestrator( - APP_ID, - LAUNCH_TIME, - Some(mainAppResource), - APP_NAME, - MAIN_CLASS, - APP_ARGS, - sparkConf) - validateStepTypes( - orchestrator, - classOf[BasicDriverConfigurationStep], - classOf[DriverServiceBootstrapStep], - classOf[DriverKubernetesCredentialsStep], - classOf[DependencyResolutionStep] - ) - } - - test("Base submission steps without a main app resource.") { - val sparkConf = new SparkConf(false).set(CONTAINER_IMAGE, DRIVER_IMAGE) - val orchestrator = new DriverConfigOrchestrator( - APP_ID, - LAUNCH_TIME, - Option.empty, - APP_NAME, - MAIN_CLASS, - APP_ARGS, - sparkConf) - validateStepTypes( - orchestrator, - classOf[BasicDriverConfigurationStep], - classOf[DriverServiceBootstrapStep], - classOf[DriverKubernetesCredentialsStep] - ) - } - - test("Submission steps with an init-container.") { - val sparkConf = new SparkConf(false) - .set(CONTAINER_IMAGE, DRIVER_IMAGE) - .set(INIT_CONTAINER_IMAGE.key, IC_IMAGE) - .set("spark.jars", "hdfs://localhost:9000/var/apps/jars/jar1.jar") - val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") - val orchestrator = new DriverConfigOrchestrator( - APP_ID, - LAUNCH_TIME, - Some(mainAppResource), - APP_NAME, - MAIN_CLASS, - APP_ARGS, - sparkConf) - validateStepTypes( - orchestrator, - classOf[BasicDriverConfigurationStep], - classOf[DriverServiceBootstrapStep], - classOf[DriverKubernetesCredentialsStep], - classOf[DependencyResolutionStep], - classOf[DriverInitContainerBootstrapStep]) - } - - test("Submission steps with driver secrets to mount") { - val sparkConf = new SparkConf(false) - .set(CONTAINER_IMAGE, DRIVER_IMAGE) - .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_FOO", SECRET_MOUNT_PATH) - .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_BAR", SECRET_MOUNT_PATH) - val mainAppResource = JavaMainAppResource("local:///var/apps/jars/main.jar") - val orchestrator = new DriverConfigOrchestrator( - APP_ID, - LAUNCH_TIME, - Some(mainAppResource), - APP_NAME, - MAIN_CLASS, - APP_ARGS, - sparkConf) - validateStepTypes( - orchestrator, - classOf[BasicDriverConfigurationStep], - classOf[DriverServiceBootstrapStep], - classOf[DriverKubernetesCredentialsStep], - 
classOf[DependencyResolutionStep], - classOf[DriverMountSecretsStep]) - } - - private def validateStepTypes( - orchestrator: DriverConfigOrchestrator, - types: Class[_ <: DriverConfigurationStep]*): Unit = { - val steps = orchestrator.getAllConfigurationSteps - assert(steps.size === types.size) - assert(steps.map(_.getClass) === types) - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala deleted file mode 100644 index b136f2c02ffba..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/BasicDriverConfigurationStepSuite.scala +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model.{ContainerBuilder, HasMetadata, PodBuilder} - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec - -class BasicDriverConfigurationStepSuite extends SparkFunSuite { - - private val APP_ID = "spark-app-id" - private val RESOURCE_NAME_PREFIX = "spark" - private val DRIVER_LABELS = Map("labelkey" -> "labelvalue") - private val CONTAINER_IMAGE_PULL_POLICY = "IfNotPresent" - private val APP_NAME = "spark-test" - private val MAIN_CLASS = "org.apache.spark.examples.SparkPi" - private val APP_ARGS = Array("arg1", "arg2", "\"arg 3\"") - private val CUSTOM_ANNOTATION_KEY = "customAnnotation" - private val CUSTOM_ANNOTATION_VALUE = "customAnnotationValue" - private val DRIVER_CUSTOM_ENV_KEY1 = "customDriverEnv1" - private val DRIVER_CUSTOM_ENV_KEY2 = "customDriverEnv2" - - test("Set all possible configurations from the user.") { - val sparkConf = new SparkConf() - .set(KUBERNETES_DRIVER_POD_NAME, "spark-driver-pod") - .set(org.apache.spark.internal.config.DRIVER_CLASS_PATH, "/opt/spark/spark-examples.jar") - .set("spark.driver.cores", "2") - .set(KUBERNETES_DRIVER_LIMIT_CORES, "4") - .set(org.apache.spark.internal.config.DRIVER_MEMORY.key, "256M") - .set(org.apache.spark.internal.config.DRIVER_MEMORY_OVERHEAD, 200L) - .set(CONTAINER_IMAGE, "spark-driver:latest") - .set(s"$KUBERNETES_DRIVER_ANNOTATION_PREFIX$CUSTOM_ANNOTATION_KEY", CUSTOM_ANNOTATION_VALUE) - .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY1", "customDriverEnv1") - .set(s"$KUBERNETES_DRIVER_ENV_KEY$DRIVER_CUSTOM_ENV_KEY2", "customDriverEnv2") - - val submissionStep = new BasicDriverConfigurationStep( - APP_ID, - 
RESOURCE_NAME_PREFIX, - DRIVER_LABELS, - CONTAINER_IMAGE_PULL_POLICY, - APP_NAME, - MAIN_CLASS, - APP_ARGS, - sparkConf) - val basePod = new PodBuilder().withNewMetadata().endMetadata().withNewSpec().endSpec().build() - val baseDriverSpec = KubernetesDriverSpec( - driverPod = basePod, - driverContainer = new ContainerBuilder().build(), - driverSparkConf = new SparkConf(false), - otherKubernetesResources = Seq.empty[HasMetadata]) - val preparedDriverSpec = submissionStep.configureDriver(baseDriverSpec) - - assert(preparedDriverSpec.driverContainer.getName === DRIVER_CONTAINER_NAME) - assert(preparedDriverSpec.driverContainer.getImage === "spark-driver:latest") - assert(preparedDriverSpec.driverContainer.getImagePullPolicy === CONTAINER_IMAGE_PULL_POLICY) - - assert(preparedDriverSpec.driverContainer.getEnv.size === 7) - val envs = preparedDriverSpec.driverContainer - .getEnv - .asScala - .map(env => (env.getName, env.getValue)) - .toMap - assert(envs(ENV_CLASSPATH) === "/opt/spark/spark-examples.jar") - assert(envs(ENV_DRIVER_MEMORY) === "256M") - assert(envs(ENV_DRIVER_MAIN_CLASS) === MAIN_CLASS) - assert(envs(ENV_DRIVER_ARGS) === "arg1 arg2 \"arg 3\"") - assert(envs(DRIVER_CUSTOM_ENV_KEY1) === "customDriverEnv1") - assert(envs(DRIVER_CUSTOM_ENV_KEY2) === "customDriverEnv2") - - assert(preparedDriverSpec.driverContainer.getEnv.asScala.exists(envVar => - envVar.getName.equals(ENV_DRIVER_BIND_ADDRESS) && - envVar.getValueFrom.getFieldRef.getApiVersion.equals("v1") && - envVar.getValueFrom.getFieldRef.getFieldPath.equals("status.podIP"))) - - val resourceRequirements = preparedDriverSpec.driverContainer.getResources - val requests = resourceRequirements.getRequests.asScala - assert(requests("cpu").getAmount === "2") - assert(requests("memory").getAmount === "256Mi") - val limits = resourceRequirements.getLimits.asScala - assert(limits("memory").getAmount === "456Mi") - assert(limits("cpu").getAmount === "4") - - val driverPodMetadata = preparedDriverSpec.driverPod.getMetadata - assert(driverPodMetadata.getName === "spark-driver-pod") - assert(driverPodMetadata.getLabels.asScala === DRIVER_LABELS) - val expectedAnnotations = Map( - CUSTOM_ANNOTATION_KEY -> CUSTOM_ANNOTATION_VALUE, - SPARK_APP_NAME_ANNOTATION -> APP_NAME) - assert(driverPodMetadata.getAnnotations.asScala === expectedAnnotations) - assert(preparedDriverSpec.driverPod.getSpec.getRestartPolicy === "Never") - - val resolvedSparkConf = preparedDriverSpec.driverSparkConf.getAll.toMap - val expectedSparkConf = Map( - KUBERNETES_DRIVER_POD_NAME.key -> "spark-driver-pod", - "spark.app.id" -> APP_ID, - KUBERNETES_EXECUTOR_POD_NAME_PREFIX.key -> RESOURCE_NAME_PREFIX) - assert(resolvedSparkConf === expectedSparkConf) - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStepSuite.scala deleted file mode 100644 index 991b03cafb76c..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DependencyResolutionStepSuite.scala +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import java.io.File - -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model.{ContainerBuilder, HasMetadata, PodBuilder} - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec - -class DependencyResolutionStepSuite extends SparkFunSuite { - - private val SPARK_JARS = Seq( - "hdfs://localhost:9000/apps/jars/jar1.jar", - "file:///home/user/apps/jars/jar2.jar", - "local:///var/apps/jars/jar3.jar") - - private val SPARK_FILES = Seq( - "file:///home/user/apps/files/file1.txt", - "hdfs://localhost:9000/apps/files/file2.txt", - "local:///var/apps/files/file3.txt") - - private val JARS_DOWNLOAD_PATH = "/mnt/spark-data/jars" - private val FILES_DOWNLOAD_PATH = "/mnt/spark-data/files" - - test("Added dependencies should be resolved in Spark configuration and environment") { - val dependencyResolutionStep = new DependencyResolutionStep( - SPARK_JARS, - SPARK_FILES, - JARS_DOWNLOAD_PATH, - FILES_DOWNLOAD_PATH) - val driverPod = new PodBuilder().build() - val baseDriverSpec = KubernetesDriverSpec( - driverPod = driverPod, - driverContainer = new ContainerBuilder().build(), - driverSparkConf = new SparkConf(false), - otherKubernetesResources = Seq.empty[HasMetadata]) - val preparedDriverSpec = dependencyResolutionStep.configureDriver(baseDriverSpec) - assert(preparedDriverSpec.driverPod === driverPod) - assert(preparedDriverSpec.otherKubernetesResources.isEmpty) - val resolvedSparkJars = preparedDriverSpec.driverSparkConf.get("spark.jars").split(",").toSet - val expectedResolvedSparkJars = Set( - "hdfs://localhost:9000/apps/jars/jar1.jar", - s"$JARS_DOWNLOAD_PATH/jar2.jar", - "/var/apps/jars/jar3.jar") - assert(resolvedSparkJars === expectedResolvedSparkJars) - val resolvedSparkFiles = preparedDriverSpec.driverSparkConf.get("spark.files").split(",").toSet - val expectedResolvedSparkFiles = Set( - s"$FILES_DOWNLOAD_PATH/file1.txt", - s"hdfs://localhost:9000/apps/files/file2.txt", - s"/var/apps/files/file3.txt") - assert(resolvedSparkFiles === expectedResolvedSparkFiles) - val driverEnv = preparedDriverSpec.driverContainer.getEnv.asScala - assert(driverEnv.size === 1) - assert(driverEnv.head.getName === ENV_MOUNTED_CLASSPATH) - val resolvedDriverClasspath = driverEnv.head.getValue.split(File.pathSeparator).toSet - val expectedResolvedDriverClasspath = Set( - s"$JARS_DOWNLOAD_PATH/jar1.jar", - s"$JARS_DOWNLOAD_PATH/jar2.jar", - "/var/apps/jars/jar3.jar") - assert(resolvedDriverClasspath === expectedResolvedDriverClasspath) - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStepSuite.scala deleted file mode 100644 
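The deleted DependencyResolutionStepSuite above encodes a URI-resolution convention: hdfs:// URIs stay remote, file:// URIs are rewritten under the in-pod download directory, and local:// URIs point at paths already baked into the container image. A small sketch of that rule (hypothetical helper, not the step's actual implementation):

```scala
import java.io.File
import java.net.URI

object DependencyResolutionSketch {
  def resolveFileUri(uri: String, downloadPath: String): String = {
    val parsed = URI.create(uri)
    Option(parsed.getScheme).getOrElse("file") match {
      case "file"  => s"$downloadPath/${new File(parsed.getPath).getName}"
      case "local" => parsed.getPath
      case _       => uri // remote URIs such as hdfs:// are left untouched
    }
  }

  def main(args: Array[String]): Unit = {
    // => /mnt/spark-data/jars/jar2.jar (downloaded into the pod)
    println(resolveFileUri("file:///home/user/apps/jars/jar2.jar", "/mnt/spark-data/jars"))
    // => /var/apps/jars/jar3.jar (already present in the image)
    println(resolveFileUri("local:///var/apps/jars/jar3.jar", "/mnt/spark-data/jars"))
  }
}
```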
index 758871e2ba356..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverInitContainerBootstrapStepSuite.scala +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import java.io.StringReader -import java.util.Properties - -import scala.collection.JavaConverters._ - -import com.google.common.collect.Maps -import io.fabric8.kubernetes.api.model.{ConfigMap, ContainerBuilder, HasMetadata, PodBuilder, SecretBuilder} - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec -import org.apache.spark.deploy.k8s.submit.steps.initcontainer.{InitContainerConfigurationStep, InitContainerSpec} -import org.apache.spark.util.Utils - -class DriverInitContainerBootstrapStepSuite extends SparkFunSuite { - - private val CONFIG_MAP_NAME = "spark-init-config-map" - private val CONFIG_MAP_KEY = "spark-init-config-map-key" - - test("The init container bootstrap step should use all of the init container steps") { - val baseDriverSpec = KubernetesDriverSpec( - driverPod = new PodBuilder().build(), - driverContainer = new ContainerBuilder().build(), - driverSparkConf = new SparkConf(false), - otherKubernetesResources = Seq.empty[HasMetadata]) - val initContainerSteps = Seq( - FirstTestInitContainerConfigurationStep, - SecondTestInitContainerConfigurationStep) - val bootstrapStep = new DriverInitContainerBootstrapStep( - initContainerSteps, - CONFIG_MAP_NAME, - CONFIG_MAP_KEY) - - val preparedDriverSpec = bootstrapStep.configureDriver(baseDriverSpec) - - assert(preparedDriverSpec.driverPod.getMetadata.getLabels.asScala === - FirstTestInitContainerConfigurationStep.additionalLabels) - val additionalDriverEnv = preparedDriverSpec.driverContainer.getEnv.asScala - assert(additionalDriverEnv.size === 1) - assert(additionalDriverEnv.head.getName === - FirstTestInitContainerConfigurationStep.additionalMainContainerEnvKey) - assert(additionalDriverEnv.head.getValue === - FirstTestInitContainerConfigurationStep.additionalMainContainerEnvValue) - - assert(preparedDriverSpec.otherKubernetesResources.size === 2) - assert(preparedDriverSpec.otherKubernetesResources.contains( - FirstTestInitContainerConfigurationStep.additionalKubernetesResource)) - assert(preparedDriverSpec.otherKubernetesResources.exists { - case configMap: ConfigMap => - val hasMatchingName = configMap.getMetadata.getName == CONFIG_MAP_NAME - val configMapData = configMap.getData.asScala - val hasCorrectNumberOfEntries = configMapData.size == 1 - val initContainerPropertiesRaw = configMapData(CONFIG_MAP_KEY) - val initContainerProperties = new Properties() - 
Utils.tryWithResource(new StringReader(initContainerPropertiesRaw)) { - initContainerProperties.load(_) - } - val initContainerPropertiesMap = Maps.fromProperties(initContainerProperties).asScala - val expectedInitContainerProperties = Map( - SecondTestInitContainerConfigurationStep.additionalInitContainerPropertyKey -> - SecondTestInitContainerConfigurationStep.additionalInitContainerPropertyValue) - val hasMatchingProperties = initContainerPropertiesMap == expectedInitContainerProperties - hasMatchingName && hasCorrectNumberOfEntries && hasMatchingProperties - - case _ => false - }) - - val initContainers = preparedDriverSpec.driverPod.getSpec.getInitContainers - assert(initContainers.size() === 1) - val initContainerEnv = initContainers.get(0).getEnv.asScala - assert(initContainerEnv.size === 1) - assert(initContainerEnv.head.getName === - SecondTestInitContainerConfigurationStep.additionalInitContainerEnvKey) - assert(initContainerEnv.head.getValue === - SecondTestInitContainerConfigurationStep.additionalInitContainerEnvValue) - - val expectedSparkConf = Map( - INIT_CONTAINER_CONFIG_MAP_NAME.key -> CONFIG_MAP_NAME, - INIT_CONTAINER_CONFIG_MAP_KEY_CONF.key -> CONFIG_MAP_KEY, - SecondTestInitContainerConfigurationStep.additionalDriverSparkConfKey -> - SecondTestInitContainerConfigurationStep.additionalDriverSparkConfValue) - assert(preparedDriverSpec.driverSparkConf.getAll.toMap === expectedSparkConf) - } -} - -private object FirstTestInitContainerConfigurationStep extends InitContainerConfigurationStep { - - val additionalLabels = Map("additionalLabelkey" -> "additionalLabelValue") - val additionalMainContainerEnvKey = "TEST_ENV_MAIN_KEY" - val additionalMainContainerEnvValue = "TEST_ENV_MAIN_VALUE" - val additionalKubernetesResource = new SecretBuilder() - .withNewMetadata() - .withName("test-secret") - .endMetadata() - .addToData("secret-key", "secret-value") - .build() - - override def configureInitContainer(initContainerSpec: InitContainerSpec): InitContainerSpec = { - val driverPod = new PodBuilder(initContainerSpec.driverPod) - .editOrNewMetadata() - .addToLabels(additionalLabels.asJava) - .endMetadata() - .build() - val mainContainer = new ContainerBuilder(initContainerSpec.driverContainer) - .addNewEnv() - .withName(additionalMainContainerEnvKey) - .withValue(additionalMainContainerEnvValue) - .endEnv() - .build() - initContainerSpec.copy( - driverPod = driverPod, - driverContainer = mainContainer, - dependentResources = initContainerSpec.dependentResources ++ - Seq(additionalKubernetesResource)) - } -} - -private object SecondTestInitContainerConfigurationStep extends InitContainerConfigurationStep { - val additionalInitContainerEnvKey = "TEST_ENV_INIT_KEY" - val additionalInitContainerEnvValue = "TEST_ENV_INIT_VALUE" - val additionalInitContainerPropertyKey = "spark.initcontainer.testkey" - val additionalInitContainerPropertyValue = "testvalue" - val additionalDriverSparkConfKey = "spark.driver.testkey" - val additionalDriverSparkConfValue = "spark.driver.testvalue" - - override def configureInitContainer(initContainerSpec: InitContainerSpec): InitContainerSpec = { - val initContainer = new ContainerBuilder(initContainerSpec.initContainer) - .addNewEnv() - .withName(additionalInitContainerEnvKey) - .withValue(additionalInitContainerEnvValue) - .endEnv() - .build() - val initContainerProperties = initContainerSpec.properties ++ - Map(additionalInitContainerPropertyKey -> additionalInitContainerPropertyValue) - val driverSparkConf = initContainerSpec.driverSparkConf ++ - 
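The ConfigMap assertion above relies on the init-container properties being stored as a plain java.util.Properties document under the config-map key. A minimal round-trip sketch, reusing the same property key the test injects:

```scala
import java.io.{StringReader, StringWriter}
import java.util.Properties

object InitContainerPropertiesSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.setProperty("spark.initcontainer.testkey", "testvalue")

    // What would be stored as the ConfigMap value under CONFIG_MAP_KEY...
    val writer = new StringWriter()
    props.store(writer, "init-container properties")
    val serialized = writer.toString

    // ...and what the assertion above does when it loads the value back.
    val reloaded = new Properties()
    reloaded.load(new StringReader(serialized))
    assert(reloaded.getProperty("spark.initcontainer.testkey") == "testvalue")
  }
}
```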
Map(additionalDriverSparkConfKey -> additionalDriverSparkConfValue) - initContainerSpec.copy( - initContainer = initContainer, - properties = initContainerProperties, - driverSparkConf = driverSparkConf) - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStepSuite.scala deleted file mode 100644 index 64553d25883bb..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverKubernetesCredentialsStepSuite.scala +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps - -import java.io.File - -import scala.collection.JavaConverters._ - -import com.google.common.base.Charsets -import com.google.common.io.{BaseEncoding, Files} -import io.fabric8.kubernetes.api.model.{ContainerBuilder, HasMetadata, PodBuilder, Secret} -import org.scalatest.BeforeAndAfter - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec -import org.apache.spark.util.Utils - -class DriverKubernetesCredentialsStepSuite extends SparkFunSuite with BeforeAndAfter { - - private val KUBERNETES_RESOURCE_NAME_PREFIX = "spark" - private var credentialsTempDirectory: File = _ - private val BASE_DRIVER_SPEC = new KubernetesDriverSpec( - driverPod = new PodBuilder().build(), - driverContainer = new ContainerBuilder().build(), - driverSparkConf = new SparkConf(false), - otherKubernetesResources = Seq.empty[HasMetadata]) - - before { - credentialsTempDirectory = Utils.createTempDir() - } - - after { - credentialsTempDirectory.delete() - } - - test("Don't set any credentials") { - val kubernetesCredentialsStep = new DriverKubernetesCredentialsStep( - new SparkConf(false), KUBERNETES_RESOURCE_NAME_PREFIX) - val preparedDriverSpec = kubernetesCredentialsStep.configureDriver(BASE_DRIVER_SPEC) - assert(preparedDriverSpec.driverPod === BASE_DRIVER_SPEC.driverPod) - assert(preparedDriverSpec.driverContainer === BASE_DRIVER_SPEC.driverContainer) - assert(preparedDriverSpec.otherKubernetesResources.isEmpty) - assert(preparedDriverSpec.driverSparkConf.getAll.isEmpty) - } - - test("Only set credentials that are manually mounted.") { - val submissionSparkConf = new SparkConf(false) - .set( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$OAUTH_TOKEN_FILE_CONF_SUFFIX", - "/mnt/secrets/my-token.txt") - .set( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX", - 
"/mnt/secrets/my-key.pem") - .set( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX", - "/mnt/secrets/my-cert.pem") - .set( - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX", - "/mnt/secrets/my-ca.pem") - - val kubernetesCredentialsStep = new DriverKubernetesCredentialsStep( - submissionSparkConf, KUBERNETES_RESOURCE_NAME_PREFIX) - val preparedDriverSpec = kubernetesCredentialsStep.configureDriver(BASE_DRIVER_SPEC) - assert(preparedDriverSpec.driverPod === BASE_DRIVER_SPEC.driverPod) - assert(preparedDriverSpec.driverContainer === BASE_DRIVER_SPEC.driverContainer) - assert(preparedDriverSpec.otherKubernetesResources.isEmpty) - assert(preparedDriverSpec.driverSparkConf.getAll.toMap === submissionSparkConf.getAll.toMap) - } - - test("Mount credentials from the submission client as a secret.") { - val caCertFile = writeCredentials("ca.pem", "ca-cert") - val clientKeyFile = writeCredentials("key.pem", "key") - val clientCertFile = writeCredentials("cert.pem", "cert") - val submissionSparkConf = new SparkConf(false) - .set( - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$OAUTH_TOKEN_CONF_SUFFIX", - "token") - .set( - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX", - clientKeyFile.getAbsolutePath) - .set( - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX", - clientCertFile.getAbsolutePath) - .set( - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX", - caCertFile.getAbsolutePath) - val kubernetesCredentialsStep = new DriverKubernetesCredentialsStep( - submissionSparkConf, KUBERNETES_RESOURCE_NAME_PREFIX) - val preparedDriverSpec = kubernetesCredentialsStep.configureDriver( - BASE_DRIVER_SPEC.copy(driverSparkConf = submissionSparkConf)) - val expectedSparkConf = Map( - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$OAUTH_TOKEN_CONF_SUFFIX" -> "", - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$OAUTH_TOKEN_FILE_CONF_SUFFIX" -> - DRIVER_CREDENTIALS_OAUTH_TOKEN_PATH, - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX" -> - DRIVER_CREDENTIALS_CLIENT_KEY_PATH, - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX" -> - DRIVER_CREDENTIALS_CLIENT_CERT_PATH, - s"$KUBERNETES_AUTH_DRIVER_MOUNTED_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX" -> - DRIVER_CREDENTIALS_CA_CERT_PATH, - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_KEY_FILE_CONF_SUFFIX" -> - clientKeyFile.getAbsolutePath, - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CLIENT_CERT_FILE_CONF_SUFFIX" -> - clientCertFile.getAbsolutePath, - s"$KUBERNETES_AUTH_DRIVER_CONF_PREFIX.$CA_CERT_FILE_CONF_SUFFIX" -> - caCertFile.getAbsolutePath) - assert(preparedDriverSpec.driverSparkConf.getAll.toMap === expectedSparkConf) - assert(preparedDriverSpec.otherKubernetesResources.size === 1) - val credentialsSecret = preparedDriverSpec.otherKubernetesResources.head.asInstanceOf[Secret] - assert(credentialsSecret.getMetadata.getName === - s"$KUBERNETES_RESOURCE_NAME_PREFIX-kubernetes-credentials") - val decodedSecretData = credentialsSecret.getData.asScala.map { data => - (data._1, new String(BaseEncoding.base64().decode(data._2), Charsets.UTF_8)) - } - val expectedSecretData = Map( - DRIVER_CREDENTIALS_CA_CERT_SECRET_NAME -> "ca-cert", - DRIVER_CREDENTIALS_OAUTH_TOKEN_SECRET_NAME -> "token", - DRIVER_CREDENTIALS_CLIENT_KEY_SECRET_NAME -> "key", - DRIVER_CREDENTIALS_CLIENT_CERT_SECRET_NAME -> "cert") - assert(decodedSecretData === expectedSecretData) - val driverPodVolumes = preparedDriverSpec.driverPod.getSpec.getVolumes.asScala - 
assert(driverPodVolumes.size === 1) - assert(driverPodVolumes.head.getName === DRIVER_CREDENTIALS_SECRET_VOLUME_NAME) - assert(driverPodVolumes.head.getSecret != null) - assert(driverPodVolumes.head.getSecret.getSecretName === credentialsSecret.getMetadata.getName) - val driverContainerVolumeMount = preparedDriverSpec.driverContainer.getVolumeMounts.asScala - assert(driverContainerVolumeMount.size === 1) - assert(driverContainerVolumeMount.head.getName === DRIVER_CREDENTIALS_SECRET_VOLUME_NAME) - assert(driverContainerVolumeMount.head.getMountPath === DRIVER_CREDENTIALS_SECRETS_BASE_DIR) - } - - private def writeCredentials(credentialsFileName: String, credentialsContents: String): File = { - val credentialsFile = new File(credentialsTempDirectory, credentialsFileName) - Files.write(credentialsContents, credentialsFile, Charsets.UTF_8) - credentialsFile - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala deleted file mode 100644 index 960d0bda1d011..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverMountSecretsStepSuite.scala +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
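The deleted credentials test above decodes the Secret data with BaseEncoding.base64(); the corresponding encoding side simply base64-encodes each credential file's bytes into the Secret's data map. A sketch with java.util.Base64 (helper names are hypothetical):

```scala
import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import java.util.Base64

object CredentialsSecretSketch {
  // Encoding side: one Secret data entry per credential file.
  def secretEntry(secretKey: String, credentialFilePath: String): (String, String) = {
    val bytes = Files.readAllBytes(Paths.get(credentialFilePath))
    secretKey -> Base64.getEncoder.encodeToString(bytes)
  }

  // Decoding side, equivalent to the BaseEncoding.base64() call in the test above.
  def decode(encoded: String): String =
    new String(Base64.getDecoder.decode(encoded), StandardCharsets.UTF_8)
}
```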
- */ -package org.apache.spark.deploy.k8s.submit.steps - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.{MountSecretsBootstrap, SecretVolumeUtils} -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec - -class DriverMountSecretsStepSuite extends SparkFunSuite { - - private val SECRET_FOO = "foo" - private val SECRET_BAR = "bar" - private val SECRET_MOUNT_PATH = "/etc/secrets/driver" - - test("mounts all given secrets") { - val baseDriverSpec = KubernetesDriverSpec.initialSpec(new SparkConf(false)) - val secretNamesToMountPaths = Map( - SECRET_FOO -> SECRET_MOUNT_PATH, - SECRET_BAR -> SECRET_MOUNT_PATH) - - val mountSecretsBootstrap = new MountSecretsBootstrap(secretNamesToMountPaths) - val mountSecretsStep = new DriverMountSecretsStep(mountSecretsBootstrap) - val configuredDriverSpec = mountSecretsStep.configureDriver(baseDriverSpec) - val driverPodWithSecretsMounted = configuredDriverSpec.driverPod - val driverContainerWithSecretsMounted = configuredDriverSpec.driverContainer - - Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach { volumeName => - assert(SecretVolumeUtils.podHasVolume(driverPodWithSecretsMounted, volumeName)) - } - Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach { volumeName => - assert(SecretVolumeUtils.containerHasVolume( - driverContainerWithSecretsMounted, volumeName, SECRET_MOUNT_PATH)) - } - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala deleted file mode 100644 index 78c8c3ba1afbd..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/DriverServiceBootstrapStepSuite.scala +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
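Both mount-secrets suites above assert the same output shape from MountSecretsBootstrap: a volume named "<secret>-volume" backed by the secret, plus a matching volume mount at the configured path. A sketch of that shape using the fabric8 builders; it mirrors what the tests expect, not necessarily the bootstrap's exact code:

```scala
import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, Pod, PodBuilder}

object MountSecretsSketch {
  def podWithSecretVolume(pod: Pod, secretName: String): Pod =
    new PodBuilder(pod)
      .editOrNewSpec()
        .addNewVolume()
          .withName(s"$secretName-volume")
          .withNewSecret().withSecretName(secretName).endSecret()
        .endVolume()
      .endSpec()
      .build()

  def containerWithSecretMount(container: Container, secretName: String, path: String): Container =
    new ContainerBuilder(container)
      .addNewVolumeMount()
        .withName(s"$secretName-volume")
        .withMountPath(path)
      .endVolumeMount()
      .build()
}
```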
- */ -package org.apache.spark.deploy.k8s.submit.steps - -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model.Service -import org.mockito.{Mock, MockitoAnnotations} -import org.mockito.Mockito.when -import org.scalatest.BeforeAndAfter - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ -import org.apache.spark.deploy.k8s.submit.KubernetesDriverSpec -import org.apache.spark.util.Clock - -class DriverServiceBootstrapStepSuite extends SparkFunSuite with BeforeAndAfter { - - private val SHORT_RESOURCE_NAME_PREFIX = - "a" * (DriverServiceBootstrapStep.MAX_SERVICE_NAME_LENGTH - - DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX.length) - - private val LONG_RESOURCE_NAME_PREFIX = - "a" * (DriverServiceBootstrapStep.MAX_SERVICE_NAME_LENGTH - - DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX.length + 1) - private val DRIVER_LABELS = Map( - "label1key" -> "label1value", - "label2key" -> "label2value") - - @Mock - private var clock: Clock = _ - - private var sparkConf: SparkConf = _ - - before { - MockitoAnnotations.initMocks(this) - sparkConf = new SparkConf(false) - } - - test("Headless service has a port for the driver RPC and the block manager.") { - val configurationStep = new DriverServiceBootstrapStep( - SHORT_RESOURCE_NAME_PREFIX, - DRIVER_LABELS, - sparkConf - .set("spark.driver.port", "9000") - .set(org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT, 8080), - clock) - val baseDriverSpec = KubernetesDriverSpec.initialSpec(sparkConf.clone()) - val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) - assert(resolvedDriverSpec.otherKubernetesResources.size === 1) - assert(resolvedDriverSpec.otherKubernetesResources.head.isInstanceOf[Service]) - val driverService = resolvedDriverSpec.otherKubernetesResources.head.asInstanceOf[Service] - verifyService( - 9000, - 8080, - s"$SHORT_RESOURCE_NAME_PREFIX${DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX}", - driverService) - } - - test("Hostname and ports are set according to the service name.") { - val configurationStep = new DriverServiceBootstrapStep( - SHORT_RESOURCE_NAME_PREFIX, - DRIVER_LABELS, - sparkConf - .set("spark.driver.port", "9000") - .set(org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT, 8080) - .set(KUBERNETES_NAMESPACE, "my-namespace"), - clock) - val baseDriverSpec = KubernetesDriverSpec.initialSpec(sparkConf.clone()) - val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) - val expectedServiceName = SHORT_RESOURCE_NAME_PREFIX + - DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX - val expectedHostName = s"$expectedServiceName.my-namespace.svc" - verifySparkConfHostNames(resolvedDriverSpec.driverSparkConf, expectedHostName) - } - - test("Ports should resolve to defaults in SparkConf and in the service.") { - val configurationStep = new DriverServiceBootstrapStep( - SHORT_RESOURCE_NAME_PREFIX, - DRIVER_LABELS, - sparkConf, - clock) - val baseDriverSpec = KubernetesDriverSpec.initialSpec(sparkConf.clone()) - val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) - verifyService( - DEFAULT_DRIVER_PORT, - DEFAULT_BLOCKMANAGER_PORT, - s"$SHORT_RESOURCE_NAME_PREFIX${DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX}", - resolvedDriverSpec.otherKubernetesResources.head.asInstanceOf[Service]) - assert(resolvedDriverSpec.driverSparkConf.get("spark.driver.port") === - DEFAULT_DRIVER_PORT.toString) - assert(resolvedDriverSpec.driverSparkConf.get( - 
org.apache.spark.internal.config.DRIVER_BLOCK_MANAGER_PORT) === DEFAULT_BLOCKMANAGER_PORT) - } - - test("Long prefixes should switch to using a generated name.") { - val configurationStep = new DriverServiceBootstrapStep( - LONG_RESOURCE_NAME_PREFIX, - DRIVER_LABELS, - sparkConf.set(KUBERNETES_NAMESPACE, "my-namespace"), - clock) - when(clock.getTimeMillis()).thenReturn(10000) - val baseDriverSpec = KubernetesDriverSpec.initialSpec(sparkConf.clone()) - val resolvedDriverSpec = configurationStep.configureDriver(baseDriverSpec) - val driverService = resolvedDriverSpec.otherKubernetesResources.head.asInstanceOf[Service] - val expectedServiceName = s"spark-10000${DriverServiceBootstrapStep.DRIVER_SVC_POSTFIX}" - assert(driverService.getMetadata.getName === expectedServiceName) - val expectedHostName = s"$expectedServiceName.my-namespace.svc" - verifySparkConfHostNames(resolvedDriverSpec.driverSparkConf, expectedHostName) - } - - test("Disallow bind address and driver host to be set explicitly.") { - val configurationStep = new DriverServiceBootstrapStep( - LONG_RESOURCE_NAME_PREFIX, - DRIVER_LABELS, - sparkConf.set(org.apache.spark.internal.config.DRIVER_BIND_ADDRESS, "host"), - clock) - try { - configurationStep.configureDriver(KubernetesDriverSpec.initialSpec(sparkConf)) - fail("The driver bind address should not be allowed.") - } catch { - case e: Throwable => - assert(e.getMessage === - s"requirement failed: ${DriverServiceBootstrapStep.DRIVER_BIND_ADDRESS_KEY} is" + - " not supported in Kubernetes mode, as the driver's bind address is managed" + - " and set to the driver pod's IP address.") - } - sparkConf.remove(org.apache.spark.internal.config.DRIVER_BIND_ADDRESS) - sparkConf.set(org.apache.spark.internal.config.DRIVER_HOST_ADDRESS, "host") - try { - configurationStep.configureDriver(KubernetesDriverSpec.initialSpec(sparkConf)) - fail("The driver host address should not be allowed.") - } catch { - case e: Throwable => - assert(e.getMessage === - s"requirement failed: ${DriverServiceBootstrapStep.DRIVER_HOST_KEY} is" + - " not supported in Kubernetes mode, as the driver's hostname will be managed via" + - " a Kubernetes service.") - } - } - - private def verifyService( - driverPort: Int, - blockManagerPort: Int, - expectedServiceName: String, - service: Service): Unit = { - assert(service.getMetadata.getName === expectedServiceName) - assert(service.getSpec.getClusterIP === "None") - assert(service.getSpec.getSelector.asScala === DRIVER_LABELS) - assert(service.getSpec.getPorts.size() === 2) - val driverServicePorts = service.getSpec.getPorts.asScala - assert(driverServicePorts.head.getName === DRIVER_PORT_NAME) - assert(driverServicePorts.head.getPort.intValue() === driverPort) - assert(driverServicePorts.head.getTargetPort.getIntVal === driverPort) - assert(driverServicePorts(1).getName === BLOCK_MANAGER_PORT_NAME) - assert(driverServicePorts(1).getPort.intValue() === blockManagerPort) - assert(driverServicePorts(1).getTargetPort.getIntVal === blockManagerPort) - } - - private def verifySparkConfHostNames( - driverSparkConf: SparkConf, expectedHostName: String): Unit = { - assert(driverSparkConf.get( - org.apache.spark.internal.config.DRIVER_HOST_ADDRESS) === expectedHostName) - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStepSuite.scala 
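The deleted DriverServiceBootstrapStep tests above exercise a naming rule: use "<resource-name-prefix><postfix>" when it fits, fall back to a clock-based "spark-<millis><postfix>" name when the prefix is too long, and derive the driver host as "<service>.<namespace>.svc". A sketch under the assumption that the postfix is "-driver-svc" and the length limit is 63 characters (both are assumptions here, not taken from the hunks above):

```scala
object DriverServiceNamingSketch {
  // Assumed constants standing in for DRIVER_SVC_POSTFIX and MAX_SERVICE_NAME_LENGTH.
  private val DriverSvcPostfix = "-driver-svc"
  private val MaxServiceNameLength = 63

  def serviceName(resourceNamePrefix: String, nowMillis: Long): String = {
    val preferred = s"$resourceNamePrefix$DriverSvcPostfix"
    if (preferred.length <= MaxServiceNameLength) preferred
    else s"spark-$nowMillis$DriverSvcPostfix"
  }

  def driverHostName(service: String, namespace: String): String =
    s"$service.$namespace.svc"

  def main(args: Array[String]): Unit = {
    // Mirrors the long-prefix test above, where the mocked clock returns 10000.
    println(driverHostName(serviceName("a" * 80, nowMillis = 10000L), "my-namespace"))
    // => spark-10000-driver-svc.my-namespace.svc
  }
}
```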
b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStepSuite.scala deleted file mode 100644 index 4553f9f6b1d45..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/BasicInitContainerConfigurationStepSuite.scala +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps.initcontainer - -import scala.collection.JavaConverters._ - -import io.fabric8.kubernetes.api.model._ -import org.mockito.{Mock, MockitoAnnotations} -import org.mockito.Matchers.any -import org.mockito.Mockito.when -import org.mockito.invocation.InvocationOnMock -import org.mockito.stubbing.Answer -import org.scalatest.BeforeAndAfter - -import org.apache.spark.SparkFunSuite -import org.apache.spark.deploy.k8s.{InitContainerBootstrap, PodWithDetachedInitContainer} -import org.apache.spark.deploy.k8s.Config._ - -class BasicInitContainerConfigurationStepSuite extends SparkFunSuite with BeforeAndAfter { - - private val SPARK_JARS = Seq( - "hdfs://localhost:9000/app/jars/jar1.jar", "file:///app/jars/jar2.jar") - private val SPARK_FILES = Seq( - "hdfs://localhost:9000/app/files/file1.txt", "file:///app/files/file2.txt") - private val JARS_DOWNLOAD_PATH = "/var/data/jars" - private val FILES_DOWNLOAD_PATH = "/var/data/files" - private val POD_LABEL = Map("bootstrap" -> "true") - private val INIT_CONTAINER_NAME = "init-container" - private val DRIVER_CONTAINER_NAME = "driver-container" - - @Mock - private var podAndInitContainerBootstrap : InitContainerBootstrap = _ - - before { - MockitoAnnotations.initMocks(this) - when(podAndInitContainerBootstrap.bootstrapInitContainer( - any[PodWithDetachedInitContainer])).thenAnswer(new Answer[PodWithDetachedInitContainer] { - override def answer(invocation: InvocationOnMock) : PodWithDetachedInitContainer = { - val pod = invocation.getArgumentAt(0, classOf[PodWithDetachedInitContainer]) - pod.copy( - pod = new PodBuilder(pod.pod) - .withNewMetadata() - .addToLabels("bootstrap", "true") - .endMetadata() - .withNewSpec().endSpec() - .build(), - initContainer = new ContainerBuilder() - .withName(INIT_CONTAINER_NAME) - .build(), - mainContainer = new ContainerBuilder() - .withName(DRIVER_CONTAINER_NAME) - .build() - )}}) - } - - test("additionalDriverSparkConf with mix of remote files and jars") { - val baseInitStep = new BasicInitContainerConfigurationStep( - SPARK_JARS, - SPARK_FILES, - JARS_DOWNLOAD_PATH, - FILES_DOWNLOAD_PATH, - podAndInitContainerBootstrap) - val expectedDriverSparkConf = Map( - JARS_DOWNLOAD_LOCATION.key -> JARS_DOWNLOAD_PATH, - FILES_DOWNLOAD_LOCATION.key -> FILES_DOWNLOAD_PATH, - 
INIT_CONTAINER_REMOTE_JARS.key -> "hdfs://localhost:9000/app/jars/jar1.jar", - INIT_CONTAINER_REMOTE_FILES.key -> "hdfs://localhost:9000/app/files/file1.txt") - val initContainerSpec = InitContainerSpec( - Map.empty[String, String], - Map.empty[String, String], - new Container(), - new Container(), - new Pod, - Seq.empty[HasMetadata]) - val returnContainerSpec = baseInitStep.configureInitContainer(initContainerSpec) - assert(expectedDriverSparkConf === returnContainerSpec.properties) - assert(returnContainerSpec.initContainer.getName === INIT_CONTAINER_NAME) - assert(returnContainerSpec.driverContainer.getName === DRIVER_CONTAINER_NAME) - assert(returnContainerSpec.driverPod.getMetadata.getLabels.asScala === POD_LABEL) - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala deleted file mode 100644 index 09b42e4484d86..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerConfigOrchestratorSuite.scala +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
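The expected INIT_CONTAINER_REMOTE_JARS/FILES values above contain only the hdfs:// entries: the init container only needs to fetch genuinely remote URIs, while file:// and local:// URIs are handled elsewhere. A sketch of that filter (hypothetical helper):

```scala
import java.net.URI

object RemoteDependencySketch {
  def remoteUris(uris: Seq[String]): Seq[String] =
    uris.filter { uri =>
      val scheme = Option(URI.create(uri).getScheme).getOrElse("file")
      scheme != "file" && scheme != "local"
    }

  def main(args: Array[String]): Unit = {
    // Matches the expected INIT_CONTAINER_REMOTE_JARS value in the test above.
    println(remoteUris(Seq(
      "hdfs://localhost:9000/app/jars/jar1.jar",
      "file:///app/jars/jar2.jar")).mkString(","))
  }
}
```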
- */ -package org.apache.spark.deploy.k8s.submit.steps.initcontainer - -import org.apache.spark.{SparkConf, SparkFunSuite} -import org.apache.spark.deploy.k8s.Config._ -import org.apache.spark.deploy.k8s.Constants._ - -class InitContainerConfigOrchestratorSuite extends SparkFunSuite { - - private val DOCKER_IMAGE = "init-container" - private val SPARK_JARS = Seq( - "hdfs://localhost:9000/app/jars/jar1.jar", "file:///app/jars/jar2.jar") - private val SPARK_FILES = Seq( - "hdfs://localhost:9000/app/files/file1.txt", "file:///app/files/file2.txt") - private val JARS_DOWNLOAD_PATH = "/var/data/jars" - private val FILES_DOWNLOAD_PATH = "/var/data/files" - private val DOCKER_IMAGE_PULL_POLICY: String = "IfNotPresent" - private val CUSTOM_LABEL_KEY = "customLabel" - private val CUSTOM_LABEL_VALUE = "customLabelValue" - private val INIT_CONTAINER_CONFIG_MAP_NAME = "spark-init-config-map" - private val INIT_CONTAINER_CONFIG_MAP_KEY = "spark-init-config-map-key" - private val SECRET_FOO = "foo" - private val SECRET_BAR = "bar" - private val SECRET_MOUNT_PATH = "/etc/secrets/init-container" - - test("including basic configuration step") { - val sparkConf = new SparkConf(true) - .set(CONTAINER_IMAGE, DOCKER_IMAGE) - .set(s"$KUBERNETES_DRIVER_LABEL_PREFIX$CUSTOM_LABEL_KEY", CUSTOM_LABEL_VALUE) - - val orchestrator = new InitContainerConfigOrchestrator( - SPARK_JARS.take(1), - SPARK_FILES, - JARS_DOWNLOAD_PATH, - FILES_DOWNLOAD_PATH, - DOCKER_IMAGE_PULL_POLICY, - INIT_CONTAINER_CONFIG_MAP_NAME, - INIT_CONTAINER_CONFIG_MAP_KEY, - sparkConf) - val initSteps = orchestrator.getAllConfigurationSteps - assert(initSteps.lengthCompare(1) == 0) - assert(initSteps.head.isInstanceOf[BasicInitContainerConfigurationStep]) - } - - test("including step to mount user-specified secrets") { - val sparkConf = new SparkConf(false) - .set(CONTAINER_IMAGE, DOCKER_IMAGE) - .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_FOO", SECRET_MOUNT_PATH) - .set(s"$KUBERNETES_DRIVER_SECRETS_PREFIX$SECRET_BAR", SECRET_MOUNT_PATH) - - val orchestrator = new InitContainerConfigOrchestrator( - SPARK_JARS.take(1), - SPARK_FILES, - JARS_DOWNLOAD_PATH, - FILES_DOWNLOAD_PATH, - DOCKER_IMAGE_PULL_POLICY, - INIT_CONTAINER_CONFIG_MAP_NAME, - INIT_CONTAINER_CONFIG_MAP_KEY, - sparkConf) - val initSteps = orchestrator.getAllConfigurationSteps - assert(initSteps.length === 2) - assert(initSteps.head.isInstanceOf[BasicInitContainerConfigurationStep]) - assert(initSteps(1).isInstanceOf[InitContainerMountSecretsStep]) - } -} diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala deleted file mode 100644 index 7ac0bde80dfe6..0000000000000 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/submit/steps/initcontainer/InitContainerMountSecretsStepSuite.scala +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
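The two orchestrator tests above pin down a simple selection rule: the basic init-container configuration step is always present, and the mount-secrets step is appended only when secret mounts are configured. A sketch of that rule, assuming "spark.kubernetes.driver.secrets." is the value behind KUBERNETES_DRIVER_SECRETS_PREFIX:

```scala
import org.apache.spark.SparkConf

object InitContainerStepSelectionSketch {
  sealed trait InitStep
  case object BasicStep extends InitStep
  case object MountSecretsStep extends InitStep

  // Assumed prefix; the tests above only reference it via KUBERNETES_DRIVER_SECRETS_PREFIX.
  private val SecretsPrefix = "spark.kubernetes.driver.secrets."

  def selectSteps(conf: SparkConf): Seq[InitStep] = {
    val hasSecretMounts = conf.getAll.exists { case (key, _) => key.startsWith(SecretsPrefix) }
    Seq(BasicStep) ++ (if (hasSecretMounts) Seq(MountSecretsStep) else Nil)
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(false).set(s"${SecretsPrefix}foo", "/etc/secrets/init-container")
    println(selectSteps(conf)) // List(BasicStep, MountSecretsStep)
  }
}
```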
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.deploy.k8s.submit.steps.initcontainer - -import io.fabric8.kubernetes.api.model.{ContainerBuilder, PodBuilder} - -import org.apache.spark.SparkFunSuite -import org.apache.spark.deploy.k8s.{MountSecretsBootstrap, SecretVolumeUtils} - -class InitContainerMountSecretsStepSuite extends SparkFunSuite { - - private val SECRET_FOO = "foo" - private val SECRET_BAR = "bar" - private val SECRET_MOUNT_PATH = "/etc/secrets/init-container" - - test("mounts all given secrets") { - val baseInitContainerSpec = InitContainerSpec( - Map.empty, - Map.empty, - new ContainerBuilder().build(), - new ContainerBuilder().build(), - new PodBuilder().withNewMetadata().endMetadata().withNewSpec().endSpec().build(), - Seq.empty) - val secretNamesToMountPaths = Map( - SECRET_FOO -> SECRET_MOUNT_PATH, - SECRET_BAR -> SECRET_MOUNT_PATH) - - val mountSecretsBootstrap = new MountSecretsBootstrap(secretNamesToMountPaths) - val initContainerMountSecretsStep = new InitContainerMountSecretsStep(mountSecretsBootstrap) - val configuredInitContainerSpec = initContainerMountSecretsStep.configureInitContainer( - baseInitContainerSpec) - val initContainerWithSecretsMounted = configuredInitContainerSpec.initContainer - - Seq(s"$SECRET_FOO-volume", s"$SECRET_BAR-volume").foreach(volumeName => - assert(SecretVolumeUtils.containerHasVolume( - initContainerWithSecretsMounted, volumeName, SECRET_MOUNT_PATH))) - } -} diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile deleted file mode 100644 index 491b7cf692478..0000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/Dockerfile +++ /dev/null @@ -1,52 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -FROM openjdk:8-alpine - -ARG spark_jars=jars -ARG img_path=kubernetes/dockerfiles - -# Before building the docker image, first build and make a Spark distribution following -# the instructions in http://spark.apache.org/docs/latest/building-spark.html. -# If this docker file is being used in the context of building your images from a Spark -# distribution, the docker build command should be invoked from the top level directory -# of the Spark distribution. E.g.: -# docker build -t spark:latest -f kubernetes/dockerfiles/spark/Dockerfile . 
- -RUN set -ex && \ - apk upgrade --no-cache && \ - apk add --no-cache bash tini libc6-compat && \ - mkdir -p /opt/spark && \ - mkdir -p /opt/spark/work-dir \ - touch /opt/spark/RELEASE && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd - -COPY ${spark_jars} /opt/spark/jars -COPY bin /opt/spark/bin -COPY sbin /opt/spark/sbin -COPY conf /opt/spark/conf -COPY ${img_path}/spark/entrypoint.sh /opt/ -COPY examples /opt/spark/examples -COPY data /opt/spark/data - -ENV SPARK_HOME /opt/spark - -WORKDIR /opt/spark/work-dir - -ENTRYPOINT [ "/opt/entrypoint.sh" ] diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh deleted file mode 100755 index 0c28c75857871..0000000000000 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# echo commands to the terminal output -set -ex - -# Check whether there is a passwd entry for the container UID -myuid=$(id -u) -mygid=$(id -g) -uidentry=$(getent passwd $myuid) - -# If there is no passwd entry for the container UID, attempt to create one -if [ -z "$uidentry" ] ; then - if [ -w /etc/passwd ] ; then - echo "$myuid:x:$myuid:$mygid:anonymous uid:$SPARK_HOME:/bin/false" >> /etc/passwd - else - echo "Container ENTRYPOINT failed to add passwd entry for anonymous UID" - fi -fi - -SPARK_K8S_CMD="$1" -if [ -z "$SPARK_K8S_CMD" ]; then - echo "No command to execute has been provided." 1>&2 - exit 1 -fi -shift 1 - -SPARK_CLASSPATH="$SPARK_CLASSPATH:${SPARK_HOME}/jars/*" -env | grep SPARK_JAVA_OPT_ | sed 's/[^=]*=\(.*\)/\1/g' > /tmp/java_opts.txt -readarray -t SPARK_DRIVER_JAVA_OPTS < /tmp/java_opts.txt -if [ -n "$SPARK_MOUNTED_CLASSPATH" ]; then - SPARK_CLASSPATH="$SPARK_CLASSPATH:$SPARK_MOUNTED_CLASSPATH" -fi -if [ -n "$SPARK_MOUNTED_FILES_DIR" ]; then - cp -R "$SPARK_MOUNTED_FILES_DIR/." . 
-fi - -case "$SPARK_K8S_CMD" in - driver) - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_DRIVER_JAVA_OPTS[@]}" - -cp "$SPARK_CLASSPATH" - -Xms$SPARK_DRIVER_MEMORY - -Xmx$SPARK_DRIVER_MEMORY - -Dspark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS - $SPARK_DRIVER_CLASS - $SPARK_DRIVER_ARGS - ) - ;; - - executor) - CMD=( - ${JAVA_HOME}/bin/java - "${SPARK_EXECUTOR_JAVA_OPTS[@]}" - -Xms$SPARK_EXECUTOR_MEMORY - -Xmx$SPARK_EXECUTOR_MEMORY - -cp "$SPARK_CLASSPATH" - org.apache.spark.executor.CoarseGrainedExecutorBackend - --driver-url $SPARK_DRIVER_URL - --executor-id $SPARK_EXECUTOR_ID - --cores $SPARK_EXECUTOR_CORES - --app-id $SPARK_APPLICATION_ID - --hostname $SPARK_EXECUTOR_POD_IP - ) - ;; - - init) - CMD=( - "$SPARK_HOME/bin/spark-class" - "org.apache.spark.deploy.k8s.SparkPodInitContainer" - "$@" - ) - ;; - - *) - echo "Unknown command: $SPARK_K8S_CMD" 1>&2 - exit 1 -esac - -# Execute the container CMD under tini for better hygiene -exec /sbin/tini -s -- "${CMD[@]}" From 277425d576e9457a9b8b68998b45296544aaa038 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 23 Jan 2018 15:16:11 +0000 Subject: [PATCH 351/356] need to write as int96 --- .../parquet/ParquetInteroperabilitySuite.scala | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index e3edafa9c84e1..6c87e9affd659 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -117,9 +117,13 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS "2006-06-06 06:06:06" ).map { s => java.sql.Timestamp.valueOf(s) } import testImplicits._ - // match the column names of the file from impala - val df = spark.createDataset(ts).toDF().repartition(1).withColumnRenamed("value", "ts") - df.write.parquet(tableDir.getAbsolutePath) + withSQLConf( + (SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key, + SQLConf.ParquetOutputTimestampType.INT96.toString)) { + // match the column names of the file from impala + val df = spark.createDataset(ts).toDF().repartition(1).withColumnRenamed("value", "ts") + df.write.parquet(tableDir.getAbsolutePath) + } FileUtils.copyFile(new File(impalaPath), new File(tableDir, "part-00001.parq")) Seq(false, true).foreach { int96TimestampConversion => From d37f4eeaa7d1e90aa3a1e48a88b3c0c4f13bb9b4 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 23 Jan 2018 15:20:51 +0000 Subject: [PATCH 352/356] we're smaller? 
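Patch 351 above wraps the test's write in withSQLConf so the timestamps are persisted as INT96 and therefore match the Impala-written file the suite compares against. Outside a test suite, the equivalent session-level setting would look roughly like this sketch (local master and output path are placeholders):

```scala
import org.apache.spark.sql.SparkSession

object Int96WriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("int96-write-sketch")
      .getOrCreate()
    import spark.implicits._

    // Same setting the test hunk applies via withSQLConf; INT96 keeps the on-disk layout
    // compatible with files written by Impala.
    spark.conf.set("spark.sql.parquet.outputTimestampType", "INT96")

    val df = Seq(java.sql.Timestamp.valueOf("2001-01-01 01:01:01")).toDF("ts")
    df.repartition(1).write.mode("overwrite").parquet("/tmp/int96-write-sketch") // placeholder path
    spark.stop()
  }
}
```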
--- .../sql/execution/columnar/InMemoryColumnarQuerySuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala index ff7c5e58e9863..295943e418e82 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/columnar/InMemoryColumnarQuerySuite.scala @@ -498,7 +498,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { case plan: InMemoryRelation => plan }.head // InMemoryRelation's stats is file size before the underlying RDD is materialized - assert(inMemoryRelation.computeStats().sizeInBytes === 740) + assert(inMemoryRelation.computeStats().sizeInBytes === 700) // InMemoryRelation's stats is updated after materializing RDD dfFromFile.collect() @@ -511,7 +511,7 @@ class InMemoryColumnarQuerySuite extends QueryTest with SharedSQLContext { // Even CBO enabled, InMemoryRelation's stats keeps as the file size before table's stats // is calculated - assert(inMemoryRelation2.computeStats().sizeInBytes === 740) + assert(inMemoryRelation2.computeStats().sizeInBytes === 700) // InMemoryRelation's stats should be updated after calculating stats of the table // clear cache to simulate a fresh environment From fab5e5a0b604b9853d9a7cbbbeace1195549c2e2 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 23 Jan 2018 18:26:13 +0000 Subject: [PATCH 353/356] no mesos and hive thrift --- dev/lint-java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/lint-java b/dev/lint-java index 08992f42e6e4c..58ee47ae85fef 100755 --- a/dev/lint-java +++ b/dev/lint-java @@ -20,7 +20,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname $SCRIPT_DIR)" -ERRORS=$($SCRIPT_DIR/../build/mvn -Phadoop-cloud -Pkubernetes -Pkinesis-asl -Pmesos -Pyarn -Phive -Phive-thriftserver checkstyle:check | grep ERROR) +ERRORS=$($SCRIPT_DIR/../build/mvn -Phadoop-cloud -Pkubernetes -Pkinesis-asl -Pyarn -Phive checkstyle:check | grep ERROR) if test ! -z "$ERRORS"; then echo -e "Checkstyle checks failed at following occurrences:\n$ERRORS" From bd4056d901a56bdaef65f91f05b33d90915d463e Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 24 Jan 2018 11:48:06 +0000 Subject: [PATCH 354/356] change test --- .../datasources/parquet/ParquetInteroperabilitySuite.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index 6c87e9affd659..e0bb869c3f14d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -188,7 +188,10 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS // when the data is read back as mentioned above, b/c int96 is unsigned. This // assert makes sure this holds even if we change parquet versions (if eg. there // were ever statistics even on unsigned columns). 
- assert(columnStats.isEmpty) + + // Note: This is not true in palantir/parquet-mr and statistics are always returned + // and they are always unsigned. + assert(!columnStats.isEmpty) } // These queries should return the entire dataset with the conversion applied, From 2cf672d7c4ee99a415c896ef334eff0d6822c957 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 24 Jan 2018 18:10:05 +0000 Subject: [PATCH 355/356] correct assert --- .../datasources/parquet/ParquetInteroperabilitySuite.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index e0bb869c3f14d..58d5946118ea1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -21,10 +21,10 @@ import java.io.File import org.apache.commons.io.FileUtils import org.apache.hadoop.fs.{FileSystem, Path, PathFilter} +import org.apache.parquet.CorruptStatistics import org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName - import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf @@ -191,7 +191,8 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS // Note: This is not true in palantir/parquet-mr and statistics are always returned // and they are always unsigned. - assert(!columnStats.isEmpty) + assert(!(oneFooter.getFileMetaData.getCreatedBy.contains("impala") ^ + columnStats.isEmpty)) } // These queries should return the entire dataset with the conversion applied, From ab93a5363c18836a04aef0b865431c0b1a26e063 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Thu, 25 Jan 2018 04:19:13 +0000 Subject: [PATCH 356/356] Update ParquetInteroperabilitySuite.scala --- .../datasources/parquet/ParquetInteroperabilitySuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index 58d5946118ea1..24044e66e8468 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -25,6 +25,7 @@ import org.apache.parquet.CorruptStatistics import org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName + import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf
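Patches 354-356 above relax the columnStats assertion because palantir/parquet-mr returns statistics for INT96 columns where upstream parquet-mr suppresses them. For context, a sketch of how such per-column statistics can be inspected from a Parquet footer; the file path is a placeholder and the readFooter call is assumed here as the simplest footer-reading API of that era:

```scala
import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER
import org.apache.parquet.hadoop.ParquetFileReader

object FooterStatisticsSketch {
  def printColumnStats(parquetFile: String): Unit = {
    val footer = ParquetFileReader.readFooter(new Configuration(), new Path(parquetFile), NO_FILTER)
    println(s"created by: ${footer.getFileMetaData.getCreatedBy}")
    footer.getBlocks.asScala.foreach { block =>
      block.getColumns.asScala.foreach { column =>
        // For INT96 timestamp columns, upstream parquet-mr suppresses min/max statistics
        // because the byte ordering is not meaningful; the patch above notes that
        // palantir/parquet-mr returns them, hence the changed assertion.
        println(s"${column.getPath} (${column.getType}) -> ${Option(column.getStatistics)}")
      }
    }
  }
}
```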