
Commit 4a73bed

[SPARK-29991][INFRA] Support Hive 1.2 and Hive 2.3 (default) in PR builder
### What changes were proposed in this pull request?

Currently, the Apache Spark PR builder uses `hive-1.2` for `hadoop-2.7` and `hive-2.3` for `hadoop-3.2`. This PR aims to:

- support `[test-hive1.2]` in the PR builder;
- support `[test-hive2.3]` in the PR builder, to be consistent with and independent of the default profile. After this PR, all PR builders use Hive 2.3 by default (because Spark uses Hive 2.3 by default as of c98e5eb);
- use the default profile in the AppVeyor build.

Note that this was previously reverted due to an unexpected test failure in `ThriftServerPageSuite`, which was investigated in #26706. This PR fixes that by letting the suite run in its own forked JVM. There is no explicit evidence for this fix; it was my speculation, but it does fix the failure.

### Why are the changes needed?

The new tags allow us more flexibility.

### Does this PR introduce any user-facing change?

No. (This is a dev-only change.)

### How was this patch tested?

Check the Jenkins triggers in this PR.

Default:

```
========================================================================
Building Spark
========================================================================
[info] Building Spark using SBT with these arguments: -Phadoop-2.7 -Phive-2.3 -Phive-thriftserver -Pmesos -Pspark-ganglia-lgpl -Phadoop-cloud -Phive -Pkubernetes -Pkinesis-asl -Pyarn test:package streaming-kinesis-asl-assembly/assembly
```

`[test-hive1.2][test-hadoop3.2]`:

```
========================================================================
Building Spark
========================================================================
[info] Building Spark using SBT with these arguments: -Phadoop-3.2 -Phive-1.2 -Phadoop-cloud -Pyarn -Pspark-ganglia-lgpl -Phive -Phive-thriftserver -Pmesos -Pkubernetes -Pkinesis-asl test:package streaming-kinesis-asl-assembly/assembly
```

`[test-maven][test-hive-2.3]`:

```
========================================================================
Building Spark
========================================================================
[info] Building Spark using Maven with these arguments: -Phadoop-2.7 -Phive-2.3 -Pspark-ganglia-lgpl -Pyarn -Phive -Phadoop-cloud -Pkinesis-asl -Pmesos -Pkubernetes -Phive-thriftserver clean package -DskipTests
```

Closes #26710 from HyukjinKwon/SPARK-29991.

Authored-by: HyukjinKwon <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
1 parent b182ed8 commit 4a73bed

File tree

4 files changed, +58 -30 lines changed


appveyor.yml

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ install:
 build_script:
   # '-Djna.nosys=true' is required to avoid kernel32.dll load failure.
   # See SPARK-28759.
-  - cmd: mvn -DskipTests -Psparkr -Phive -Phive-1.2 -Djna.nosys=true package
+  - cmd: mvn -DskipTests -Psparkr -Phive -Djna.nosys=true package

 environment:
   NOT_CRAN: true

dev/run-tests-jenkins.py

Lines changed: 5 additions & 0 deletions
@@ -182,6 +182,11 @@ def main():
         os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.7"
     if "test-hadoop3.2" in ghprb_pull_title:
         os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop3.2"
+    # Switch the Hive profile based on the PR title:
+    if "test-hive1.2" in ghprb_pull_title:
+        os.environ["AMPLAB_JENKINS_BUILD_HIVE_PROFILE"] = "hive1.2"
+    if "test-hive2.3" in ghprb_pull_title:
+        os.environ["AMPLAB_JENKINS_BUILD_HIVE_PROFILE"] = "hive2.3"

     build_display_name = os.environ["BUILD_DISPLAY_NAME"]
     build_url = os.environ["BUILD_URL"]
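
For illustration, here is a minimal, self-contained sketch (not part of the commit) of how these title tags are meant to flow into the Hive profile choice: the Jenkins wrapper only exports `AMPLAB_JENKINS_BUILD_HIVE_PROFILE`, and `dev/run-tests.py` (below) falls back to `hive2.3` when no tag is present. The helper name `pick_hive_profile` is hypothetical and only mirrors the logic added above.

```python
import os

def pick_hive_profile(ghprb_pull_title):
    # Hypothetical helper mirroring the tag handling added above; it is not
    # part of the commit. The Jenkins wrapper only exports an environment variable.
    if "test-hive1.2" in ghprb_pull_title:
        os.environ["AMPLAB_JENKINS_BUILD_HIVE_PROFILE"] = "hive1.2"
    if "test-hive2.3" in ghprb_pull_title:
        os.environ["AMPLAB_JENKINS_BUILD_HIVE_PROFILE"] = "hive2.3"
    # dev/run-tests.py reads the variable back with a hive2.3 default.
    return os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE", "hive2.3")

print(pick_hive_profile("[SPARK-29991][INFRA] Some PR title"))                # hive2.3 (default)
print(pick_hive_profile("[SPARK-29991][INFRA][test-hive1.2] Some PR title"))  # hive1.2
```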

dev/run-tests.py

Lines changed: 50 additions & 28 deletions
@@ -179,7 +179,8 @@ def run_apache_rat_checks():
     run_cmd([os.path.join(SPARK_HOME, "dev", "check-license")])


-def run_scala_style_checks(build_profiles):
+def run_scala_style_checks(extra_profiles):
+    build_profiles = extra_profiles + modules.root.build_profile_flags
     set_title_and_block("Running Scala style checks", "BLOCK_SCALA_STYLE")
     profiles = " ".join(build_profiles)
     print("[info] Checking Scala style using SBT with these profiles: ", profiles)
@@ -283,8 +284,8 @@ def get_hadoop_profiles(hadoop_version):
     """

     sbt_maven_hadoop_profiles = {
-        "hadoop2.7": ["-Phadoop-2.7", "-Phive-1.2"],
-        "hadoop3.2": ["-Phadoop-3.2", "-Phive-2.3"],
+        "hadoop2.7": ["-Phadoop-2.7"],
+        "hadoop3.2": ["-Phadoop-3.2"],
     }

     if hadoop_version in sbt_maven_hadoop_profiles:
@@ -295,9 +296,28 @@ def get_hadoop_profiles(hadoop_version):
         sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))


-def build_spark_maven(hadoop_version):
+def get_hive_profiles(hive_version):
+    """
+    For the given Hive version tag, return a list of Maven/SBT profile flags for
+    building and testing against that Hive version.
+    """
+
+    sbt_maven_hive_profiles = {
+        "hive1.2": ["-Phive-1.2"],
+        "hive2.3": ["-Phive-2.3"],
+    }
+
+    if hive_version in sbt_maven_hive_profiles:
+        return sbt_maven_hive_profiles[hive_version]
+    else:
+        print("[error] Could not find", hive_version, "in the list. Valid options",
+              " are", sbt_maven_hive_profiles.keys())
+        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
+
+
+def build_spark_maven(extra_profiles):
     # Enable all of the profiles for the build:
-    build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
+    build_profiles = extra_profiles + modules.root.build_profile_flags
     mvn_goals = ["clean", "package", "-DskipTests"]
     profiles_and_goals = build_profiles + mvn_goals

@@ -306,9 +326,9 @@ def build_spark_maven(hadoop_version):
     exec_maven(profiles_and_goals)


-def build_spark_sbt(hadoop_version):
+def build_spark_sbt(extra_profiles):
     # Enable all of the profiles for the build:
-    build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
+    build_profiles = extra_profiles + modules.root.build_profile_flags
     sbt_goals = ["test:package",  # Build test jars as some tests depend on them
                  "streaming-kinesis-asl-assembly/assembly"]
     profiles_and_goals = build_profiles + sbt_goals
@@ -318,10 +338,10 @@ def build_spark_sbt(hadoop_version):
     exec_sbt(profiles_and_goals)


-def build_spark_unidoc_sbt(hadoop_version):
+def build_spark_unidoc_sbt(extra_profiles):
     set_title_and_block("Building Unidoc API Documentation", "BLOCK_DOCUMENTATION")
     # Enable all of the profiles for the build:
-    build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
+    build_profiles = extra_profiles + modules.root.build_profile_flags
     sbt_goals = ["unidoc"]
     profiles_and_goals = build_profiles + sbt_goals

@@ -331,9 +351,9 @@ def build_spark_unidoc_sbt(hadoop_version):
     exec_sbt(profiles_and_goals)


-def build_spark_assembly_sbt(hadoop_version, checkstyle=False):
+def build_spark_assembly_sbt(extra_profiles, checkstyle=False):
     # Enable all of the profiles for the build:
-    build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
+    build_profiles = extra_profiles + modules.root.build_profile_flags
     sbt_goals = ["assembly/package"]
     profiles_and_goals = build_profiles + sbt_goals
     print("[info] Building Spark assembly using SBT with these arguments: ",
@@ -343,25 +363,25 @@ def build_spark_assembly_sbt(hadoop_version, checkstyle=False):
     if checkstyle:
         run_java_style_checks(build_profiles)

-    build_spark_unidoc_sbt(hadoop_version)
+    build_spark_unidoc_sbt(extra_profiles)


-def build_apache_spark(build_tool, hadoop_version):
-    """Will build Spark against Hive v1.2.1 given the passed in build tool (either `sbt` or
-    `maven`). Defaults to using `sbt`."""
+def build_apache_spark(build_tool, extra_profiles):
+    """Will build Spark with the extra profiles and the passed in build tool
+    (either `sbt` or `maven`). Defaults to using `sbt`."""

     set_title_and_block("Building Spark", "BLOCK_BUILD")

     rm_r("lib_managed")

     if build_tool == "maven":
-        build_spark_maven(hadoop_version)
+        build_spark_maven(extra_profiles)
     else:
-        build_spark_sbt(hadoop_version)
+        build_spark_sbt(extra_profiles)


-def detect_binary_inop_with_mima(hadoop_version):
-    build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
+def detect_binary_inop_with_mima(extra_profiles):
+    build_profiles = extra_profiles + modules.root.build_profile_flags
     set_title_and_block("Detecting binary incompatibilities with MiMa", "BLOCK_MIMA")
     profiles = " ".join(build_profiles)
     print("[info] Detecting binary incompatibilities with MiMa using SBT with these profiles: ",
@@ -395,14 +415,14 @@ def run_scala_tests_sbt(test_modules, test_profiles):
     exec_sbt(profiles_and_goals)


-def run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags):
+def run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags):
     """Function to properly execute all tests passed in as a set from the
     `determine_test_suites` function"""
     set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS")

     test_modules = set(test_modules)

-    test_profiles = get_hadoop_profiles(hadoop_version) + \
+    test_profiles = extra_profiles + \
         list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules)))

     if excluded_tags:
@@ -555,6 +575,7 @@ def main():
         # to reflect the environment settings
         build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
         hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.7")
+        hive_version = os.environ.get("AMPLAB_JENKINS_BUILD_HIVE_PROFILE", "hive2.3")
         test_env = "amplab_jenkins"
         # add path for Python3 in Jenkins if we're calling from a Jenkins machine
         # TODO(sknapp): after all builds are ported to the ubuntu workers, change this to be:
@@ -564,10 +585,12 @@ def main():
         # else we're running locally and can use local settings
         build_tool = "sbt"
         hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
+        hive_version = os.environ.get("HIVE_PROFILE", "hive2.3")
         test_env = "local"

     print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
-          "under environment", test_env)
+          "and Hive profile", hive_version, "under environment", test_env)
+    extra_profiles = get_hadoop_profiles(hadoop_version) + get_hive_profiles(hive_version)

     changed_modules = None
     changed_files = None
@@ -601,8 +624,7 @@ def main():
     if not changed_files or any(f.endswith(".scala")
                                 or f.endswith("scalastyle-config.xml")
                                 for f in changed_files):
-        build_profiles = get_hadoop_profiles(hadoop_version) + modules.root.build_profile_flags
-        run_scala_style_checks(build_profiles)
+        run_scala_style_checks(extra_profiles)
     should_run_java_style_checks = False
     if not changed_files or any(f.endswith(".java")
                                 or f.endswith("checkstyle.xml")
@@ -630,18 +652,18 @@ def main():
         run_build_tests()

     # spark build
-    build_apache_spark(build_tool, hadoop_version)
+    build_apache_spark(build_tool, extra_profiles)

     # backwards compatibility checks
     if build_tool == "sbt":
         # Note: compatibility tests only supported in sbt for now
-        detect_binary_inop_with_mima(hadoop_version)
+        detect_binary_inop_with_mima(extra_profiles)
         # Since we did not build assembly/package before running dev/mima, we need to
         # do it here because the tests still rely on it; see SPARK-13294 for details.
-        build_spark_assembly_sbt(hadoop_version, should_run_java_style_checks)
+        build_spark_assembly_sbt(extra_profiles, should_run_java_style_checks)

     # run the test suites
-    run_scala_tests(build_tool, hadoop_version, test_modules, excluded_tags)
+    run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags)

     modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
     if modules_with_python_tests:
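
To make the resulting flags concrete, here is a small standalone sketch (assuming the version-to-flag mappings shown in the diff above) of how `extra_profiles` is composed once Hadoop and Hive are selected independently. The helper `extra_profiles_for` is hypothetical; in the script itself the composition happens in `main()` via `get_hadoop_profiles(...) + get_hive_profiles(...)`.

```python
# Standalone sketch of the profile composition; the dictionaries copy the
# mappings from the diff above, while extra_profiles_for is only an
# illustrative stand-in for the composition done in main().
sbt_maven_hadoop_profiles = {
    "hadoop2.7": ["-Phadoop-2.7"],
    "hadoop3.2": ["-Phadoop-3.2"],
}
sbt_maven_hive_profiles = {
    "hive1.2": ["-Phive-1.2"],
    "hive2.3": ["-Phive-2.3"],
}

def extra_profiles_for(hadoop_version, hive_version):
    # Any Hadoop/Hive combination is now valid, since the Hive flag is no
    # longer baked into the Hadoop mapping.
    return sbt_maven_hadoop_profiles[hadoop_version] + sbt_maven_hive_profiles[hive_version]

# Default PR builder configuration after this change:
print(extra_profiles_for("hadoop2.7", "hive2.3"))  # ['-Phadoop-2.7', '-Phive-2.3']
# The [test-hive1.2][test-hadoop3.2] combination from the test section:
print(extra_profiles_for("hadoop3.2", "hive1.2"))  # ['-Phadoop-3.2', '-Phive-1.2']
```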

project/SparkBuild.scala

Lines changed: 2 additions & 1 deletion
@@ -476,7 +476,8 @@ object SparkParallelTestGrouping {
     "org.apache.spark.ml.classification.LinearSVCSuite",
     "org.apache.spark.sql.SQLQueryTestSuite",
     "org.apache.spark.sql.hive.thriftserver.ThriftServerQueryTestSuite",
-    "org.apache.spark.sql.hive.thriftserver.SparkSQLEnvSuite"
+    "org.apache.spark.sql.hive.thriftserver.SparkSQLEnvSuite",
+    "org.apache.spark.sql.hive.thriftserver.ui.ThriftServerPageSuite"
   )

   private val DEFAULT_TEST_GROUP = "default_test_group"
