
Commit ae1d39b

Merge remote-tracking branch 'apache-github/master' into graceful-shutdown

Conflicts:
    streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
    streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
    streaming/src/main/scala/org/apache/spark/streaming/dstream/NetworkInputDStream.scala
    streaming/src/main/scala/org/apache/spark/streaming/scheduler/NetworkInputTracker.scala

2 parents: 6b59cfc + 0307db0

449 files changed: +13,295 additions, -2,594 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -47,3 +47,4 @@ spark-*-bin.tar.gz
 unit-tests.log
 /lib/
 rat-results.txt
+scalastyle.txt

.rat-excludes

Lines changed: 1 addition & 0 deletions
@@ -39,3 +39,4 @@ work
 .*\.q
 golden
 test.out/*
+.*iml

.travis.yml

Lines changed: 3 additions & 8 deletions
@@ -20,18 +20,13 @@
     - oraclejdk7
   env:
     matrix:
-      - TEST=sql/test
+      - TEST="scalastyle assembly/assembly"
+      - TEST="catalyst/test sql/test streaming/test mllib/test graphx/test bagel/test"
       - TEST=hive/test
-      - TEST=catalyst/test
-      - TEST=streaming/test
-      - TEST=graphx/test
-      - TEST=mllib/test
-      - TEST=graphx/test
-      - TEST=bagel/test
   cache:
     directories:
       - $HOME/.m2
       - $HOME/.ivy2
       - $HOME/.sbt
   script:
-    - "sbt ++$TRAVIS_SCALA_VERSION scalastyle $TEST"
+    - "sbt ++$TRAVIS_SCALA_VERSION $TEST"

assembly/pom.xml

Lines changed: 11 additions & 1 deletion
@@ -163,6 +163,16 @@
         </dependency>
       </dependencies>
     </profile>
+    <profile>
+      <id>hive</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.spark</groupId>
+          <artifactId>spark-hive_${scala.binary.version}</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
     <profile>
       <id>spark-ganglia-lgpl</id>
       <dependencies>
@@ -208,7 +218,7 @@
         <plugin>
           <groupId>org.codehaus.mojo</groupId>
           <artifactId>buildnumber-maven-plugin</artifactId>
-          <version>1.1</version>
+          <version>1.2</version>
           <executions>
             <execution>
               <phase>validate</phase>
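Note: the new hive profile is opt-in, so a Maven build would presumably only fold spark-hive into the assembly when the profile is activated explicitly, along these lines (standard Maven flags, shown as an illustration rather than the project's documented build command):

    mvn -Phive -DskipTests clean package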

bin/compute-classpath.sh

Lines changed: 19 additions & 17 deletions
@@ -30,22 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
 # Build up classpath
 CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
 
-# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
-# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
-# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
-# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
-# the future.
-if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
-  echo "Hive assembly found, including hive support.  If this isn't desired run sbt hive/clean."
-
-  # Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
-  DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
-  CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
-
-  ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
-else
-  ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
-fi
+ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"
 
 # First check if we have a dependencies jar. If so, include binary classes with the deps jar
 if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
@@ -60,7 +45,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
 
-  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
+  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
   CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory
@@ -72,6 +57,23 @@ else
   CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
 fi
 
+# When Hive support is needed, Datanucleus jars must be included on the classpath.
+# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
+# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
+# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
+# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
+# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
+num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l)
+if [ $num_datanucleus_jars -gt 0 ]; then
+  AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
+  num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
+  if [ $num_hive_files -gt 0 ]; then
+    echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
+    DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
+    CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
+  fi
+fi
+
 # Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
 if [[ $SPARK_TESTING == 1 ]]; then
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes"

bin/load-spark-env.sh

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,9 @@ if [ -z "$SPARK_ENV_LOADED" ]; then
   use_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"}
 
   if [ -f "${use_conf_dir}/spark-env.sh" ]; then
+    # Promote all variable declarations to environment (exported) variables
+    set -a
     . "${use_conf_dir}/spark-env.sh"
+    set +a
   fi
 fi
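Note: set -a enables the shell's allexport option, so every variable assigned while it is active is exported to child processes even without an explicit export; set +a turns it back off. A minimal sketch of the effect (variable names chosen for illustration):

    set -a
    SPARK_WORKER_CORES=4            # exported automatically while allexport is on
    set +a
    SOME_LOCAL_VAR=1                # assigned after 'set +a', so it stays shell-local
    env | grep SPARK_WORKER_CORES   # the first variable shows up in the environment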

bin/pyspark

Lines changed: 2 additions & 1 deletion
@@ -55,7 +55,8 @@ if [ -n "$IPYTHON_OPTS" ]; then
   IPYTHON=1
 fi
 
-if [[ "$IPYTHON" = "1" ]] ; then
+# Only use ipython if no command line arguments were provided [SPARK-1134]
+if [[ "$IPYTHON" = "1" && $# = 0 ]] ; then
   exec ipython $IPYTHON_OPTS
 else
   exec "$PYSPARK_PYTHON" "$@"

bin/spark-class

Lines changed: 0 additions & 2 deletions
@@ -154,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
 fi
 
 exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-
-

bin/spark-shell

Lines changed: 168 additions & 58 deletions
@@ -30,67 +30,189 @@ esac
 # Enter posix mode for bash
 set -o posix
 
-CORE_PATTERN="^[0-9]+$"
-MEM_PATTERN="^[0-9]+[m|g|M|G]$"
-
+## Global script variables
 FWDIR="$(cd `dirname $0`/..; pwd)"
 
-if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
-  echo "Usage: spark-shell [OPTIONS]"
-  echo "OPTIONS:"
-  echo "-c --cores num, the maximum number of cores to be used by the spark shell"
-  echo "-em --execmem num[m|g], the memory used by each executor of spark shell"
-  echo "-dm --drivermem num[m|g], the memory used by the spark shell and driver"
-  echo "-h --help, print this help information"
-  exit
-fi
+SPARK_REPL_OPTS="${SPARK_REPL_OPTS:-""}"
+DEFAULT_MASTER="local[*]"
+MASTER=${MASTER:-""}
+
+info_log=0
+
+# CLI Color Templates
+txtund=$(tput sgr 0 1)          # Underline
+txtbld=$(tput bold)             # Bold
+bldred=${txtbld}$(tput setaf 1) # red
+bldyel=${txtbld}$(tput setaf 3) # yellow
+bldblu=${txtbld}$(tput setaf 4) # blue
+bldwht=${txtbld}$(tput setaf 7) # white
+txtrst=$(tput sgr0)             # Reset
+info=${bldwht}*${txtrst}        # Feedback
+pass=${bldblu}*${txtrst}
+warn=${bldred}*${txtrst}
+ques=${bldblu}?${txtrst}
+
+# Helper function to describe the script usage
+function usage() {
+  cat << EOF
+${txtbld}Usage${txtrst}: spark-shell [OPTIONS]
+
+${txtbld}OPTIONS${txtrst}:
+    -h  --help              : Print this help information.
+    -c  --cores             : The maximum number of cores to be used by the Spark Shell.
+    -em --executor-memory   : The memory used by each executor of the Spark Shell, the number
+                              is followed by m for megabytes or g for gigabytes, e.g. "1g".
+    -dm --driver-memory     : The memory used by the Spark Shell, the number is followed
+                              by m for megabytes or g for gigabytes, e.g. "1g".
+    -m  --master            : A full string that describes the Spark Master, defaults to "local[*]",
+                              e.g. "spark://localhost:7077".
+    --log-conf              : Enables logging of the supplied SparkConf as INFO at start of the
+                              Spark Context.
+
+e.g.
+    spark-shell -m spark://localhost:7077 -c 4 -dm 512m -em 2g
+
+EOF
+}
+
+function out_error(){
+  echo -e "${txtund}${bldred}ERROR${txtrst}: $1"
+  usage
+  exit 1
+}
+
+function log_info(){
+  [ $info_log -eq 1 ] && echo -e "${bldyel}INFO${txtrst}: $1"
+}
+
+function log_warn(){
+  echo -e "${txtund}${bldyel}WARN${txtrst}: $1"
+}
 
-for o in "$@"; do
-  if [ "$1" = "-c" -o "$1" = "--cores" ]; then
-    shift
+# PATTERNS used to validate more than one optional arg.
+ARG_FLAG_PATTERN="^-"
+MEM_PATTERN="^[0-9]+[m|g|M|G]$"
+NUM_PATTERN="^[0-9]+$"
+PORT_PATTERN="^[0-9]+$"
+
+# Setters for optional args.
+function set_cores(){
+  CORE_PATTERN="^[0-9]+$"
   if [[ "$1" =~ $CORE_PATTERN ]]; then
-    SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.cores.max=$1"
-    shift
+    SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.cores.max=$1"
   else
-    echo "ERROR: wrong format for -c/--cores"
-    exit 1
+    out_error "wrong format for $2"
   fi
-  fi
-  if [ "$1" = "-em" -o "$1" = "--execmem" ]; then
-    shift
+}
+
+function set_em(){
   if [[ $1 =~ $MEM_PATTERN ]]; then
     SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.executor.memory=$1"
-    shift
   else
-    echo "ERROR: wrong format for --execmem/-em"
-    exit 1
+    out_error "wrong format for $2"
  fi
-  fi
-  if [ "$1" = "-dm" -o "$1" = "--drivermem" ]; then
-    shift
+}
+
+function set_dm(){
  if [[ $1 =~ $MEM_PATTERN ]]; then
    export SPARK_DRIVER_MEMORY=$1
-    shift
  else
-    echo "ERROR: wrong format for --drivermem/-dm"
-    exit 1
+    out_error "wrong format for $2"
  fi
-  fi
-done
+}
+
+function set_spark_log_conf(){
+  SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.logConf=$1"
+}
 
-# Set MASTER from spark-env if possible
-DEFAULT_SPARK_MASTER_PORT=7077
-if [ -z "$MASTER" ]; then
-  . $FWDIR/bin/load-spark-env.sh
-  if [ "x" != "x$SPARK_MASTER_IP" ]; then
-    if [ "y" != "y$SPARK_MASTER_PORT" ]; then
-      SPARK_MASTER_PORT="${SPARK_MASTER_PORT}"
+function set_spark_master(){
+  if ! [[ "$1" =~ $ARG_FLAG_PATTERN ]]; then
+    export MASTER="$1"
  else
-      SPARK_MASTER_PORT=$DEFAULT_SPARK_MASTER_PORT
+    out_error "wrong format for $2"
+  fi
+}
+
+function resolve_spark_master(){
+  # Set MASTER from spark-env if possible
+  DEFAULT_SPARK_MASTER_PORT=7077
+  if [ -z "$MASTER" ]; then
+    . $FWDIR/bin/load-spark-env.sh
+    if [ -n "$SPARK_MASTER_IP" ]; then
+      SPARK_MASTER_PORT="${SPARK_MASTER_PORT:-"$DEFAULT_SPARK_MASTER_PORT"}"
+      export MASTER="spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}"
+    fi
+  fi
+
+  if [ -z "$MASTER" ]; then
+    export MASTER="$DEFAULT_MASTER"
  fi
-    export MASTER="spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}"
-  fi
-fi
+
+}
+
+function main(){
+  log_info "Base Directory set to $FWDIR"
+
+  resolve_spark_master
+  log_info "Spark Master is $MASTER"
+
+  log_info "Spark REPL options $SPARK_REPL_OPTS"
+  if $cygwin; then
+    # Workaround for issue involving JLine and Cygwin
+    # (see http://sourceforge.net/p/jline/bugs/40/).
+    # If you're using the Mintty terminal emulator in Cygwin, may need to set the
+    # "Backspace sends ^H" setting in "Keys" section of the Mintty options
+    # (see https://github.com/sbt/sbt/issues/562).
+    stty -icanon min 1 -echo > /dev/null 2>&1
+    export SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Djline.terminal=unix"
+    $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
+    stty icanon echo > /dev/null 2>&1
+  else
+    export SPARK_REPL_OPTS
+    $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
+  fi
+}
+
+for option in "$@"
+do
+  case $option in
+    -h | --help )
+      usage
+      exit 1
+      ;;
+    -c | --cores)
+      shift
+      _1=$1
+      shift
+      set_cores $_1 "-c/--cores"
+      ;;
+    -em | --executor-memory)
+      shift
+      _1=$1
+      shift
+      set_em $_1 "-em/--executor-memory"
+      ;;
+    -dm | --driver-memory)
+      shift
+      _1=$1
+      shift
+      set_dm $_1 "-dm/--driver-memory"
+      ;;
+    -m | --master)
+      shift
+      _1=$1
+      shift
+      set_spark_master $_1 "-m/--master"
      ;;
+    --log-conf)
+      shift
+      set_spark_log_conf "true"
+      info_log=1
+      ;;
+    ?)
+      ;;
+  esac
+done
 
 # Copy restore-TTY-on-exit functions from Scala script so spark-shell exits properly even in
 # binary distribution of Spark where Scala is not installed
@@ -120,22 +242,10 @@ if [[ ! $? ]]; then
   saved_stty=""
 fi
 
-if $cygwin; then
-  # Workaround for issue involving JLine and Cygwin
-  # (see http://sourceforge.net/p/jline/bugs/40/).
-  # If you're using the Mintty terminal emulator in Cygwin, may need to set the
-  # "Backspace sends ^H" setting in "Keys" section of the Mintty options
-  # (see https://github.com/sbt/sbt/issues/562).
-  stty -icanon min 1 -echo > /dev/null 2>&1
-  export SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Djline.terminal=unix"
-  $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
-  stty icanon echo > /dev/null 2>&1
-else
-  export SPARK_REPL_OPTS
-  $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
-fi
+main
 
 # record the exit status lest it be overwritten:
 # then reenable echo and propagate the code.
 exit_status=$?
 onExit
+
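Note: pulling the new flags together, the invocation from the help text's own example (with --log-conf appended to show the logging switch) would be:

    ./bin/spark-shell -m spark://localhost:7077 -c 4 -dm 512m -em 2g --log-conf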