
Commit 8644ee8

Merge branch 'master' into iterator-to-disk
2 parents: 00c98e0 + 4d88030

433 files changed: +5136 additions, -2503 deletions


assembly/pom.xml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.9.0-incubating-SNAPSHOT</version>
+    <version>1.0.0-incubating-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

bagel/pom.xml

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.9.0-incubating-SNAPSHOT</version>
+    <version>1.0.0-incubating-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>

bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala

Lines changed: 33 additions & 23 deletions
@@ -28,21 +28,23 @@ object Bagel extends Logging {
   /**
    * Runs a Bagel program.
    * @param sc [[org.apache.spark.SparkContext]] to use for the program.
-   * @param vertices vertices of the graph represented as an RDD of (Key, Vertex) pairs. Often the Key will be
-   * the vertex id.
-   * @param messages initial set of messages represented as an RDD of (Key, Message) pairs. Often this will be an
-   * empty array, i.e. sc.parallelize(Array[K, Message]()).
-   * @param combiner [[org.apache.spark.bagel.Combiner]] combines multiple individual messages to a given vertex into one
-   * message before sending (which often involves network I/O).
-   * @param aggregator [[org.apache.spark.bagel.Aggregator]] performs a reduce across all vertices after each superstep,
-   * and provides the result to each vertex in the next superstep.
+   * @param vertices vertices of the graph represented as an RDD of (Key, Vertex) pairs. Often the
+   *                 Key will be the vertex id.
+   * @param messages initial set of messages represented as an RDD of (Key, Message) pairs. Often
+   *                 this will be an empty array, i.e. sc.parallelize(Array[K, Message]()).
+   * @param combiner [[org.apache.spark.bagel.Combiner]] combines multiple individual messages to a
+   *                 given vertex into one message before sending (which often involves network
+   *                 I/O).
+   * @param aggregator [[org.apache.spark.bagel.Aggregator]] performs a reduce across all vertices
+   *                   after each superstep and provides the result to each vertex in the next
+   *                   superstep.
    * @param partitioner [[org.apache.spark.Partitioner]] partitions values by key
    * @param numPartitions number of partitions across which to split the graph.
    *                      Default is the default parallelism of the SparkContext
-   * @param storageLevel [[org.apache.spark.storage.StorageLevel]] to use for caching of intermediate RDDs in each superstep.
-   * Defaults to caching in memory.
-   * @param compute function that takes a Vertex, optional set of (possibly combined) messages to the Vertex,
-   * optional Aggregator and the current superstep,
+   * @param storageLevel [[org.apache.spark.storage.StorageLevel]] to use for caching of
+   *                     intermediate RDDs in each superstep. Defaults to caching in memory.
+   * @param compute function that takes a Vertex, optional set of (possibly combined) messages to
+   *                the Vertex, optional Aggregator and the current superstep,
    * and returns a set of (Vertex, outgoing Messages) pairs
    * @tparam K key
    * @tparam V vertex type
@@ -71,7 +73,7 @@ object Bagel extends Logging {
     var msgs = messages
     var noActivity = false
     do {
-      logInfo("Starting superstep "+superstep+".")
+      logInfo("Starting superstep " + superstep + ".")
       val startTime = System.currentTimeMillis

       val aggregated = agg(verts, aggregator)
@@ -97,7 +99,8 @@ object Bagel extends Logging {
     verts
   }

-  /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the default storage level */
+  /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the default
+    * storage level */
   def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest](
       sc: SparkContext,
       vertices: RDD[(K, V)],
@@ -106,8 +109,8 @@ object Bagel extends Logging {
       partitioner: Partitioner,
       numPartitions: Int
     )(
-      compute: (V, Option[C], Int) => (V, Array[M])
-  ): RDD[(K, V)] = run(sc, vertices, messages, combiner, numPartitions, DEFAULT_STORAGE_LEVEL)(compute)
+      compute: (V, Option[C], Int) => (V, Array[M])): RDD[(K, V)] = run(sc, vertices, messages,
+      combiner, numPartitions, DEFAULT_STORAGE_LEVEL)(compute)

   /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] */
   def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest](
@@ -127,8 +130,8 @@ object Bagel extends Logging {
   }

   /**
-   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default [[org.apache.spark.HashPartitioner]]
-   * and default storage level
+   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default
+   * [[org.apache.spark.HashPartitioner]] and default storage level
    */
   def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest](
       sc: SparkContext,
@@ -138,9 +141,13 @@ object Bagel extends Logging {
       numPartitions: Int
     )(
       compute: (V, Option[C], Int) => (V, Array[M])
-  ): RDD[(K, V)] = run(sc, vertices, messages, combiner, numPartitions, DEFAULT_STORAGE_LEVEL)(compute)
+  ): RDD[(K, V)] = run(sc, vertices, messages, combiner, numPartitions,
+    DEFAULT_STORAGE_LEVEL)(compute)

-  /** Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the default [[org.apache.spark.HashPartitioner]]*/
+  /**
+   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]] and the
+   * default [[org.apache.spark.HashPartitioner]]
+   */
   def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest, C: Manifest](
       sc: SparkContext,
       vertices: RDD[(K, V)],
@@ -158,7 +165,8 @@ object Bagel extends Logging {
   }

   /**
-   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], default [[org.apache.spark.HashPartitioner]],
+   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]],
+   * default [[org.apache.spark.HashPartitioner]],
    * [[org.apache.spark.bagel.DefaultCombiner]] and the default storage level
    */
   def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest](
@@ -171,7 +179,8 @@ object Bagel extends Logging {
   ): RDD[(K, V)] = run(sc, vertices, messages, numPartitions, DEFAULT_STORAGE_LEVEL)(compute)

   /**
-   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]], the default [[org.apache.spark.HashPartitioner]]
+   * Runs a Bagel program with no [[org.apache.spark.bagel.Aggregator]],
+   * the default [[org.apache.spark.HashPartitioner]]
    * and [[org.apache.spark.bagel.DefaultCombiner]]
    */
   def run[K: Manifest, V <: Vertex : Manifest, M <: Message[K] : Manifest](
@@ -227,8 +236,9 @@ object Bagel extends Logging {
       })

       numMsgs += newMsgs.size
-      if (newVert.active)
+      if (newVert.active) {
         numActiveVerts += 1
+      }

       Some((newVert, newMsgs))
     }.persist(storageLevel)
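The reflowed scaladoc above documents the Bagel.run overloads. As a rough, hedged sketch of the simplest variant (no Aggregator, the default HashPartitioner and DefaultCombiner), a self-contained program might look like the following; the CountVertex and CountMessage classes and the three-superstep halting rule are illustrative inventions, not part of this commit:

import org.apache.spark.SparkContext
import org.apache.spark.bagel.{Bagel, Message, Vertex}

// Hypothetical vertex and message types, for illustration only.
class CountVertex(val value: Int, val active: Boolean) extends Vertex with Serializable
class CountMessage(val targetId: String, val value: Int) extends Message[String] with Serializable

object BagelSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "BagelSketch")

    // Vertices keyed by id, plus an initially empty message RDD,
    // matching the @param descriptions in the scaladoc above.
    val verts = sc.parallelize(Seq(
      ("a", new CountVertex(0, active = true)),
      ("b", new CountVertex(0, active = true))))
    val msgs = sc.parallelize(Array[(String, CountMessage)]())

    // Simplest overload: run(sc, vertices, messages, numPartitions)(compute).
    val result = Bagel.run(sc, verts, msgs, 2) {
      (v: CountVertex, inbox: Option[Array[CountMessage]], superstep: Int) =>
        // Count received messages and halt after three supersteps (arbitrary rule).
        val received = inbox.map(_.length).getOrElse(0)
        (new CountVertex(v.value + received, superstep < 3), Array[CountMessage]())
    }
    result.collect().foreach { case (id, v) => println(id + " -> " + v.value) }
    sc.stop()
  }
}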

bin/run-example2.cmd

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ if "x%SPARK_EXAMPLES_JAR%"=="x" (

 rem Compute Spark classpath using external script
 set DONT_PRINT_CLASSPATH=1
-call "%FWDIR%sbin\compute-classpath.cmd"
+call "%FWDIR%bin\compute-classpath.cmd"
 set DONT_PRINT_CLASSPATH=0
 set CLASSPATH=%SPARK_EXAMPLES_JAR%;%CLASSPATH%

bin/spark-shell

Lines changed: 42 additions & 6 deletions
@@ -21,8 +21,6 @@
 # Shell script for starting the Spark Shell REPL
 # Note that it will set MASTER to spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}
 # if those two env vars are set in spark-env.sh but MASTER is not.
-# Options:
-#    -c <cores>    Set the number of cores for REPL to use
 
 cygwin=false
 case "`uname`" in
@@ -32,14 +30,52 @@ esac
 # Enter posix mode for bash
 set -o posix
 
+CORE_PATTERN="^[0-9]+$"
+MEM_PATTERN="^[0-9]+[m|g|M|G]$"
+
 FWDIR="$(cd `dirname $0`/..; pwd)"
 
+if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
+  echo "Usage: spark-shell [OPTIONS]"
+  echo "OPTIONS:"
+  echo "-c --cores num, the maximum number of cores to be used by the spark shell"
+  echo "-em --execmem num[m|g], the memory used by each executor of spark shell"
+  echo "-dm --drivermem num[m|g], the memory used by the spark shell and driver"
+  echo "-h --help, print this help information"
+  exit
+fi
+
+SPARK_SHELL_OPTS=""
+
 for o in "$@"; do
   if [ "$1" = "-c" -o "$1" = "--cores" ]; then
     shift
-    if [ -n "$1" ]; then
-      OPTIONS="-Dspark.cores.max=$1"
+    if [[ "$1" =~ $CORE_PATTERN ]]; then
+      SPARK_SHELL_OPTS="$SPARK_SHELL_OPTS -Dspark.cores.max=$1"
       shift
+    else
+      echo "ERROR: wrong format for -c/--cores"
+      exit 1
+    fi
+  fi
+  if [ "$1" = "-em" -o "$1" = "--execmem" ]; then
+    shift
+    if [[ $1 =~ $MEM_PATTERN ]]; then
+      SPARK_SHELL_OPTS="$SPARK_SHELL_OPTS -Dspark.executor.memory=$1"
+      shift
+    else
+      echo "ERROR: wrong format for --execmem/-em"
+      exit 1
+    fi
+  fi
+  if [ "$1" = "-dm" -o "$1" = "--drivermem" ]; then
+    shift
+    if [[ $1 =~ $MEM_PATTERN ]]; then
+      export SPARK_MEM=$1
+      shift
+    else
+      echo "ERROR: wrong format for --drivermem/-dm"
+      exit 1
     fi
   fi
 done
@@ -95,10 +131,10 @@ if $cygwin; then
     # "Backspace sends ^H" setting in "Keys" section of the Mintty options
     # (see https://github.com/sbt/sbt/issues/562).
     stty -icanon min 1 -echo > /dev/null 2>&1
-    $FWDIR/bin/spark-class -Djline.terminal=unix $OPTIONS org.apache.spark.repl.Main "$@"
+    $FWDIR/bin/spark-class -Djline.terminal=unix $SPARK_SHELL_OPTS org.apache.spark.repl.Main "$@"
     stty icanon echo > /dev/null 2>&1
 else
-    $FWDIR/bin/spark-class $OPTIONS org.apache.spark.repl.Main "$@"
+    $FWDIR/bin/spark-class $SPARK_SHELL_OPTS org.apache.spark.repl.Main "$@"
 fi
 
 # record the exit status lest it be overwritten:
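Assuming the option parsing above, the shell would now be launched along the lines of ./bin/spark-shell -c 4 -em 2g -dm 1g: the core count must be a bare integer and the two memory arguments must match the num[m|g] pattern, otherwise the script exits with one of the ERROR messages instead of silently passing the value through.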

conf/metrics.properties.template

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@
 #   period    10        Poll period
 #   unit      seconds   Units of poll period
 #   ttl       1         TTL of messages sent by Ganglia
-#   mode      multicast Ganglia network mode ('unicast' or 'mulitcast')
+#   mode      multicast Ganglia network mode ('unicast' or 'multicast')
 
 # org.apache.spark.metrics.sink.JmxSink

conf/spark-env.sh.template

Lines changed: 1 addition & 0 deletions
@@ -19,3 +19,4 @@
 # - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT
 # - SPARK_WORKER_INSTANCES, to set the number of worker processes per node
 # - SPARK_WORKER_DIR, to set the working directory of worker processes
+# - SPARK_PUBLIC_DNS, to set the public dns name of the master

core/pom.xml

Lines changed: 23 additions & 10 deletions
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.9.0-incubating-SNAPSHOT</version>
+    <version>1.0.0-incubating-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
@@ -39,6 +39,12 @@
     <dependency>
       <groupId>net.java.dev.jets3t</groupId>
       <artifactId>jets3t</artifactId>
+      <exclusions>
+        <exclusion>
+          <groupId>commons-logging</groupId>
+          <artifactId>commons-logging</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <dependency>
       <groupId>org.apache.avro</groupId>
@@ -68,6 +74,22 @@
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>jul-to-slf4j</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>jcl-over-slf4j</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>log4j</groupId>
+      <artifactId>log4j</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-log4j12</artifactId>
+    </dependency>
     <dependency>
       <groupId>com.ning</groupId>
       <artifactId>compress-lzf</artifactId>
@@ -127,10 +149,6 @@
       <groupId>io.netty</groupId>
       <artifactId>netty-all</artifactId>
     </dependency>
-    <dependency>
-      <groupId>log4j</groupId>
-      <artifactId>log4j</artifactId>
-    </dependency>
     <dependency>
       <groupId>com.clearspring.analytics</groupId>
       <artifactId>stream</artifactId>
@@ -190,11 +208,6 @@
       <artifactId>junit-interface</artifactId>
       <scope>test</scope>
     </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
-      <scope>test</scope>
-    </dependency>
   </dependencies>
   <build>
     <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
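Taken together, these dependency changes appear to consolidate logging for core: commons-logging is excluded from jets3t and bridged back through jcl-over-slf4j, java.util.logging is routed to slf4j via jul-to-slf4j, and log4j with slf4j-log4j12 becomes a regular rather than test-only dependency, so slf4j backed by log4j is presumably the single logging path at runtime.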

core/src/main/scala/org/apache/spark/api/java/StorageLevels.java renamed to core/src/main/java/org/apache/spark/api/java/StorageLevels.java

Lines changed: 11 additions & 11 deletions
@@ -23,17 +23,17 @@
  * Expose some commonly useful storage level constants.
  */
 public class StorageLevels {
-  public static final StorageLevel NONE = new StorageLevel(false, false, false, 1);
-  public static final StorageLevel DISK_ONLY = new StorageLevel(true, false, false, 1);
-  public static final StorageLevel DISK_ONLY_2 = new StorageLevel(true, false, false, 2);
-  public static final StorageLevel MEMORY_ONLY = new StorageLevel(false, true, true, 1);
-  public static final StorageLevel MEMORY_ONLY_2 = new StorageLevel(false, true, true, 2);
-  public static final StorageLevel MEMORY_ONLY_SER = new StorageLevel(false, true, false, 1);
-  public static final StorageLevel MEMORY_ONLY_SER_2 = new StorageLevel(false, true, false, 2);
-  public static final StorageLevel MEMORY_AND_DISK = new StorageLevel(true, true, true, 1);
-  public static final StorageLevel MEMORY_AND_DISK_2 = new StorageLevel(true, true, true, 2);
-  public static final StorageLevel MEMORY_AND_DISK_SER = new StorageLevel(true, true, false, 1);
-  public static final StorageLevel MEMORY_AND_DISK_SER_2 = new StorageLevel(true, true, false, 2);
+  public static final StorageLevel NONE = create(false, false, false, 1);
+  public static final StorageLevel DISK_ONLY = create(true, false, false, 1);
+  public static final StorageLevel DISK_ONLY_2 = create(true, false, false, 2);
+  public static final StorageLevel MEMORY_ONLY = create(false, true, true, 1);
+  public static final StorageLevel MEMORY_ONLY_2 = create(false, true, true, 2);
+  public static final StorageLevel MEMORY_ONLY_SER = create(false, true, false, 1);
+  public static final StorageLevel MEMORY_ONLY_SER_2 = create(false, true, false, 2);
+  public static final StorageLevel MEMORY_AND_DISK = create(true, true, true, 1);
+  public static final StorageLevel MEMORY_AND_DISK_2 = create(true, true, true, 2);
+  public static final StorageLevel MEMORY_AND_DISK_SER = create(true, true, false, 1);
+  public static final StorageLevel MEMORY_AND_DISK_SER_2 = create(true, true, false, 2);
 
   /**
    * Create a new StorageLevel object.
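The constants now go through the create factory declared just below this hunk instead of calling the StorageLevel constructor directly; judging by the constant names, the boolean arguments keep the same (useDisk, useMemory, deserialized, replication) meaning, so the change presumably keeps the Java-facing API on a single construction path.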
