Commit 64f72ad

Merge branch 'master' into SPARK-6263

2 parents c728046 + c2f0821

758 files changed, +69212 -7116 lines changed


.rat-excludes

Lines changed: 7 additions & 0 deletions

@@ -15,6 +15,7 @@ TAGS
 RELEASE
 control
 docs
+docker.properties.template
 fairscheduler.xml.template
 spark-defaults.conf.template
 log4j.properties
@@ -29,7 +30,13 @@ spark-env.sh.template
 log4j-defaults.properties
 bootstrap-tooltip.js
 jquery-1.11.1.min.js
+d3.min.js
+dagre-d3.min.js
+graphlib-dot.min.js
 sorttable.js
+vis.min.js
+vis.min.css
+vis.map
 .*avsc
 .*txt
 .*json
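
For context: .rat-excludes lists the paths that Apache RAT (the Release Audit Tool run against release candidates) skips when checking files for license headers. The new entries cover docker.properties.template and the bundled JavaScript/CSS assets (d3, dagre-d3, graphlib-dot, vis), whose third-party licenses are recorded in the LICENSE changes below.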

LICENSE

Lines changed: 31 additions & 0 deletions

@@ -643,6 +643,36 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 
+========================================================================
+For d3 (core/src/main/resources/org/apache/spark/ui/static/d3.min.js):
+========================================================================
+
+Copyright (c) 2010-2015, Michael Bostock
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* The name Michael Bostock may not be used to endorse or promote products
+  derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 ========================================================================
 For Scala Interpreter classes (all .scala files in repl/src/main/scala
@@ -814,6 +844,7 @@ BSD-style licenses
 The following components are provided under a BSD-style license. See project link for details.
 
 (BSD 3 Clause) core (com.github.fommil.netlib:core:1.1.2 - https://github.com/fommil/netlib-java/core)
+(BSD 3 Clause) JPMML-Model (org.jpmml:pmml-model:1.1.15 - https://github.com/jpmml/jpmml-model)
 (BSD 3-clause style license) jblas (org.jblas:jblas:1.2.3 - http://jblas.org/)
 (BSD License) AntLR Parser Generator (antlr:antlr:2.7.7 - http://www.antlr.org/)
 (BSD License) Javolution (javolution:javolution:5.5.1 - http://javolution.org)

R/pkg/NAMESPACE

Lines changed: 14 additions & 96 deletions

@@ -1,116 +1,36 @@
-#exportPattern("^[[:alpha:]]+")
-exportClasses("RDD")
-exportClasses("Broadcast")
-exportMethods(
-              "aggregateByKey",
-              "aggregateRDD",
-              "cache",
-              "cartesian",
-              "checkpoint",
-              "coalesce",
-              "cogroup",
-              "collect",
-              "collectAsMap",
-              "collectPartition",
-              "combineByKey",
-              "count",
-              "countByKey",
-              "countByValue",
-              "distinct",
-              "Filter",
-              "filterRDD",
-              "first",
-              "flatMap",
-              "flatMapValues",
-              "fold",
-              "foldByKey",
-              "foreach",
-              "foreachPartition",
-              "fullOuterJoin",
-              "glom",
-              "groupByKey",
-              "intersection",
-              "join",
-              "keyBy",
-              "keys",
-              "length",
-              "lapply",
-              "lapplyPartition",
-              "lapplyPartitionsWithIndex",
-              "leftOuterJoin",
-              "lookup",
-              "map",
-              "mapPartitions",
-              "mapPartitionsWithIndex",
-              "mapValues",
-              "maximum",
-              "minimum",
-              "numPartitions",
-              "partitionBy",
-              "persist",
-              "pipeRDD",
-              "reduce",
-              "reduceByKey",
-              "reduceByKeyLocally",
-              "repartition",
-              "rightOuterJoin",
-              "sampleByKey",
-              "sampleRDD",
-              "saveAsTextFile",
-              "saveAsObjectFile",
-              "sortBy",
-              "sortByKey",
-              "subtract",
-              "subtractByKey",
-              "sumRDD",
-              "take",
-              "takeOrdered",
-              "takeSample",
-              "top",
-              "unionRDD",
-              "unpersist",
-              "value",
-              "values",
-              "zipRDD",
-              "zipWithIndex",
-              "zipWithUniqueId"
-              )
+# Imports from base R
+importFrom(methods, setGeneric, setMethod, setOldClass)
+useDynLib(SparkR, stringHashCode)
 
 # S3 methods exported
-export(
-       "textFile",
-       "objectFile",
-       "parallelize",
-       "hashCode",
-       "includePackage",
-       "broadcast",
-       "setBroadcastValue",
-       "setCheckpointDir"
-      )
 export("sparkR.init")
 export("sparkR.stop")
 export("print.jobj")
-useDynLib(SparkR, stringHashCode)
-importFrom(methods, setGeneric, setMethod, setOldClass)
-
-# SparkRSQL
 
 exportClasses("DataFrame")
 
-exportMethods("columns",
+exportMethods("cache",
+              "collect",
+              "columns",
+              "count",
+              "describe",
               "distinct",
               "dtypes",
               "except",
               "explain",
               "filter",
+              "first",
               "groupBy",
               "head",
               "insertInto",
               "intersect",
               "isLocal",
+              "join",
+              "length",
               "limit",
               "orderBy",
               "names",
+              "persist",
               "printSchema",
               "registerTempTable",
               "repartition",
@@ -124,9 +44,9 @@ exportMethods("columns",
               "show",
               "showDF",
               "sortDF",
-              "toJSON",
-              "toRDD",
+              "take",
               "unionAll",
+              "unpersist",
               "where",
               "withColumn",
               "withColumnRenamed")
@@ -173,14 +93,12 @@ export("cacheTable",
        "createExternalTable",
        "dropTempTable",
        "jsonFile",
-       "jsonRDD",
       "loadDF",
        "parquetFile",
        "sql",
        "table",
        "tableNames",
        "tables",
-       "toDF",
        "uncacheTable")
 
 export("sparkRSQL.init",

R/pkg/R/DataFrame.R

Lines changed: 39 additions & 2 deletions

@@ -167,7 +167,7 @@ setMethod("isLocal",
 setMethod("showDF",
           signature(x = "DataFrame"),
           function(x, numRows = 20) {
-            cat(callJMethod(x@sdf, "showString", numToInt(numRows)), "\n")
+            callJMethod(x@sdf, "showString", numToInt(numRows))
           })
 
 #' show
@@ -272,7 +272,7 @@ setMethod("names",
 setMethod("registerTempTable",
           signature(x = "DataFrame", tableName = "character"),
           function(x, tableName) {
-            callJMethod(x@sdf, "registerTempTable", tableName)
+            invisible(callJMethod(x@sdf, "registerTempTable", tableName))
           })
 
 #' insertInto
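
The invisible() wrapper keeps registerTempTable from auto-printing the JVM call's return value at the R prompt while still returning it. A standalone illustration of that base-R behavior (not SparkR code):

f <- function() invisible(42)
f()        # nothing is printed at the prompt
x <- f()   # the value is still returned: x is 42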
@@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
             callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
           })
 
+#' describe
+#'
+#' Computes statistics for numeric columns.
+#' If no columns are given, this function computes statistics for all numerical columns.
+#'
+#' @param x A DataFrame to be computed.
+#' @param col A string of name
+#' @param ... Additional expressions
+#' @return A DataFrame
+#' @rdname describe
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' describe(df)
+#' describe(df, "col1")
+#' describe(df, "col1", "col2")
+#' }
+setMethod("describe",
+          signature(x = "DataFrame", col = "character"),
+          function(x, col, ...) {
+            colList <- list(col, ...)
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
+
+#' @rdname describe
+setMethod("describe",
+          signature(x = "DataFrame"),
+          function(x) {
+            colList <- as.list(c(columns(x)))
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
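
A hedged usage sketch for the new method, assuming a SparkR SQL context set up as in the roxygen example above; in Spark SQL, describe computes statistics such as count, mean, stddev, min, and max per column (the column name "age" here is illustrative):

# df <- jsonFile(sqlCtx, "path/to/file.json")
# collect(describe(df, "age"))   # bring the summary rows back as a local data.frame
# describe(df)                   # no columns given: statistics for all numeric columns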

R/pkg/R/RDD.R

Lines changed: 56 additions & 5 deletions

@@ -66,6 +66,11 @@ setMethod("initialize", "RDD", function(.Object, jrdd, serializedMode,
   .Object
 })
 
+setMethod("show", "RDD",
+          function(.Object) {
+            cat(paste(callJMethod(.Object@jrdd, "toString"), "\n", sep=""))
+          })
+
 setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val) {
   .Object@env <- new.env()
   .Object@env$isCached <- FALSE
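
Defining show() for RDD means that evaluating an RDD at the R prompt now prints the underlying JVM object's toString() instead of dumping the S4 slot structure. A hedged sketch (requires a running SparkR context sc; the printed text is approximate):

# rdd <- parallelize(sc, 1:10, 2L)
# rdd   # prints something like: ParallelCollectionRDD[0] at parallelize at ...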
@@ -792,7 +797,7 @@ setMethod("first",
 #' @aliases distinct,RDD-method
 setMethod("distinct",
           signature(x = "RDD"),
-          function(x, numPartitions = SparkR::numPartitions(x)) {
+          function(x, numPartitions = SparkR:::numPartitions(x)) {
             identical.mapped <- lapply(x, function(x) { list(x, NULL) })
             reduced <- reduceByKey(identical.mapped,
                                    function(x, y) { x },
@@ -988,7 +993,7 @@ setMethod("coalesce",
           signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions, shuffle = FALSE) {
             numPartitions <- numToInt(numPartitions)
-            if (shuffle || numPartitions > SparkR::numPartitions(x)) {
+            if (shuffle || numPartitions > SparkR:::numPartitions(x)) {
               func <- function(partIndex, part) {
                 set.seed(partIndex)  # partIndex as seed
                 start <- as.integer(sample(numPartitions, 1) - 1)
@@ -1073,7 +1078,7 @@ setMethod("saveAsTextFile",
 #' @aliases sortBy,RDD,RDD-method
 setMethod("sortBy",
           signature(x = "RDD", func = "function"),
-          function(x, func, ascending = TRUE, numPartitions = SparkR::numPartitions(x)) {
+          function(x, func, ascending = TRUE, numPartitions = SparkR:::numPartitions(x)) {
             values(sortByKey(keyBy(x, func), ascending, numPartitions))
           })
 
@@ -1547,7 +1552,7 @@ setMethod("cartesian",
 #' @aliases subtract,RDD
 setMethod("subtract",
           signature(x = "RDD", other = "RDD"),
-          function(x, other, numPartitions = SparkR::numPartitions(x)) {
+          function(x, other, numPartitions = SparkR:::numPartitions(x)) {
             mapFunction <- function(e) { list(e, NA) }
             rdd1 <- map(x, mapFunction)
             rdd2 <- map(other, mapFunction)
@@ -1578,7 +1583,7 @@ setMethod("subtract",
 #' @aliases intersection,RDD
 setMethod("intersection",
           signature(x = "RDD", other = "RDD"),
-          function(x, other, numPartitions = SparkR::numPartitions(x)) {
+          function(x, other, numPartitions = SparkR:::numPartitions(x)) {
             rdd1 <- map(x, function(v) { list(v, NA) })
             rdd2 <- map(other, function(v) { list(v, NA) })
 
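The five hunks above all make the same change: the default numPartitions argument now calls SparkR:::numPartitions, because the generic was dropped from the NAMESPACE exports (see the NAMESPACE diff above). In R, pkg::name resolves only exported objects, while pkg:::name also reaches internal ones:

# stats::median(c(1, 2, 3))     # exported object: `::` works
# SparkR:::numPartitions(rdd)   # unexported generic: needs `:::` (requires SparkR loaded and an RDD `rdd`)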
@@ -1590,3 +1595,49 @@ setMethod("intersection",
 
   keys(filterRDD(cogroup(rdd1, rdd2, numPartitions = numPartitions), filterFunction))
 })
+
+#' Zips an RDD's partitions with one (or more) RDD(s).
+#' Same as zipPartitions in Spark.
+#'
+#' @param ... RDDs to be zipped.
+#' @param func A function to transform zipped partitions.
+#' @return A new RDD by applying a function to the zipped partitions.
+#'         Assumes that all the RDDs have the *same number of partitions*, but
+#'         does *not* require them to have the same number of elements in each partition.
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd1 <- parallelize(sc, 1:2, 2L)  # 1, 2
+#' rdd2 <- parallelize(sc, 1:4, 2L)  # 1:2, 3:4
+#' rdd3 <- parallelize(sc, 1:6, 2L)  # 1:3, 4:6
+#' collect(zipPartitions(rdd1, rdd2, rdd3,
+#'                       func = function(x, y, z) { list(list(x, y, z))} ))
+#' # list(list(1, c(1,2), c(1,2,3)), list(2, c(3,4), c(4,5,6)))
+#'}
+#' @rdname zipRDD
+#' @aliases zipPartitions,RDD
+setMethod("zipPartitions",
+          "RDD",
+          function(..., func) {
+            rrdds <- list(...)
+            if (length(rrdds) == 1) {
+              return(rrdds[[1]])
+            }
+            nPart <- sapply(rrdds, numPartitions)
+            if (length(unique(nPart)) != 1) {
+              stop("Can only zipPartitions RDDs which have the same number of partitions.")
+            }
+
+            rrdds <- lapply(rrdds, function(rdd) {
+              mapPartitionsWithIndex(rdd, function(partIndex, part) {
+                print(length(part))
+                list(list(partIndex, part))
+              })
+            })
+            union.rdd <- Reduce(unionRDD, rrdds)
+            zipped.rdd <- values(groupByKey(union.rdd, numPartitions = nPart[1]))
+            res <- mapPartitions(zipped.rdd, function(plist) {
+              do.call(func, plist[[1]])
+            })
+            res
+          })
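
zipPartitions works by tagging each partition with its index via mapPartitionsWithIndex, unioning the tagged RDDs, grouping by partition index with groupByKey, and applying func to each group of co-indexed partitions (the print(length(part)) call inside the tagging step logs each partition's length on the workers). A hedged usage sketch beyond the roxygen example (requires a running SparkR context sc):

# rdd1 <- parallelize(sc, 1:2, 2L)        # partitions: 1 | 2
# rdd2 <- parallelize(sc, 101:104, 2L)    # partitions: 101,102 | 103,104
# sums <- zipPartitions(rdd1, rdd2,
#                       func = function(x, y) { list(sum(unlist(x)) + sum(unlist(y))) })
# collect(sums)   # list(204, 209): one summed value per partition pair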
