Skip to content

Commit 59e2d54

Browse files
committed
merge with upstream
2 parents d968664 + 5836650 commit 59e2d54

File tree

8 files changed

+425
-0
lines changed

8 files changed

+425
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,6 @@ work/
2020
SparkR-pkg.Rproj
2121
*.o
2222
*.so
23+
# Eclipse Meta Files
24+
.project
25+
.classpath

pkg/NAMESPACE

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
exportClasses("RDD")
33
exportClasses("Broadcast")
44
exportMethods(
5+
"aggregateRDD",
56
"cache",
67
"checkpoint",
78
"cogroup",
@@ -17,6 +18,7 @@ exportMethods(
1718
"filterRDD",
1819
"flatMap",
1920
"flatMapValues",
21+
"fold",
2022
"foreach",
2123
"foreachPartition",
2224
"fullOuterJoin",
@@ -49,7 +51,9 @@ exportMethods(
4951
"sortBy",
5052
"sortByKey",
5153
"take",
54+
"takeOrdered",
5255
"takeSample",
56+
"top",
5357
"unionRDD",
5458
"unpersist",
5559
"value",

pkg/R/RDD.R

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1128,6 +1128,236 @@ setMethod("sortBy",
11281128
values(sortByKey(keyBy(rdd, func), ascending, numPartitions))
11291129
})
11301130

1131+
# Helper function to get the first N elements from an RDD in the specified order.
#
# Every partition is sorted and cut down to at most `num` elements inside
# mapPartitions(); the per-partition lists are then merged pairwise by reduce(),
# re-sorting and truncating after each merge so that no merge step keeps more
# than `num` elements.
#
# Param:
#   rdd An RDD.
#   num Number of elements to return.
#   ascending A flag to indicate whether the sorting is ascending or descending.
# Return:
#   A list of the first N elements from the RDD in the specified order. If the
#   RDD holds fewer than N elements, all of them are returned, still sorted.
#
takeOrderedElem <- function(rdd, num, ascending = TRUE) {
  if (num <= 0L) {
    return(list())
  }

  partitionFunc <- function(part) {
    # R limitation: order works only on primitive types!
    ord <- order(unlist(part, recursive = FALSE), decreasing = !ascending)
    # Sort even when the partition has no more than `num` elements: with a
    # single partition reduceFunc is never invoked, so this is the only place
    # where the result gets ordered.
    # seq_len(min(...)) instead of 1:num keeps the index free of NA padding
    # when fewer than `num` elements are available.
    list(part[ord[seq_len(min(num, length(ord)))]])
  }

  reduceFunc <- function(elems, part) {
    newElems <- append(elems, part)
    # R limitation: order works only on primitive types!
    ord <- order(unlist(newElems, recursive = FALSE), decreasing = !ascending)
    # Indexing with ord[1:num] would append NULL entries whenever the merged
    # list is shorter than `num`; cap the index at the available length.
    newElems[ord[seq_len(min(num, length(ord)))]]
  }

  newRdd <- mapPartitions(rdd, partitionFunc)
  reduce(newRdd, reduceFunc)
}
1164+
1165+
#' Returns the first N elements from an RDD in ascending order.
1166+
#'
1167+
#' @param rdd An RDD.
1168+
#' @param num Number of elements to return.
1169+
#' @return The first N elements from the RDD in ascending order.
1170+
#' @rdname takeOrdered
1171+
#' @export
1172+
#' @examples
1173+
#'\dontrun{
1174+
#' sc <- sparkR.init()
1175+
#' rdd <- parallelize(sc, list(10, 1, 2, 9, 3, 4, 5, 6, 7))
1176+
#' takeOrdered(rdd, 6L) # list(1, 2, 3, 4, 5, 6)
1177+
#'}
1178+
setGeneric("takeOrdered", function(rdd, num) { standardGeneric("takeOrdered") })
1179+
1180+
#' @rdname takeOrdered
1181+
#' @aliases takeOrdered,RDD,integer-method
1182+
setMethod("takeOrdered",
1183+
signature(rdd = "RDD", num = "integer"),
1184+
function(rdd, num) {
1185+
takeOrderedElem(rdd, num)
1186+
})
1187+
1188+
#' Returns the top N elements from an RDD.
1189+
#'
1190+
#' @param rdd An RDD.
1191+
#' @param num Number of elements to return.
1192+
#' @return The top N elements from the RDD.
1193+
#' @rdname top
1194+
#' @export
1195+
#' @examples
1196+
#'\dontrun{
1197+
#' sc <- sparkR.init()
1198+
#' rdd <- parallelize(sc, list(10, 1, 2, 9, 3, 4, 5, 6, 7))
1199+
#' top(rdd, 6L) # list(10, 9, 7, 6, 5, 4)
1200+
#'}
1201+
setGeneric("top", function(rdd, num) { standardGeneric("top") })
1202+
1203+
#' @rdname top
1204+
#' @aliases top,RDD,integer-method
1205+
setMethod("top",
1206+
signature(rdd = "RDD", num = "integer"),
1207+
function(rdd, num) {
1208+
takeOrderedElem(rdd, num, FALSE)
1209+
})
1210+
1211+
#' Fold an RDD using a given associative function and a neutral "zero value".
1212+
#'
1213+
#' Aggregate the elements of each partition, and then the results for all the
1214+
#' partitions, using a given associative function and a neutral "zero value".
1215+
#'
1216+
#' @param rdd An RDD.
1217+
#' @param zeroValue A neutral "zero value".
1218+
#' @param op An associative function for the folding operation.
1219+
#' @return The folding result.
1220+
#' @rdname fold
1221+
#' @seealso reduce
1222+
#' @export
1223+
#' @examples
1224+
#'\dontrun{
1225+
#' sc <- sparkR.init()
1226+
#' rdd <- parallelize(sc, list(1, 2, 3, 4, 5))
1227+
#' fold(rdd, 0, "+") # 15
1228+
#'}
1229+
setGeneric("fold", function(rdd, zeroValue, op) { standardGeneric("fold") })
1230+
1231+
#' @rdname fold
1232+
#' @aliases fold,RDD-method
1233+
setMethod("fold",
1234+
signature(rdd = "RDD", zeroValue = "ANY", op = "ANY"),
1235+
function(rdd, zeroValue, op) {
1236+
aggregateRDD(rdd, zeroValue, op, op)
1237+
})
1238+
1239+
#' Aggregate an RDD using the given combine functions and a neutral "zero value".
1240+
#'
1241+
#' Aggregate the elements of each partition, and then the results for all the
1242+
#' partitions, using given combine functions and a neutral "zero value".
1243+
#'
1244+
#' @param rdd An RDD.
1245+
#' @param zeroValue A neutral "zero value".
1246+
#' @param seqOp A function to aggregate the RDD elements. It may return a different
1247+
#' result type from the type of the RDD elements.
1248+
#' @param combOp A function to aggregate results of seqOp.
1249+
#' @return The aggregation result.
1250+
#' @rdname aggregateRDD
1251+
#' @seealso reduce
1252+
#' @export
1253+
#' @examples
1254+
#'\dontrun{
1255+
#' sc <- sparkR.init()
1256+
#' rdd <- parallelize(sc, list(1, 2, 3, 4))
1257+
#' zeroValue <- list(0, 0)
1258+
#' seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
1259+
#' combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
1260+
#' aggregateRDD(rdd, zeroValue, seqOp, combOp) # list(10, 4)
1261+
#'}
1262+
setGeneric("aggregateRDD", function(rdd, zeroValue, seqOp, combOp) { standardGeneric("aggregateRDD") })
1263+
1264+
#' @rdname aggregateRDD
1265+
#' @aliases aggregateRDD,RDD-method
1266+
setMethod("aggregateRDD",
1267+
signature(rdd = "RDD", zeroValue = "ANY", seqOp = "ANY", combOp = "ANY"),
1268+
function(rdd, zeroValue, seqOp, combOp) {
1269+
partitionFunc <- function(part) {
1270+
Reduce(seqOp, part, zeroValue)
1271+
}
1272+
1273+
partitionList <- collect(lapplyPartition(rdd, partitionFunc),
1274+
flatten = FALSE)
1275+
Reduce(combOp, partitionList, zeroValue)
1276+
})
1277+
1278+
############ Shuffle Functions ############
1279+
1280+
#' Partition an RDD by key
1281+
#'
1282+
#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
1283+
#' For each element of this RDD, the partitioner is used to compute a hash
1284+
#' function and the RDD is partitioned using this hash value.
1285+
#'
1286+
#' @param rdd The RDD to partition. Should be an RDD where each element is
1287+
#' list(K, V) or c(K, V).
1288+
#' @param numPartitions Number of partitions to create.
1289+
#' @param ... Other optional arguments to partitionBy.
1290+
#'
1291+
#' @param partitionFunc The partition function to use. Uses a default hashCode
1292+
#' function if not provided
1293+
#' @return An RDD partitioned using the specified partitioner.
1294+
#' @rdname partitionBy
1295+
#' @export
1296+
#' @examples
1297+
#'\dontrun{
1298+
#' sc <- sparkR.init()
1299+
#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
1300+
#' rdd <- parallelize(sc, pairs)
1301+
#' parts <- partitionBy(rdd, 2L)
1302+
#' collectPartition(parts, 0L) # First partition should contain list(1, 2) and list(1, 4)
1303+
#'}
1304+
setGeneric("partitionBy",
1305+
function(rdd, numPartitions, ...) {
1306+
standardGeneric("partitionBy")
1307+
})
1308+
1309+
#' @rdname partitionBy
1310+
#' @aliases partitionBy,RDD,integer-method
1311+
setMethod("partitionBy",
1312+
signature(rdd = "RDD", numPartitions = "integer"),
1313+
function(rdd, numPartitions, partitionFunc = hashCode) {
1314+
1315+
#if (missing(partitionFunc)) {
1316+
# partitionFunc <- hashCode
1317+
#}
1318+
1319+
depsBinArr <- getDependencies(partitionFunc)
1320+
1321+
serializedHashFuncBytes <- serialize(as.character(substitute(partitionFunc)),
1322+
connection = NULL,
1323+
ascii = TRUE)
1324+
1325+
packageNamesArr <- serialize(.sparkREnv$.packages,
1326+
connection = NULL,
1327+
ascii = TRUE)
1328+
broadcastArr <- lapply(ls(.broadcastNames), function(name) {
1329+
get(name, .broadcastNames) })
1330+
jrdd <- getJRDD(rdd)
1331+
1332+
# We create a PairwiseRRDD that extends RDD[(Array[Byte],
1333+
# Array[Byte])], where the key is the hashed split, the value is
1334+
# the content (key-val pairs).
1335+
pairwiseRRDD <- newJObject("edu.berkeley.cs.amplab.sparkr.PairwiseRRDD",
1336+
callJMethod(jrdd, "rdd"),
1337+
as.integer(numPartitions),
1338+
serializedHashFuncBytes,
1339+
rdd@env$serialized,
1340+
depsBinArr,
1341+
packageNamesArr,
1342+
as.character(.sparkREnv$libname),
1343+
broadcastArr,
1344+
callJMethod(jrdd, "classTag"))
1345+
1346+
# Create a corresponding partitioner.
1347+
rPartitioner <- newJObject("org.apache.spark.HashPartitioner",
1348+
as.integer(numPartitions))
1349+
1350+
# Call partitionBy on the obtained PairwiseRDD.
1351+
javaPairRDD <- callJMethod(pairwiseRRDD, "asJavaPairRDD")
1352+
javaPairRDD <- callJMethod(javaPairRDD, "partitionBy", rPartitioner)
1353+
1354+
# Call .values() on the result to get back the final result, the
1355+
# shuffled actual content key-val pairs.
1356+
r <- callJMethod(javaPairRDD, "values")
1357+
1358+
RDD(r, serialized = TRUE)
1359+
})
1360+
11311361
############ Binary Functions #############
11321362

11331363
#' Return the union RDD of two RDDs.

pkg/inst/tests/test_rdd.R

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,52 @@ test_that("sortBy() on RDDs", {
278278
expect_equal(actual, as.list(nums))
279279
})
280280

281+
test_that("takeOrdered() on RDDs", {
282+
l <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
283+
rdd <- parallelize(sc, l)
284+
actual <- takeOrdered(rdd, 6L)
285+
expect_equal(actual, as.list(sort(unlist(l)))[1:6])
286+
287+
l <- list("e", "d", "c", "d", "a")
288+
rdd <- parallelize(sc, l)
289+
actual <- takeOrdered(rdd, 3L)
290+
expect_equal(actual, as.list(sort(unlist(l)))[1:3])
291+
})
292+
293+
test_that("top() on RDDs", {
294+
l <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
295+
rdd <- parallelize(sc, l)
296+
actual <- top(rdd, 6L)
297+
expect_equal(actual, as.list(sort(unlist(l), decreasing = TRUE))[1:6])
298+
299+
l <- list("e", "d", "c", "d", "a")
300+
rdd <- parallelize(sc, l)
301+
actual <- top(rdd, 3L)
302+
expect_equal(actual, as.list(sort(unlist(l), decreasing = TRUE))[1:3])
303+
})
304+
305+
test_that("fold() on RDDs", {
306+
actual <- fold(rdd, 0, "+")
307+
expect_equal(actual, Reduce("+", nums, 0))
308+
309+
rdd <- parallelize(sc, list())
310+
actual <- fold(rdd, 0, "+")
311+
expect_equal(actual, 0)
312+
})
313+
314+
test_that("aggregateRDD() on RDDs", {
315+
rdd <- parallelize(sc, list(1, 2, 3, 4))
316+
zeroValue <- list(0, 0)
317+
seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
318+
combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
319+
actual <- aggregateRDD(rdd, zeroValue, seqOp, combOp)
320+
expect_equal(actual, list(10, 4))
321+
322+
rdd <- parallelize(sc, list())
323+
actual <- aggregateRDD(rdd, zeroValue, seqOp, combOp)
324+
expect_equal(actual, list(0, 0))
325+
})
326+
281327
test_that("keys() on RDDs", {
282328
keys <- keys(intRdd)
283329
actual <- collect(keys)

pkg/man/aggregateRDD.Rd

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
% Generated by roxygen2 (4.0.2): do not edit by hand
2+
\docType{methods}
3+
\name{aggregateRDD}
4+
\alias{aggregateRDD}
5+
\alias{aggregateRDD,RDD,RDD-method}
6+
\alias{aggregateRDD,RDD-method}
7+
\title{Aggregate an RDD using the given combine functions and a neutral "zero value".}
8+
\usage{
9+
aggregateRDD(rdd, zeroValue, seqOp, combOp)
10+
11+
\S4method{aggregateRDD}{RDD}(rdd, zeroValue, seqOp, combOp)
12+
}
13+
\arguments{
14+
\item{rdd}{An RDD.}
15+
16+
\item{zeroValue}{A neutral "zero value".}
17+
18+
\item{seqOp}{A function to aggregate the RDD elements. It may return a different
19+
result type from the type of the RDD elements.}
20+
21+
\item{combOp}{A function to aggregate results of seqOp.}
22+
}
23+
\value{
24+
The aggregation result.
25+
}
26+
\description{
27+
Aggregate the elements of each partition, and then the results for all the
28+
partitions, using given combine functions and a neutral "zero value".
29+
}
30+
\examples{
31+
\dontrun{
32+
sc <- sparkR.init()
33+
rdd <- parallelize(sc, list(1, 2, 3, 4))
34+
zeroValue <- list(0, 0)
35+
seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
36+
combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
37+
aggregateRDD(rdd, zeroValue, seqOp, combOp) # list(10, 4)
38+
}
39+
}
40+
\seealso{
41+
reduce
42+
}
43+

0 commit comments

Comments
 (0)