@@ -358,6 +358,7 @@ setMethod("collect",
             convertJListToRList(collected, flatten)
           })
 
+
 #' @rdname collect-methods
 #' @export
 #' @description
@@ -382,6 +383,29 @@ setMethod("collectPartition",
             convertJListToRList(jList, flatten = TRUE)
           })
 
+#' @rdname collect-methods
+#' @export
+#' @description
+#' \code{collectAsMap} returns a named list as a map that contains all of the elements
+#' in a key-value pair RDD.
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, list(list(1, 2), list(3, 4)), 2L)
+#' collectAsMap(rdd) # list(`1` = 2, `3` = 4)
+#'}
+setGeneric("collectAsMap", function(rdd) { standardGeneric("collectAsMap") })
+
+#' @rdname collect-methods
+#' @aliases collectAsMap,RDD-method
+setMethod("collectAsMap",
+          signature(rdd = "RDD"),
+          function(rdd) {
+            pairList <- collect(rdd)
+            map <- new.env()
+            lapply(pairList, function(x) { assign(as.character(x[[1]]), x[[2]], envir = map) })
+            as.list(map)
+          })
 
 #' Look up elements of a key in an RDD
 #'
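Note on the collectAsMap hunk above: each pair is assigned into a single environment under as.character(key), so a duplicate key simply overwrites the earlier value and only one value per key survives, similar to Spark's collectAsMap on the JVM side; the order of the returned list is whatever as.list() yields for the environment and is not guaranteed. A minimal local sketch of that behavior, without Spark (pairs and map are illustrative names, not part of the patch):

pairs <- list(list(1, 2), list(3, 4), list(3, 5))   # duplicate key 3
map <- new.env()
lapply(pairs, function(x) { assign(as.character(x[[1]]), x[[2]], envir = map) })
as.list(map)  # list(`1` = 2, `3` = 5): the earlier value 4 for key 3 is overwritten; order may vary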
@@ -1388,26 +1412,32 @@ setMethod("groupByKey",
             groupVals <- function(part) {
               vals <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
+              appendList <- function(acc, x) {
+                addItemToAccumulator(acc, x)
+                acc
+              }
+              makeList <- function(x) {
+                acc <- initAccumulator()
+                addItemToAccumulator(acc, x)
+                acc
+              }
               # Each item in the partition is list of (K, V)
               lapply(part,
                      function(item) {
-                       hashVal <- as.character(hashCode(item[[1]]))
-                       if (exists(hashVal, vals)) {
-                         acc <- vals[[hashVal]]
-                         acc[[length(acc) + 1]] <- item[[2]]
-                         vals[[hashVal]] <- acc
-                       } else {
-                         vals[[hashVal]] <- list(item[[2]])
-                         keys[[hashVal]] <- item[[1]]
-                       }
+                       item$hash <- as.character(hashCode(item[[1]]))
+                       updateOrCreatePair(item, keys, vals, pred,
+                                          appendList, makeList)
                      })
+              # extract out data field
+              vals <- eapply(vals,
+                             function(x) {
+                               length(x$data) <- x$counter
+                               x$data
+                             })
               # Every key in the environment contains a list
               # Convert that to list(K, Seq[V])
-              grouped <- lapply(ls(vals),
-                                function(name) {
-                                  list(keys[[name]], vals[[name]])
-                                })
-              grouped
+              convertEnvsToList(keys, vals)
             }
             lapplyPartition(shuffled, groupVals)
           })
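The groupByKey, reduceByKey, and combineByKey hunks all delegate their per-key bookkeeping to helpers that are not shown in this excerpt (updateOrCreatePair, convertEnvsToList, initAccumulator, addItemToAccumulator), which are defined elsewhere in the package. The sketch below is inferred from the call sites in this diff as a reading aid; it is not the shipped implementation and may differ from it.

# Sketch only: behavior inferred from how the call sites above use these helpers.

# Growable accumulator: an environment holding a pre-allocated list that doubles
# its capacity as items arrive; groupByKey later truncates data to counter.
initAccumulator <- function() {
  acc <- new.env()
  acc$counter <- 0          # number of items stored so far
  acc$data <- list(NULL)    # backing list
  acc$size <- 1             # current capacity
  acc
}

addItemToAccumulator <- function(acc, item) {
  if (acc$counter == acc$size) {   # full: double the capacity
    acc$size <- acc$size * 2
    length(acc$data) <- acc$size
  }
  acc$counter <- acc$counter + 1
  acc$data[[acc$counter]] <- item
}

# Insert-or-update a (key, value) pair into the keys/vals environments, using
# pair$hash as the binding name in both. pred says whether the key was seen,
# updateFn combines the stored value with the new one, createFn builds the
# initial value the first time a key appears.
updateOrCreatePair <- function(pair, keys, vals, pred, updateFn, createFn) {
  hashVal <- pair$hash
  key <- pair[[1]]
  val <- pair[[2]]
  if (pred(pair)) {
    assign(hashVal, do.call(updateFn, list(get(hashVal, envir = vals), val)), envir = vals)
  } else {
    assign(hashVal, do.call(createFn, list(val)), envir = vals)
    assign(hashVal, key, envir = keys)
  }
}

# Zip the two environments back into a list of list(K, V) pairs.
convertEnvsToList <- function(keys, vals) {
  lapply(ls(keys), function(name) {
    list(keys[[name]], vals[[name]])
  })
}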
@@ -1448,28 +1478,78 @@ setMethod("reduceByKey",
             reduceVals <- function(part) {
               vals <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
               lapply(part,
                      function(item) {
-                       hashVal <- as.character(hashCode(item[[1]]))
-                       if (exists(hashVal, vals)) {
-                         vals[[hashVal]] <- do.call(
-                           combineFunc, list(vals[[hashVal]], item[[2]]))
-                       } else {
-                         vals[[hashVal]] <- item[[2]]
-                         keys[[hashVal]] <- item[[1]]
-                       }
+                       item$hash <- as.character(hashCode(item[[1]]))
+                       updateOrCreatePair(item, keys, vals, pred, combineFunc, identity)
                      })
-              combined <- lapply(ls(vals),
-                                 function(name) {
-                                   list(keys[[name]], vals[[name]])
-                                 })
-              combined
+              convertEnvsToList(keys, vals)
             }
             locallyReduced <- lapplyPartition(rdd, reduceVals)
             shuffled <- partitionBy(locallyReduced, numPartitions)
             lapplyPartition(shuffled, reduceVals)
           })
 
+#' Merge values by key locally
+#'
+#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V)
+#' and merges the values for each key using an associative reduce function, but returns the
+#' results immediately to the driver as an R list.
+#'
+#' @param rdd The RDD to reduce by key. Should be an RDD where each element is
+#'            list(K, V) or c(K, V).
+#' @param combineFunc The associative reduce function to use.
+#' @return A list of elements of type list(K, V') where V' is the merged value for each key.
+#' @rdname reduceByKeyLocally
+#' @seealso reduceByKey
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
+#' rdd <- parallelize(sc, pairs)
+#' reduced <- reduceByKeyLocally(rdd, "+")
+#' reduced # list(list(1, 6), list(1.1, 3))
+#'}
+ setGeneric ("reduceByKeyLocally ",
1516
+ function (rdd , combineFunc ) {
1517
+ standardGeneric(" reduceByKeyLocally" )
1518
+ })
1519
+
1520
+ # ' @rdname reduceByKeyLocally
1521
+ # ' @aliases reduceByKeyLocally,RDD,integer-method
1522
+ setMethod ("reduceByKeyLocally ",
1523
+ signature(rdd = " RDD" , combineFunc = " ANY" ),
1524
+ function (rdd , combineFunc ) {
1525
+ reducePart <- function (part ) {
1526
+ vals <- new.env()
1527
+ keys <- new.env()
1528
+ pred <- function (item ) exists(item $ hash , keys )
1529
+ lapply(part ,
1530
+ function (item ) {
1531
+ item $ hash <- as.character(hashCode(item [[1 ]]))
1532
+ updateOrCreatePair(item , keys , vals , pred , combineFunc , identity )
1533
+ })
1534
+ list (list (keys , vals )) # return hash to avoid re-compute in merge
1535
+ }
1536
+ mergeParts <- function (accum , x ) {
1537
+ pred <- function (item ) {
1538
+ exists(item $ hash , accum [[1 ]])
1539
+ }
1540
+ lapply(ls(x [[1 ]]),
1541
+ function (name ) {
1542
+ item <- list (x [[1 ]][[name ]], x [[2 ]][[name ]])
1543
+ item $ hash <- name
1544
+ updateOrCreatePair(item , accum [[1 ]], accum [[2 ]], pred , combineFunc , identity )
1545
+ })
1546
+ accum
1547
+ }
1548
+ reduced <- mapPartitions(rdd , reducePart )
1549
+ merged <- reduce(reduced , mergeParts )
1550
+ convertEnvsToList(merged [[1 ]], merged [[2 ]])
1551
+ })
1552
+
1473
1553
# ' Combine values by key
1474
1554
# '
1475
1555
# ' Generic function to combine the elements for each key using a custom set of
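Before the combineByKey hunk below, one reading aid for reduceByKeyLocally above: reducePart returns list(list(keys, vals)) so that mergeParts can reuse the environment binding names as the already-computed hashes instead of calling hashCode again. The hedged walk-through below replays that merge in plain R; it assumes the helper sketch after the groupByKey hunk has been sourced, and hashCodeStub plus partitionReduce are stand-ins for illustration, not SparkR code.

hashCodeStub <- function(key) as.integer(key) * 31L   # stand-in for SparkR's hashCode

partitionReduce <- function(part, combineFunc) {      # mirrors reducePart
  vals <- new.env(); keys <- new.env()
  pred <- function(item) exists(item$hash, keys)
  lapply(part, function(item) {
    item$hash <- as.character(hashCodeStub(item[[1]]))
    updateOrCreatePair(item, keys, vals, pred, combineFunc, identity)
  })
  list(keys, vals)                                     # hashes live on as binding names
}

p1 <- partitionReduce(list(list(1, 2), list(1, 4)), "+")    # key 1 -> 6
p2 <- partitionReduce(list(list(1, 10), list(2, 1)), "+")   # key 1 -> 10, key 2 -> 1

# Fold p2 into p1 the way mergeParts does, reusing the names as hashes:
pred <- function(item) exists(item$hash, p1[[1]])
lapply(ls(p2[[1]]), function(name) {
  item <- list(p2[[1]][[name]], p2[[2]][[name]])
  item$hash <- name
  updateOrCreatePair(item, p1[[1]], p1[[2]], pred, "+", identity)
})
convertEnvsToList(p1[[1]], p1[[2]])   # list(list(1, 16), list(2, 1)), order not guaranteed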
@@ -1519,46 +1599,28 @@ setMethod("combineByKey",
             combineLocally <- function(part) {
               combiners <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
               lapply(part,
                      function(item) {
-                       k <- as.character(item[[1]])
-                       if (!exists(k, keys)) {
-                         combiners[[k]] <- do.call(createCombiner,
-                                                   list(item[[2]]))
-                         keys[[k]] <- item[[1]]
-                       } else {
-                         combiners[[k]] <- do.call(mergeValue,
-                                                   list(combiners[[k]],
-                                                        item[[2]]))
-                       }
-                     })
-              lapply(ls(keys), function(k) {
-                list(keys[[k]], combiners[[k]])
+                       item$hash <- as.character(item[[1]])
+                       updateOrCreatePair(item, keys, combiners, pred, mergeValue, createCombiner)
                      })
+              convertEnvsToList(keys, combiners)
             }
             locallyCombined <- lapplyPartition(rdd, combineLocally)
             shuffled <- partitionBy(locallyCombined, numPartitions)
             mergeAfterShuffle <- function(part) {
               combiners <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
               lapply(part,
                      function(item) {
-                       k <- as.character(item[[1]])
-                       if (!exists(k, combiners)) {
-                         combiners[[k]] <- item[[2]]
-                         keys[[k]] <- item[[1]]
-                       } else {
-                         combiners[[k]] <- do.call(mergeCombiners,
-                                                   list(combiners[[k]],
-                                                        item[[2]]))
-                       }
-                     })
-              lapply(ls(keys), function(k) {
-                list(keys[[k]], combiners[[k]])
+                       item$hash <- as.character(item[[1]])
+                       updateOrCreatePair(item, keys, combiners, pred, mergeCombiners, identity)
                      })
+              convertEnvsToList(keys, combiners)
             }
-            combined <- lapplyPartition(shuffled, mergeAfterShuffle)
-            combined
+            lapplyPartition(shuffled, mergeAfterShuffle)
           })
 
 ############ Binary Functions #############
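A closing note on the combineByKey refactor: the local phase passes (mergeValue, createCombiner) to updateOrCreatePair, while the post-shuffle phase passes (mergeCombiners, identity), because after the shuffle every value is already a combiner and needs no wrapping. A hedged usage sketch computing a per-key mean follows; it assumes the argument order (rdd, createCombiner, mergeValue, mergeCombiners, numPartitions) implied by the hunk above, and the collected output order is not guaranteed.

sc <- sparkR.init()
rdd <- parallelize(sc, list(list("a", 1), list("a", 3), list("b", 5)), 2L)
# combiner is c(sum, count); mergeValue folds a raw value in, mergeCombiners adds two combiners
combined <- combineByKey(rdd,
                         function(v) c(v, 1),
                         function(acc, v) c(acc[1] + v, acc[2] + 1),
                         function(acc1, acc2) acc1 + acc2,
                         2L)
means <- lapply(collect(combined), function(p) list(p[[1]], p[[2]][1] / p[[2]][2]))
means  # list(list("a", 2), list("b", 5)), up to ordering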