@@ -1128,6 +1128,236 @@ setMethod("sortBy",
            values(sortByKey(keyBy(rdd, func), ascending, numPartitions))
          })

+ # Helper function to get first N elements from an RDD in the specified order.
+ # Param:
+ #   rdd An RDD.
+ #   num Number of elements to return.
+ #   ascending A flag to indicate whether the sorting is ascending or descending.
+ # Return:
+ #   A list of the first N elements from the RDD in the specified order.
+ #
+ takeOrderedElem <- function(rdd, num, ascending = TRUE) {
+   if (num <= 0L) {
+     return(list())
+   }
+
+   partitionFunc <- function(part) {
+     if (num < length(part)) {
+       # R limitation: order works only on primitive types!
+       ord <- order(unlist(part, recursive = FALSE), decreasing = !ascending)
+       list(part[ord[1:num]])
+     } else {
+       list(part)
+     }
+   }
+
+   reduceFunc <- function(elems, part) {
+     newElems <- append(elems, part)
+     # R limitation: order works only on primitive types!
+     ord <- order(unlist(newElems, recursive = FALSE), decreasing = !ascending)
+     newElems[ord[1:num]]
+   }
+
+   newRdd <- mapPartitions(rdd, partitionFunc)
+   reduce(newRdd, reduceFunc)
+ }
+
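The helper runs the same top-N selection twice: once per partition in mapPartitions, then again while merging partition results in reduce. A minimal local sketch of that logic (plain base R, not part of this patch; parts, topOfPart, and merged are stand-in names, with a list of lists simulating the RDD's partitions):

parts <- list(list(10, 1, 2), list(9, 3, 4), list(5, 6, 7))
num <- 3L
ascending <- TRUE
# Per-partition step: keep at most num candidates from each partition.
topOfPart <- function(part) {
  ord <- order(unlist(part, recursive = FALSE), decreasing = !ascending)
  part[ord[seq_len(min(num, length(part)))]]
}
# Merge step: pool the candidates and keep the overall first num.
merged <- Reduce(append, lapply(parts, topOfPart), list())
ord <- order(unlist(merged, recursive = FALSE), decreasing = !ascending)
merged[ord[1:num]]  # list(1, 2, 3)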
+ #' Returns the first N elements from an RDD in ascending order.
+ #'
+ #' @param rdd An RDD.
+ #' @param num Number of elements to return.
+ #' @return The first N elements from the RDD in ascending order.
+ #' @rdname takeOrdered
+ #' @export
+ #' @examples
+ #'\dontrun{
+ #' sc <- sparkR.init()
+ #' rdd <- parallelize(sc, list(10, 1, 2, 9, 3, 4, 5, 6, 7))
+ #' takeOrdered(rdd, 6L) # list(1, 2, 3, 4, 5, 6)
+ #'}
+ setGeneric("takeOrdered", function(rdd, num) { standardGeneric("takeOrdered") })
+
+ #' @rdname takeOrdered
+ #' @aliases takeOrdered,RDD,RDD-method
+ setMethod("takeOrdered",
+           signature(rdd = "RDD", num = "integer"),
+           function(rdd, num) {
+             takeOrderedElem(rdd, num)
+           })
+
+ #' Returns the top N elements from an RDD.
+ #'
+ #' @param rdd An RDD.
+ #' @param num Number of elements to return.
+ #' @return The top N elements from the RDD.
+ #' @rdname top
+ #' @export
+ #' @examples
+ #'\dontrun{
+ #' sc <- sparkR.init()
+ #' rdd <- parallelize(sc, list(10, 1, 2, 9, 3, 4, 5, 6, 7))
+ #' top(rdd, 6L) # list(10, 9, 7, 6, 5, 4)
+ #'}
+ setGeneric("top", function(rdd, num) { standardGeneric("top") })
+
+ #' @rdname top
+ #' @aliases top,RDD,RDD-method
+ setMethod("top",
+           signature(rdd = "RDD", num = "integer"),
+           function(rdd, num) {
+             takeOrderedElem(rdd, num, FALSE)
+           })
+
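top is takeOrdered with the sort direction flipped; both delegate to takeOrderedElem. A quick local check of the two orderings on the example data (base R, not part of this patch; x is a stand-in name):

x <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
x[order(unlist(x))][1:6]                     # takeOrdered: list(1, 2, 3, 4, 5, 6)
x[order(unlist(x), decreasing = TRUE)][1:6]  # top: list(10, 9, 7, 6, 5, 4)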
+ #' Fold an RDD using a given associative function and a neutral "zero value".
+ #'
+ #' Aggregate the elements of each partition, and then the results for all the
+ #' partitions, using a given associative function and a neutral "zero value".
+ #'
+ #' @param rdd An RDD.
+ #' @param zeroValue A neutral "zero value".
+ #' @param op An associative function for the folding operation.
+ #' @return The folding result.
+ #' @rdname fold
+ #' @seealso reduce
+ #' @export
+ #' @examples
+ #'\dontrun{
+ #' sc <- sparkR.init()
+ #' rdd <- parallelize(sc, list(1, 2, 3, 4, 5))
+ #' fold(rdd, 0, "+") # 15
+ #'}
+ setGeneric("fold", function(rdd, zeroValue, op) { standardGeneric("fold") })
+
+ #' @rdname fold
+ #' @aliases fold,RDD,RDD-method
+ setMethod("fold",
+           signature(rdd = "RDD", zeroValue = "ANY", op = "ANY"),
+           function(rdd, zeroValue, op) {
+             aggregateRDD(rdd, zeroValue, op, op)
+           })
+
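fold delegates to aggregateRDD with op used for both the within-partition and the cross-partition step, which is why zeroValue must be neutral for op: it is folded in once per partition and once more during the combine. A local sketch of the resulting evaluation order (base R, not part of this patch; parts and perPart are stand-in names):

parts <- list(list(1, 2), list(3, 4, 5))  # stand-in for the RDD's partitions
zeroValue <- 0
op <- "+"
perPart <- lapply(parts, function(part) Reduce(op, part, zeroValue))
Reduce(op, perPart, zeroValue)  # 15, matching fold(rdd, 0, "+")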
+ #' Aggregate an RDD using the given combine functions and a neutral "zero value".
+ #'
+ #' Aggregate the elements of each partition, and then the results for all the
+ #' partitions, using given combine functions and a neutral "zero value".
+ #'
+ #' @param rdd An RDD.
+ #' @param zeroValue A neutral "zero value".
+ #' @param seqOp A function to aggregate the RDD elements. It may return a different
+ #'              result type from the type of the RDD elements.
+ #' @param combOp A function to aggregate results of seqOp.
+ #' @return The aggregation result.
+ #' @rdname aggregateRDD
+ #' @seealso reduce
+ #' @export
+ #' @examples
+ #'\dontrun{
+ #' sc <- sparkR.init()
+ #' rdd <- parallelize(sc, list(1, 2, 3, 4))
+ #' zeroValue <- list(0, 0)
+ #' seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+ #' combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+ #' aggregateRDD(rdd, zeroValue, seqOp, combOp) # list(10, 4)
+ #'}
+ setGeneric("aggregateRDD", function(rdd, zeroValue, seqOp, combOp) { standardGeneric("aggregateRDD") })
+
+ #' @rdname aggregateRDD
+ #' @aliases aggregateRDD,RDD,RDD-method
+ setMethod("aggregateRDD",
+           signature(rdd = "RDD", zeroValue = "ANY", seqOp = "ANY", combOp = "ANY"),
+           function(rdd, zeroValue, seqOp, combOp) {
+             partitionFunc <- function(part) {
+               Reduce(seqOp, part, zeroValue)
+             }
+
+             partitionList <- collect(lapplyPartition(rdd, partitionFunc),
+                                      flatten = FALSE)
+             Reduce(combOp, partitionList, zeroValue)
+           })
+
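The method above reduces each partition with seqOp starting from zeroValue, collects the per-partition results without flattening, and combines them with combOp. Tracing the roxygen example locally (base R, not part of this patch; parts, perPart, and res are stand-in names) shows how the (sum, count) pair accumulates, e.g. to compute a mean:

parts <- list(list(1, 2), list(3, 4))  # stand-in for the RDD's partitions
zeroValue <- list(0, 0)
seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
perPart <- lapply(parts, function(part) Reduce(seqOp, part, zeroValue))
res <- Reduce(combOp, perPart, zeroValue)  # list(10, 4)
res[[1]] / res[[2]]                        # mean = 2.5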
+ ############ Shuffle Functions ############
+
+ #' Partition an RDD by key
+ #'
+ #' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
+ #' For each element of this RDD, the partitioner is used to compute a hash
+ #' value, and the RDD is partitioned using this hash value.
+ #'
+ #' @param rdd The RDD to partition. Should be an RDD where each element is
+ #'            list(K, V) or c(K, V).
+ #' @param numPartitions Number of partitions to create.
+ #' @param ... Other optional arguments to partitionBy.
+ #'
+ #' @param partitionFunc The partition function to use. Uses a default hashCode
+ #'                      function if not provided.
+ #' @return An RDD partitioned using the specified partitioner.
+ #' @rdname partitionBy
+ #' @export
+ #' @examples
+ #'\dontrun{
+ #' sc <- sparkR.init()
+ #' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
+ #' rdd <- parallelize(sc, pairs)
+ #' parts <- partitionBy(rdd, 2L)
+ #' collectPartition(parts, 0L) # First partition should contain list(1, 2) and list(1, 4)
+ #'}
+ setGeneric("partitionBy",
+            function(rdd, numPartitions, ...) {
+              standardGeneric("partitionBy")
+            })
+
+ #' @rdname partitionBy
+ #' @aliases partitionBy,RDD,integer-method
+ setMethod("partitionBy",
+           signature(rdd = "RDD", numPartitions = "integer"),
+           function(rdd, numPartitions, partitionFunc = hashCode) {
+
+             # if (missing(partitionFunc)) {
+             #   partitionFunc <- hashCode
+             # }
+
+             depsBinArr <- getDependencies(partitionFunc)
+
+             serializedHashFuncBytes <- serialize(as.character(substitute(partitionFunc)),
+                                                  connection = NULL,
+                                                  ascii = TRUE)
+
+             packageNamesArr <- serialize(.sparkREnv$.packages,
+                                          connection = NULL,
+                                          ascii = TRUE)
+             broadcastArr <- lapply(ls(.broadcastNames), function(name) {
+                               get(name, .broadcastNames) })
+             jrdd <- getJRDD(rdd)
+
+             # We create a PairwiseRRDD that extends RDD[(Array[Byte],
+             # Array[Byte])], where the key is the hashed split, the value is
+             # the content (key-val pairs).
+             pairwiseRRDD <- newJObject("edu.berkeley.cs.amplab.sparkr.PairwiseRRDD",
+                                        callJMethod(jrdd, "rdd"),
+                                        as.integer(numPartitions),
+                                        serializedHashFuncBytes,
+                                        rdd@env$serialized,
+                                        depsBinArr,
+                                        packageNamesArr,
+                                        as.character(.sparkREnv$libname),
+                                        broadcastArr,
+                                        callJMethod(jrdd, "classTag"))
+
+             # Create a corresponding partitioner.
+             rPartitioner <- newJObject("org.apache.spark.HashPartitioner",
+                                        as.integer(numPartitions))
+
+             # Call partitionBy on the obtained PairwiseRDD.
+             javaPairRDD <- callJMethod(pairwiseRRDD, "asJavaPairRDD")
+             javaPairRDD <- callJMethod(javaPairRDD, "partitionBy", rPartitioner)
+
+             # Call .values() on the result to get back the final result, the
+             # shuffled actual content key-val pairs.
+             r <- callJMethod(javaPairRDD, "values")
+
+             RDD(r, serialized = TRUE)
+           })
+
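On the R side only the serialized partitionFunc crosses to the JVM; the shuffle itself is done by org.apache.spark.HashPartitioner, which conceptually assigns each key to hash(key) modulo numPartitions, so equal keys land in the same partition. A rough illustration of that mapping using SparkR's hashCode helper (a sketch only, not part of this patch; the JVM-side partitioner is authoritative):

numPartitions <- 2L
keys <- list(1, 1.1, 1)
# R's %% is already nonnegative for a positive right operand.
sapply(keys, function(k) hashCode(k) %% numPartitions)
# equal keys map to the same partition index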
############ Binary Functions #############

#' Return the union RDD of two RDDs.