Skip to content

Commit db6f67e

Browse files
committed
SPARK-1168, Added foldByKey to pyspark.
1 parent b9be160 commit db6f67e

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

python/pyspark/rdd.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -939,7 +939,21 @@ def _mergeCombiners(iterator):
939939
combiners[k] = mergeCombiners(combiners[k], v)
940940
return combiners.iteritems()
941941
return shuffled.mapPartitions(_mergeCombiners)
942+
943+
def foldByKey(self, zeroValue, func, numPartitions=None):
944+
"""
945+
Merge the values for each key using an associative function "func" and a neutral "zeroValue"
946+
which may be added to the result an arbitrary number of times, and must not change
947+
the result (e.g., 0 for addition, or 1 for multiplication.).
942948
949+
>>> rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
950+
>>> from operator import add
951+
>>> rdd.foldByKey(0, add).collect()
952+
[('a', 2), ('b', 1)]
953+
"""
954+
return self.combineByKey(lambda v: func(zeroValue, v), func, func, numPartitions)
955+
956+
943957
# TODO: support variant with custom partitioner
944958
def groupByKey(self, numPartitions=None):
945959
"""

0 commit comments

Comments
 (0)