@@ -134,6 +134,7 @@ class MaxHeapQ(object):
134
134
135
135
"""
136
136
An implementation of MaxHeap.
137
+
137
138
>>> import pyspark.rdd
138
139
>>> heap = pyspark.rdd.MaxHeapQ(5)
139
140
>>> [heap.insert(i) for i in range(10)]
@@ -381,6 +382,7 @@ def mapPartitionsWithSplit(self, f, preservesPartitioning=False):
381
382
def getNumPartitions (self ):
382
383
"""
383
384
Returns the number of partitions in RDD
385
+
384
386
>>> rdd = sc.parallelize([1, 2, 3, 4], 2)
385
387
>>> rdd.getNumPartitions()
386
388
2
@@ -570,6 +572,7 @@ def sortByKey(self, ascending=True, numPartitions=None, keyfunc=lambda x: x):
570
572
"""
571
573
Sorts this RDD, which is assumed to consist of (key, value) pairs.
572
574
# noqa
575
+
573
576
>>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
574
577
>>> sc.parallelize(tmp).sortByKey(True, 2).collect()
575
578
[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
@@ -1209,6 +1212,7 @@ def collectAsMap(self):
1209
1212
def keys (self ):
1210
1213
"""
1211
1214
Return an RDD with the keys of each tuple.
1215
+
1212
1216
>>> m = sc.parallelize([(1, 2), (3, 4)]).keys()
1213
1217
>>> m.collect()
1214
1218
[1, 3]
@@ -1218,6 +1222,7 @@ def keys(self):
1218
1222
def values (self ):
1219
1223
"""
1220
1224
Return an RDD with the values of each tuple.
1225
+
1221
1226
>>> m = sc.parallelize([(1, 2), (3, 4)]).values()
1222
1227
>>> m.collect()
1223
1228
[2, 4]
@@ -1642,6 +1647,7 @@ def repartition(self, numPartitions):
1642
1647
Internally, this uses a shuffle to redistribute data.
1643
1648
If you are decreasing the number of partitions in this RDD, consider
1644
1649
using `coalesce`, which can avoid performing a shuffle.
1650
+
1645
1651
>>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
1646
1652
>>> sorted(rdd.glom().collect())
1647
1653
[[1], [2, 3], [4, 5], [6, 7]]
@@ -1656,6 +1662,7 @@ def repartition(self, numPartitions):
1656
1662
def coalesce (self , numPartitions , shuffle = False ):
1657
1663
"""
1658
1664
Return a new RDD that is reduced into `numPartitions` partitions.
1665
+
1659
1666
>>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect()
1660
1667
[[1], [2, 3], [4, 5]]
1661
1668
>>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect()
@@ -1694,6 +1701,7 @@ def name(self):
1694
1701
def setName (self , name ):
1695
1702
"""
1696
1703
Assign a name to this RDD.
1704
+
1697
1705
>>> rdd1 = sc.parallelize([1,2])
1698
1706
>>> rdd1.setName('RDD1')
1699
1707
>>> rdd1.name()
@@ -1753,6 +1761,7 @@ class PipelinedRDD(RDD):
1753
1761
1754
1762
"""
1755
1763
Pipelined maps:
1764
+
1756
1765
>>> rdd = sc.parallelize([1, 2, 3, 4])
1757
1766
>>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect()
1758
1767
[4, 8, 12, 16]
0 commit comments