Skip to content

Commit 96316e4

Browse files
author
Matthew Farrellee
committed
[SPARK-3580] add 'partitions' property to PySpark RDD
'rdd.partitions' is available in scala&java, primarily used for its size() method to get the number of partitions. pyspark instead has a getNumPartitions() call and no access to 'partitions' this change adds 'partitions' to pyspark's rdd, allowing for len(rdd.partitions) to get the number of partitions in a way familiar to python developers
1 parent a95ad99 commit 96316e4

File tree

2 files changed

+11
-1
lines changed

2 files changed

+11
-1
lines changed

python/pyspark/rdd.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,10 @@ class RDD(object):
120120
operated on in parallel.
121121
"""
122122

123+
@property
124+
def partitions(self):
125+
return self._jrdd.partitions()
126+
123127
def __init__(self, jrdd, ctx, jrdd_deserializer):
124128
self._jrdd = jrdd
125129
self.is_cached = False
@@ -287,7 +291,7 @@ def getNumPartitions(self):
287291
>>> rdd.getNumPartitions()
288292
2
289293
"""
290-
return self._jrdd.partitions().size()
294+
return self.partitions.size()
291295

292296
def filter(self, f):
293297
"""

python/pyspark/tests.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,12 @@ def test_distinct(self):
601601
self.assertEquals(result.getNumPartitions(), 5)
602602
self.assertEquals(result.count(), 3)
603603

604+
def test_partitions_property(self):
605+
rdd = self.sc.parallelize([], 80)
606+
self.assertEquals(rdd.getNumPartitions(), 80)
607+
self.assertEquals(rdd.partitions.size(), 80)
608+
self.assertEquals(len(rdd.partitions), 80)
609+
604610

605611
class TestSQL(PySparkTestCase):
606612

0 commit comments

Comments
 (0)