Skip to content

Commit c04b447

Browse files
committed
Fix Python doc and add back deleted code
1 parent 433ad5b commit c04b447

File tree

2 files changed

+39
-1
lines changed

2 files changed

+39
-1
lines changed

python/pyspark/rdd.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ def repartitionAndSortWithinPartitions(self, numPartitions=None, partitionFunc=p
527527
sort records by their keys.
528528
529529
>>> rdd = sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)])
530-
>>> rdd2 = rdd.repartitionAndSortWithinPartitions(True, lambda x: x % 2, 2)
530+
>>> rdd2 = rdd.repartitionAndSortWithinPartitions(2, lambda x: x % 2, 2)
531531
>>> rdd2.glom().collect()
532532
[[(0, 5), (0, 8), (2, 6)], [(1, 3), (3, 8), (3, 8)]]
533533
"""

python/pyspark/tests.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
from pyspark.files import SparkFiles
4444
from pyspark.serializers import read_int, BatchedSerializer, MarshalSerializer, PickleSerializer
4545
from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, ExternalSorter
46+
from pyspark.sql import SQLContext, IntegerType
4647

4748
_have_scipy = False
4849
_have_numpy = False
@@ -424,6 +425,22 @@ def test_zip_with_different_number_of_items(self):
424425
self.assertEquals(a.count(), b.count())
425426
self.assertRaises(Exception, lambda: a.zip(b).count())
426427

428+
def test_count_approx_distinct(self):
    """Check that countApproxDistinct() estimates land within expected bounds.

    Covers an RDD of 1000 distinct values and an RDD of 20 distinct values
    spread over 7 partitions, each as ints, floats, strings and tuples,
    then verifies that the argument values 0.00000001 and 0.5 are
    rejected with ValueError.
    """
    # 1000 distinct elements, queried with an explicit argument of 0.04.
    distinct = self.sc.parallelize(range(1000))
    distinct_variants = (
        distinct,
        distinct.map(float),
        distinct.map(str),
        distinct.map(lambda x: (x, -x)),
    )
    for variant in distinct_variants:
        self.assertTrue(950 < variant.countApproxDistinct(0.04) < 1050)

    # 1000 elements but only 20 distinct values, using the default argument.
    rdd = self.sc.parallelize([i % 20 for i in range(1000)], 7)
    repeated_variants = (
        rdd,
        rdd.map(float),
        rdd.map(str),
        rdd.map(lambda x: (x, -x)),
    )
    for variant in repeated_variants:
        self.assertTrue(18 < variant.countApproxDistinct() < 22)

    # Both a very small and a large argument value must raise ValueError.
    for rejected in (0.00000001, 0.5):
        self.assertRaises(ValueError, rdd.countApproxDistinct, rejected)
443+
427444
def test_histogram(self):
428445
# empty
429446
rdd = self.sc.parallelize([])
@@ -537,6 +554,27 @@ def test_repartitionAndSortWithinPartitions(self):
537554
self.assertEquals(partitions[1], [(1, 3), (3, 8), (3, 8)])
538555

539556

557+
class TestSQL(PySparkTestCase):
    """Tests for Python functions registered on a SQLContext and called from SQL."""

    def setUp(self):
        """Run the base fixture setup, then wrap its SparkContext in a SQLContext."""
        PySparkTestCase.setUp(self)
        self.sqlCtx = SQLContext(self.sc)

    def test_udf(self):
        """A two-argument function registered with IntegerType() is callable from SQL."""
        self.sqlCtx.registerFunction(
            "twoArgs", lambda x, y: len(x) + y, IntegerType())
        rows = self.sqlCtx.sql("SELECT twoArgs('test', 1)").collect()
        # Unpacking into a one-element target also asserts exactly one row.
        (row,) = rows
        self.assertEqual(row[0], 5)

    def test_broadcast_in_udf(self):
        """A registered function can read a broadcast variable's value."""
        mapping = {"a": "aa", "b": "bb", "c": "abc"}
        lookup = self.sc.broadcast(mapping)
        self.sqlCtx.registerFunction(
            "MYUDF", lambda x: lookup.value[x] if x else '')

        # A key present in the broadcast dict returns its mapped value.
        (hit,) = self.sqlCtx.sql("SELECT MYUDF('c')").collect()
        self.assertEqual("abc", hit[0])

        # An empty string is falsy, so the function returns '' without a lookup.
        (miss,) = self.sqlCtx.sql("SELECT MYUDF('')").collect()
        self.assertEqual("", miss[0])
576+
577+
540578
class TestIO(PySparkTestCase):
541579

542580
def test_stdout_redirection(self):

0 commit comments

Comments
 (0)