@@ -62,7 +62,7 @@ def portable_hash(x):
62
62
63
63
>>> portable_hash(None)
64
64
0
65
- >>> portable_hash((None, 1))
65
+ >>> portable_hash((None, 1)) & 0xffffffff
66
66
219750521
67
67
"""
68
68
if x is None :
@@ -72,7 +72,7 @@ def portable_hash(x):
72
72
for i in x :
73
73
h ^= portable_hash (i )
74
74
h *= 1000003
75
- h &= 0xffffffff
75
+ h &= sys . maxint
76
76
h ^= len (x )
77
77
if h == - 1 :
78
78
h = - 2
@@ -1942,7 +1942,7 @@ def _is_pickled(self):
1942
1942
return True
1943
1943
return False
1944
1944
1945
- def _to_jrdd (self ):
1945
+ def _to_java_object_rdd (self ):
1946
1946
""" Return an JavaRDD of Object by unpickling
1947
1947
1948
1948
It will convert each Python object into Java object by Pyrolite, whenever the
@@ -1977,7 +1977,7 @@ def sumApprox(self, timeout, confidence=0.95):
1977
1977
>>> (rdd.sumApprox(1000) - r) / r < 0.05
1978
1978
True
1979
1979
"""
1980
- jrdd = self .mapPartitions (lambda it : [float (sum (it ))])._to_jrdd ()
1980
+ jrdd = self .mapPartitions (lambda it : [float (sum (it ))])._to_java_object_rdd ()
1981
1981
jdrdd = self .ctx ._jvm .JavaDoubleRDD .fromRDD (jrdd .rdd ())
1982
1982
r = jdrdd .sumApprox (timeout , confidence ).getFinalValue ()
1983
1983
return BoundedFloat (r .mean (), r .confidence (), r .low (), r .high ())
@@ -1993,11 +1993,40 @@ def meanApprox(self, timeout, confidence=0.95):
1993
1993
>>> (rdd.meanApprox(1000) - r) / r < 0.05
1994
1994
True
1995
1995
"""
1996
- jrdd = self .map (float )._to_jrdd ()
1996
+ jrdd = self .map (float )._to_java_object_rdd ()
1997
1997
jdrdd = self .ctx ._jvm .JavaDoubleRDD .fromRDD (jrdd .rdd ())
1998
1998
r = jdrdd .meanApprox (timeout , confidence ).getFinalValue ()
1999
1999
return BoundedFloat (r .mean (), r .confidence (), r .low (), r .high ())
2000
2000
2001
def countApproxDistinct(self, relativeSD=0.05):
    """
    :: Experimental ::
    Return approximate number of distinct elements in the RDD.

    The algorithm used is based on streamlib's implementation of
    "HyperLogLog in Practice: Algorithmic Engineering of a State
    of The Art Cardinality Estimation Algorithm", available at
    http://dx.doi.org/10.1145/2452376.2452456.

    @param relativeSD Relative accuracy. Smaller values create
                      counters that require more space.
                      It must be greater than 0.000017 and
                      must not exceed 0.37.
    @return the approximate distinct count computed by the
            underlying Java RDD's countApproxDistinct.
    @raise ValueError if relativeSD is not greater than 0.000017
                      or is larger than 0.37.

    >>> n = sc.parallelize(range(1000)).map(str).countApproxDistinct()
    >>> 950 < n < 1050
    True
    >>> n = sc.parallelize([i % 20 for i in range(1000)]).countApproxDistinct()
    >>> 18 < n < 22
    True
    """
    # The lower bound is exclusive ("greater than"), so the boundary value
    # itself is rejected here instead of being passed on to the JVM call.
    if relativeSD <= 0.000017:
        raise ValueError("relativeSD should be greater than 0.000017")
    if relativeSD > 0.37:
        raise ValueError("relativeSD should be smaller than 0.37")
    # the hash space in Java is 2^32, so keep only the low 32 bits of the
    # Python-side hash before handing the values over
    hashRDD = self.map(lambda x: portable_hash(x) & 0xFFFFFFFF)
    return hashRDD._to_java_object_rdd().countApproxDistinct(relativeSD)
2029
+
2001
2030
2002
2031
class PipelinedRDD (RDD ):
2003
2032
0 commit comments