Skip to content

Commit c5b9252

Browse files
author
Davies Liu
committed
Merge pull request #1 from mengxr/SPARK-4477
make poisson sampling slightly faster
2 parents ee17d78 + 98eb31b commit c5b9252

File tree

1 file changed

+16
-10
lines changed

1 file changed

+16
-10
lines changed

python/pyspark/rddsampler.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import sys
1919
import random
20+
import math
2021

2122

2223
class RDDSamplerBase(object):
@@ -37,16 +38,21 @@ def getUniformSample(self):
3738
return self._random.random()
3839

3940
def getPoissonSample(self, mean):
40-
# here we simulate drawing numbers n_i ~ Poisson(lambda = mean) by
41-
# drawing a sequence of inter-arrival times delta_j ~ Exp(rate = mean)
42-
num_arrivals = 0
43-
cur_time = self._random.expovariate(mean)
44-
45-
while cur_time < 1.0:
46-
cur_time += self._random.expovariate(mean)
47-
num_arrivals += 1
48-
49-
return num_arrivals
41+
# For small means use Knuth's multiplicative algorithm (see http://en.wikipedia.org/wiki/Poisson_distribution); for larger means count exponential inter-arrival times instead
42+
if mean < 20.0: # one exp and k+1 random calls
43+
l = math.exp(-mean)
44+
p = self._random.random()
45+
k = 0
46+
while p > l:
47+
k += 1
48+
p *= self._random.random()
49+
else: # switch to the log domain, k+1 expovariate (random + log) calls
50+
p = self._random.expovariate(mean)
51+
k = 0
52+
while p < 1.0:
53+
k += 1
54+
p += self._random.expovariate(mean)
55+
return k
5056

5157
def func(self, split, iterator):
5258
raise NotImplementedError

0 commit comments

Comments
 (0)