@@ -1227,23 +1227,39 @@ def partitionBy(self, numPartitions, partitionFunc=portable_hash):
 
         # Transferring O(n) objects to Java is too expensive. Instead, we'll
         # form the hash buckets in Python, transferring O(numPartitions) objects
-        # to Java. Each object is a (splitNumber, [objects]) pair.
+        # to Java. Each object is a (splitNumber, [objects]) pair.
+        # In order to avoid overly large objects, the objects are grouped into chunks.
         outputSerializer = self.ctx._unbatched_serializer
 
-        limit = _parse_memory(self.ctx._conf.get("spark.python.worker.memory")
-                              or "512m")
+        limit = (_parse_memory(self.ctx._conf.get("spark.python.worker.memory")
+                               or "512m") / 2)
         def add_shuffle_key(split, iterator):
 
             buckets = defaultdict(list)
-            c, batch = 0, 1000
+            c, batch = 0, min(10 * numPartitions, 1000)
+
             for (k, v) in iterator:
                 buckets[partitionFunc(k) % numPartitions].append((k, v))
                 c += 1
-                if c % batch == 0 and get_used_memory() > limit:
+
+                # check the used memory and the average size of serialized chunks
+                if (c % 1000 == 0 and get_used_memory() > limit
+                        or c > batch):
+                    n, size = len(buckets), 0
                     for split in buckets.keys():
                         yield pack_long(split)
-                        yield outputSerializer.dumps(buckets[split])
+                        d = outputSerializer.dumps(buckets[split])
                         del buckets[split]
+                        yield d
+                        size += len(d)
+
+                    avg = (size / n) >> 20
+                    # keep the average chunk size between 1M and 10M
+                    if avg < 1:
+                        batch *= 1.5
+                    elif avg > 10:
+                        batch = max(batch / 1.5, 1)
+                    c = 0
 
             for (split, items) in buckets.iteritems():
                 yield pack_long(split)
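
Note on the change above: the hunk replaces a fixed flush threshold of 1000 records with an adaptive one. Records are hashed into per-partition buckets; when the batch counter overflows (or memory pressure is detected on every 1000th record), the buckets are serialized and yielded, and the batch size is then grown or shrunk so the average serialized chunk stays roughly between 1 MB and 10 MB. The sketch below illustrates that feedback loop in isolation and is not the PySpark implementation: adaptive_chunks, partition_func, dumps and memory_ok are hypothetical stand-ins for partitionFunc, outputSerializer.dumps and get_used_memory(), and the (split, chunk) tuples it yields stand in for the pack_long(split) / chunk byte stream that the real add_shuffle_key emits.

from collections import defaultdict
import pickle


def adaptive_chunks(iterator, num_partitions, partition_func=hash,
                    dumps=pickle.dumps, memory_ok=lambda: True):
    # Hash each record into a per-partition bucket, flush the buckets as
    # serialized chunks, and tune the flush threshold ("batch") so the
    # average serialized chunk stays roughly between 1 MB and 10 MB.
    buckets = defaultdict(list)
    count, batch = 0, min(10 * num_partitions, 1000)

    for k, v in iterator:
        buckets[partition_func(k) % num_partitions].append((k, v))
        count += 1

        # Flush when the batch is full, or on every 1000th record under memory pressure.
        if count > batch or (count % 1000 == 0 and not memory_ok()):
            n, size = len(buckets), 0
            for split in list(buckets.keys()):
                data = dumps(buckets[split])
                del buckets[split]  # free the bucket before yielding the chunk
                yield split, data
                size += len(data)

            # >> 20 converts bytes to MB; nudge the batch size toward 1-10 MB chunks.
            avg_mb = (size // n) >> 20
            if avg_mb < 1:
                batch *= 1.5
            elif avg_mb > 10:
                batch = max(batch / 1.5, 1)
            count = 0

    # Flush whatever is left once the input is exhausted.
    for split, items in buckets.items():
        yield split, dumps(items)


# Example: partition 100k pairs into 4 buckets and inspect the chunk sizes.
pairs = ((i, i * i) for i in range(100000))
for split, chunk in adaptive_chunks(pairs, num_partitions=4):
    print(split, len(chunk))

The factor of 1.5 and the 1-10 MB window mirror the diff: chunks large enough to amortize per-chunk serialization and transfer overhead, but small enough that no single serialized object blows up worker memory, which is the stated goal of grouping the objects into chunks.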