Skip to content

Commit 6e248dd

Browse files
committed
PEP8 fixes, added test cases.
1 parent 14f36c3 commit 6e248dd

File tree

2 files changed

+13
-7
lines changed

2 files changed

+13
-7
lines changed

python/pyspark/sql/dataframe.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1762,14 +1762,15 @@ def toPandas(self):
17621762
else:
17631763
dtype = {}
17641764
columns_with_null_int = set()
1765+
17651766
def null_handler(rows, columns_with_null_int):
17661767
for row in rows:
17671768
row = row.asDict()
17681769
for column in columns_with_null_int:
17691770
val = row[column]
17701771
dt = dtype[column]
17711772
if val is not None:
1772-
if abs(val) > 16777216: # Max value before np.float32 loses precision.
1773+
if abs(val) > 16777216: # Max value before np.float32 loses precision.
17731774
val = np.float64(val)
17741775
dt = np.float64
17751776
dtype[column] = np.float64
@@ -1778,7 +1779,7 @@ def null_handler(rows, columns_with_null_int):
17781779
row[column] = val
17791780
row = Row(**row)
17801781
yield row
1781-
row_handler = lambda x,y: x
1782+
row_handler = lambda x, y: x
17821783
for field in self.schema:
17831784
pandas_type = _to_corrected_pandas_type(field.dataType)
17841785
if pandas_type in (np.int8, np.int16, np.int32) and field.nullable:
@@ -1787,8 +1788,8 @@ def null_handler(rows, columns_with_null_int):
17871788
pandas_type = np.float32
17881789
if pandas_type is not None:
17891790
dtype[field.name] = pandas_type
1790-
1791-
pdf = pd.DataFrame.from_records(row_handler(self.collect(), columns_with_null_int), columns=self.columns)
1791+
collected_rows = row_handler(self.collect(), columns_with_null_int)
1792+
pdf = pd.DataFrame.from_records(collected_rows, columns=self.columns)
17921793

17931794
for f, t in dtype.items():
17941795
pdf[f] = pdf[f].astype(t, copy=False)

python/pyspark/sql/tests.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2495,17 +2495,22 @@ def count_bucketed_cols(names, table="pyspark_bucket"):
24952495
def test_to_pandas(self):
24962496
import numpy as np
24972497
schema = StructType().add("a", IntegerType()).add("b", StringType())\
2498-
.add("c", BooleanType()).add("d", FloatType())
2498+
.add("c", BooleanType()).add("d", FloatType())\
2499+
.add("e", IntegerType()).add("f", IntegerType())\
2500+
.add("g", IntegerType())
24992501
data = [
2500-
(1, "foo", True, 3.0), (2, "foo", True, 5.0),
2501-
(3, "bar", False, -1.0), (4, "bar", False, 6.0),
2502+
(1, "foo", True, 3.0, 1, 16777218, None), (2, "foo", True, 5.0, 2, 16777220, None),
2503+
(3, "bar", False, -1.0, 3, 1, None), (4, "bar", False, 6.0, None, None, None),
25022504
]
25032505
df = self.spark.createDataFrame(data, schema)
25042506
types = df.toPandas().dtypes
25052507
self.assertEquals(types[0], np.int32)
25062508
self.assertEquals(types[1], np.object)
25072509
self.assertEquals(types[2], np.bool)
25082510
self.assertEquals(types[3], np.float32)
2511+
self.assertEquals(types[4], np.float32)
2512+
self.assertEquals(types[5], np.float64)
2513+
self.assertEquals(types[6], np.float32)
25092514

25102515
def test_create_dataframe_from_array_of_long(self):
25112516
import array

0 commit comments

Comments (0)