29
29
from pyspark .serializers import BatchedSerializer , PickleSerializer , UTF8Deserializer
30
30
from pyspark .storagelevel import StorageLevel
31
31
from pyspark .traceback_utils import SCCallSiteSync
32
- from pyspark .sql .types import *
33
32
from pyspark .sql .types import _create_cls , _parse_datatype_json_string
34
33
from pyspark .sql .column import Column , _to_seq , _to_java_column
34
+ from pyspark .sql .readwriter import DataFrameWriter
35
+ from pyspark .sql .types import *
35
36
36
37
__all__ = ["DataFrame" , "SchemaRDD" , "DataFrameNaFunctions" , "DataFrameStatFunctions" ]
37
38
@@ -151,25 +152,6 @@ def insertInto(self, tableName, overwrite=False):
151
152
"""
152
153
self ._jdf .insertInto (tableName , overwrite )
153
154
154
- def _java_save_mode (self , mode ):
155
- """Returns the Java save mode based on the Python save mode represented by a string.
156
- """
157
- jSaveMode = self ._sc ._jvm .org .apache .spark .sql .SaveMode
158
- jmode = jSaveMode .ErrorIfExists
159
- mode = mode .lower ()
160
- if mode == "append" :
161
- jmode = jSaveMode .Append
162
- elif mode == "overwrite" :
163
- jmode = jSaveMode .Overwrite
164
- elif mode == "ignore" :
165
- jmode = jSaveMode .Ignore
166
- elif mode == "error" :
167
- pass
168
- else :
169
- raise ValueError (
170
- "Only 'append', 'overwrite', 'ignore', and 'error' are acceptable save mode." )
171
- return jmode
172
-
173
155
def saveAsTable (self , tableName , source = None , mode = "error" , ** options ):
174
156
"""Saves the contents of this :class:`DataFrame` to a data source as a table.
175
157
@@ -185,11 +167,7 @@ def saveAsTable(self, tableName, source=None, mode="error", **options):
185
167
* `error`: Throw an exception if data already exists.
186
168
* `ignore`: Silently ignore this operation if data already exists.
187
169
"""
188
- if source is None :
189
- source = self .sql_ctx .getConf ("spark.sql.sources.default" ,
190
- "org.apache.spark.sql.parquet" )
191
- jmode = self ._java_save_mode (mode )
192
- self ._jdf .saveAsTable (tableName , source , jmode , options )
170
+ self .write .saveAsTable (tableName , source , mode , ** options )
193
171
194
172
def save (self , path = None , source = None , mode = "error" , ** options ):
195
173
"""Saves the contents of the :class:`DataFrame` to a data source.
@@ -206,13 +184,17 @@ def save(self, path=None, source=None, mode="error", **options):
206
184
* `error`: Throw an exception if data already exists.
207
185
* `ignore`: Silently ignore this operation if data already exists.
208
186
"""
209
- if path is not None :
210
- options ["path" ] = path
211
- if source is None :
212
- source = self .sql_ctx .getConf ("spark.sql.sources.default" ,
213
- "org.apache.spark.sql.parquet" )
214
- jmode = self ._java_save_mode (mode )
215
- self ._jdf .save (source , jmode , options )
187
+ return self .write .save (path , source , mode , ** options )
188
+
189
+ @property
190
+ def write (self ):
191
+ """
192
+ Interface for saving the content of the :class:`DataFrame` out
193
+ into external storage.
194
+
195
+ :return: :class:`DataFrameWriter`
196
+ """
197
+ return DataFrameWriter (self )
216
198
217
199
@property
218
200
def schema (self ):
@@ -411,9 +393,19 @@ def unpersist(self, blocking=True):
411
393
self ._jdf .unpersist (blocking )
412
394
return self
413
395
414
- # def coalesce(self, numPartitions, shuffle=False):
415
- # rdd = self._jdf.coalesce(numPartitions, shuffle, None)
416
- # return DataFrame(rdd, self.sql_ctx)
396
+ def coalesce (self , numPartitions ):
397
+ """
398
+ Returns a new :class:`DataFrame` that has exactly ``numPartitions`` partitions.
399
+
400
+ Similar to coalesce defined on an :class:`RDD`, this operation results in a
401
+ narrow dependency, e.g. if you go from 1000 partitions to 100 partitions,
402
+ there will not be a shuffle; instead, each of the 100 new partitions will
403
+ claim 10 of the current partitions.
404
+
405
+ >>> df.coalesce(1).rdd.getNumPartitions()
406
+ 1
407
+ """
408
+ return DataFrame (self ._jdf .coalesce (numPartitions ), self .sql_ctx )
417
409
418
410
def repartition (self , numPartitions ):
419
411
"""Returns a new :class:`DataFrame` that has exactly ``numPartitions`` partitions.
0 commit comments