@@ -341,7 +341,7 @@ class SparkContext(
   */
  def textFile(path: String, minSplits: Int = defaultMinSplits): RDD[String] = {
    hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
-      minSplits, cloneRecords = false).map(pair => pair._2.toString)
+      minSplits).map(pair => pair._2.toString)
  }

  /**
@@ -354,33 +354,37 @@ class SparkContext(
   * @param keyClass Class of the keys
   * @param valueClass Class of the values
   * @param minSplits Minimum number of Hadoop Splits to generate.
-   * @param cloneRecords If true, Spark will clone the records produced by Hadoop RecordReader.
-   *                     Most RecordReader implementations reuse wrapper objects across multiple
-   *                     records, and can cause problems in RDD collect or aggregation operations.
-   *                     By default the records are cloned in Spark. However, application
-   *                     programmers can explicitly disable the cloning for better performance.
+   *
+   * Note: Because Hadoop's RecordReader class re-uses the same Writable object for each
+   * record, directly caching the returned RDD will create many references to the same object.
+   * If you plan to directly cache Hadoop writable objects, you should first copy them using
+   * a `map` function.
   */
-  def hadoopRDD[K: ClassTag, V: ClassTag](
+  def hadoopRDD[K, V](
      conf: JobConf,
      inputFormatClass: Class[_ <: InputFormat[K, V]],
      keyClass: Class[K],
      valueClass: Class[V],
-      minSplits: Int = defaultMinSplits,
-      cloneRecords: Boolean = true
+      minSplits: Int = defaultMinSplits
    ): RDD[(K, V)] = {
    // Add necessary security credentials to the JobConf before broadcasting it.
    SparkHadoopUtil.get.addCredentials(conf)
-    new HadoopRDD(this, conf, inputFormatClass, keyClass, valueClass, minSplits, cloneRecords)
+    new HadoopRDD(this, conf, inputFormatClass, keyClass, valueClass, minSplits)
  }

-  /** Get an RDD for a Hadoop file with an arbitrary InputFormat */
-  def hadoopFile[K: ClassTag, V: ClassTag](
+  /** Get an RDD for a Hadoop file with an arbitrary InputFormat
+   *
+   * Note: Because Hadoop's RecordReader class re-uses the same Writable object for each
+   * record, directly caching the returned RDD will create many references to the same object.
+   * If you plan to directly cache Hadoop writable objects, you should first copy them using
+   * a `map` function.
+   * */
+  def hadoopFile[K, V](
      path: String,
      inputFormatClass: Class[_ <: InputFormat[K, V]],
      keyClass: Class[K],
      valueClass: Class[V],
-      minSplits: Int = defaultMinSplits,
-      cloneRecords: Boolean = true
+      minSplits: Int = defaultMinSplits
    ): RDD[(K, V)] = {
    // A Hadoop configuration can be about 10 KB, which is pretty big, so broadcast it.
    val confBroadcast = broadcast(new SerializableWritable(hadoopConfiguration))
@@ -392,8 +396,7 @@ class SparkContext(
      inputFormatClass,
      keyClass,
      valueClass,
-      minSplits,
-      cloneRecords)
+      minSplits)
  }

  /**
@@ -403,16 +406,20 @@ class SparkContext(
   * {{{
   * val file = sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](path, minSplits)
   * }}}
+   *
+   * Note: Because Hadoop's RecordReader class re-uses the same Writable object for each
+   * record, directly caching the returned RDD will create many references to the same object.
+   * If you plan to directly cache Hadoop writable objects, you should first copy them using
+   * a `map` function.
   */
  def hadoopFile[K, V, F <: InputFormat[K, V]]
-      (path: String, minSplits: Int, cloneRecords: Boolean = true)
+      (path: String, minSplits: Int)
      (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = {
    hadoopFile(path,
      fm.runtimeClass.asInstanceOf[Class[F]],
      km.runtimeClass.asInstanceOf[Class[K]],
      vm.runtimeClass.asInstanceOf[Class[V]],
-      minSplits,
-      cloneRecords)
+      minSplits)
  }

  /**
@@ -421,69 +428,91 @@ class SparkContext(
   * can just write, for example,
   * {{{
   * val file = sparkContext.hadoopFile[LongWritable, Text, TextInputFormat](path)
-   * }}}
+   *
+   * Note: Because Hadoop's RecordReader class re-uses the same Writable object for each
+   * record, directly caching the returned RDD will create many references to the same object.
+   * If you plan to directly cache Hadoop writable objects, you should first copy them using
+   * a `map` function.
   */
-  def hadoopFile[K, V, F <: InputFormat[K, V]](path: String, cloneRecords: Boolean = true)
+  def hadoopFile[K, V, F <: InputFormat[K, V]](path: String)
      (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] =
-    hadoopFile[K, V, F](path, defaultMinSplits, cloneRecords)
+    hadoopFile[K, V, F](path, defaultMinSplits)

  /** Get an RDD for a Hadoop file with an arbitrary new API InputFormat. */
  def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]]
-      (path: String, cloneRecords: Boolean = true)
+      (path: String)
      (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = {
    newAPIHadoopFile(
        path,
        fm.runtimeClass.asInstanceOf[Class[F]],
        km.runtimeClass.asInstanceOf[Class[K]],
-        vm.runtimeClass.asInstanceOf[Class[V]],
-        cloneRecords = cloneRecords)
+        vm.runtimeClass.asInstanceOf[Class[V]])
  }

  /**
   * Get an RDD for a given Hadoop file with an arbitrary new API InputFormat
   * and extra configuration options to pass to the input format.
+   *
+   * Note: Because Hadoop's RecordReader class re-uses the same Writable object for each
+   * record, directly caching the returned RDD will create many references to the same object.
+   * If you plan to directly cache Hadoop writable objects, you should first copy them using
+   * a `map` function.
   */
-  def newAPIHadoopFile[K: ClassTag, V: ClassTag, F <: NewInputFormat[K, V]](
+  def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]](
      path: String,
      fClass: Class[F],
      kClass: Class[K],
      vClass: Class[V],
-      conf: Configuration = hadoopConfiguration,
-      cloneRecords: Boolean = true): RDD[(K, V)] = {
+      conf: Configuration = hadoopConfiguration): RDD[(K, V)] = {
    val job = new NewHadoopJob(conf)
    NewFileInputFormat.addInputPath(job, new Path(path))
    val updatedConf = job.getConfiguration
-    new NewHadoopRDD(this, fClass, kClass, vClass, updatedConf, cloneRecords)
+    new NewHadoopRDD(this, fClass, kClass, vClass, updatedConf)
  }

  /**
   * Get an RDD for a given Hadoop file with an arbitrary new API InputFormat
   * and extra configuration options to pass to the input format.
+   *
+   * Note: Because Hadoop's RecordReader class re-uses the same Writable object for each
+   * record, directly caching the returned RDD will create many references to the same object.
+   * If you plan to directly cache Hadoop writable objects, you should first copy them using
+   * a `map` function.
   */
-  def newAPIHadoopRDD[K: ClassTag, V: ClassTag, F <: NewInputFormat[K, V]](
+  def newAPIHadoopRDD[K, V, F <: NewInputFormat[K, V]](
      conf: Configuration = hadoopConfiguration,
      fClass: Class[F],
      kClass: Class[K],
-      vClass: Class[V],
-      cloneRecords: Boolean = true): RDD[(K, V)] = {
-    new NewHadoopRDD(this, fClass, kClass, vClass, conf, cloneRecords)
+      vClass: Class[V]): RDD[(K, V)] = {
+    new NewHadoopRDD(this, fClass, kClass, vClass, conf)
  }

-  /** Get an RDD for a Hadoop SequenceFile with given key and value types. */
+  /** Get an RDD for a Hadoop SequenceFile with given key and value types.
+   *
+   * Note: Because Hadoop's RecordReader class re-uses the same Writable object for each
+   * record, directly caching the returned RDD will create many references to the same object.
+   * If you plan to directly cache Hadoop writable objects, you should first copy them using
+   * a `map` function.
+   * */
  def sequenceFile[K: ClassTag, V: ClassTag](path: String,
      keyClass: Class[K],
      valueClass: Class[V],
-      minSplits: Int,
-      cloneRecords: Boolean = true
+      minSplits: Int
    ): RDD[(K, V)] = {
    val inputFormatClass = classOf[SequenceFileInputFormat[K, V]]
-    hadoopFile(path, inputFormatClass, keyClass, valueClass, minSplits, cloneRecords)
+    hadoopFile(path, inputFormatClass, keyClass, valueClass, minSplits)
  }

-  /** Get an RDD for a Hadoop SequenceFile with given key and value types. */
-  def sequenceFile[K: ClassTag, V: ClassTag](path: String, keyClass: Class[K], valueClass: Class[V],
-      cloneRecords: Boolean = true): RDD[(K, V)] =
-    sequenceFile(path, keyClass, valueClass, defaultMinSplits, cloneRecords)
+  /** Get an RDD for a Hadoop SequenceFile with given key and value types.
+   *
+   * Note: Because Hadoop's RecordReader class re-uses the same Writable object for each
+   * record, directly caching the returned RDD will create many references to the same object.
+   * If you plan to directly cache Hadoop writable objects, you should first copy them using
+   * a `map` function.
+   * */
+  def sequenceFile[K: ClassTag, V: ClassTag](path: String, keyClass: Class[K], valueClass: Class[V]
+      ): RDD[(K, V)] =
+    sequenceFile(path, keyClass, valueClass, defaultMinSplits)

  /**
   * Version of sequenceFile() for types implicitly convertible to Writables through a
@@ -500,9 +529,14 @@ class SparkContext(
   * have a parameterized singleton object). We use functions instead to create a new converter
   * for the appropriate type. In addition, we pass the converter a ClassTag of its type to
   * allow it to figure out the Writable class to use in the subclass case.
+   *
+   * Note: Because Hadoop's RecordReader class re-uses the same Writable object for each
+   * record, directly caching the returned RDD will create many references to the same object.
+   * If you plan to directly cache Hadoop writable objects, you should first copy them using
+   * a `map` function.
   */
  def sequenceFile[K, V]
-      (path: String, minSplits: Int = defaultMinSplits, cloneRecords: Boolean = true)
+      (path: String, minSplits: Int = defaultMinSplits)
      (implicit km: ClassTag[K], vm: ClassTag[V],
       kcf: () => WritableConverter[K], vcf: () => WritableConverter[V])
      : RDD[(K, V)] = {
@@ -511,7 +545,7 @@ class SparkContext(
    val format = classOf[SequenceFileInputFormat[Writable, Writable]]
    val writables = hadoopFile(path, format,
      kc.writableClass(km).asInstanceOf[Class[Writable]],
-      vc.writableClass(vm).asInstanceOf[Class[Writable]], minSplits, cloneRecords)
+      vc.writableClass(vm).asInstanceOf[Class[Writable]], minSplits)
    writables.map { case (k, v) => (kc.convert(k), vc.convert(v)) }
  }
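The Scaladoc note repeated throughout this diff advises copying Hadoop Writables with a `map` before caching, since a RecordReader reuses the same Writable instance for every record. The sketch below illustrates that pattern against the post-change `hadoopFile(path)` overload; the input path, object name, and local `SparkContext` setup are hypothetical and only meant to show the copy-before-cache idea.

```scala
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkContext

// Minimal sketch of the caveat in the new Scaladoc: RecordReaders reuse the same
// Writable objects, so copy the values out before caching the RDD.
object WritableCopyExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "writable-copy-example")  // assumed local setup

    // Raw pairs may all point at the same reused LongWritable/Text instances.
    val raw = sc.hadoopFile[LongWritable, Text, TextInputFormat]("hdfs://example/input")  // hypothetical path

    // Copy each record into fresh immutable values before caching, as the note advises.
    val safeToCache = raw.map { case (offset, line) => (offset.get, line.toString) }.cache()

    println(safeToCache.count())
    sc.stop()
  }
}
```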