@@ -44,7 +44,7 @@ import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFor
import org.apache.mesos.MesosNativeLibrary

-import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.annotation.{DeveloperApi, Experimental, RDDScope}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.deploy.{LocalSparkCluster, SparkHadoopUtil}
import org.apache.spark.executor.{ExecutorEndpoint, TriggerThreadDump}
@@ -641,6 +641,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * @note avoid using `parallelize(Seq())` to create an empty `RDD`. Consider `emptyRDD` for an
   * RDD with no partitions, or `parallelize(Seq[T]())` for an RDD of `T` with empty partitions.
   */
+  @RDDScope
  def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
    assertNotStopped()
    new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]())
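For context, a minimal usage sketch of the annotated method (illustrative only, not part of the patch; app name, master, and data are placeholders):

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("rdd-scope-demo").setMaster("local[2]"))
val distData = sc.parallelize(1 to 100, numSlices = 4)  // one ParallelCollectionRDD with 4 partitions
println(distData.reduce(_ + _))  // 5050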
@@ -650,13 +651,15 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   *
   * This method is identical to `parallelize`.
   */
+  @RDDScope
  def makeRDD[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
    parallelize(seq, numSlices)
  }

  /** Distribute a local Scala collection to form an RDD, with one or more
    * location preferences (hostnames of Spark nodes) for each object.
    * Create a new partition for each collection item. */
+  @RDDScope
  def makeRDD[T: ClassTag](seq: Seq[(T, Seq[String])]): RDD[T] = {
    assertNotStopped()
    val indexToPrefs = seq.zipWithIndex.map(t => (t._2, t._1._2)).toMap
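Illustrative sketch of the location-preference overload, assuming the `sc` from the first sketch and hypothetical hostnames:

// Each element is paired with the hosts it should preferably be computed on.
val withPrefs = sc.makeRDD(Seq(
  ("block-1", Seq("host1.example.com")),
  ("block-2", Seq("host2.example.com"))))
println(withPrefs.preferredLocations(withPrefs.partitions(0)))  // List(host1.example.com)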
@@ -667,10 +670,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * Read a text file from HDFS, a local file system (available on all nodes), or any
   * Hadoop-supported file system URI, and return it as an RDD of Strings.
   */
+  @RDDScope
  def textFile(path: String, minPartitions: Int = defaultMinPartitions): RDD[String] = {
    assertNotStopped()
    hadoopFile(path, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
-      minPartitions).map(pair => pair._2.toString).setName(path)
+      minPartitions).map(pair => pair._2.toString)
  }
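Illustrative sketch of `textFile` (hypothetical path, same `sc`):

val lines = sc.textFile("hdfs:///tmp/input.txt", minPartitions = 8)
val wordCounts = lines.flatMap(_.split("\\s+")).map(word => (word, 1)).reduceByKey(_ + _)
wordCounts.take(5).foreach(println)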

  /**
@@ -700,6 +704,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   *
   * @param minPartitions A suggestion value of the minimal splitting number for input data.
   */
+  @RDDScope
  def wholeTextFiles(path: String, minPartitions: Int = defaultMinPartitions):
  RDD[(String, String)] = {
    assertNotStopped()
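Illustrative sketch of `wholeTextFiles`, which pairs each file name with its full content (hypothetical directory):

val docs = sc.wholeTextFiles("hdfs:///tmp/docs", minPartitions = 4)
docs.mapValues(_.length).collect().foreach { case (file, chars) => println(s"$file: $chars chars") }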
@@ -746,6 +751,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * @note Small files are preferred; very large files may cause bad performance.
   */
  @Experimental
+  @RDDScope
  def binaryFiles(path: String, minPartitions: Int = defaultMinPartitions):
  RDD[(String, PortableDataStream)] = {
    assertNotStopped()
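Illustrative sketch of `binaryFiles` (hypothetical directory):

val images = sc.binaryFiles("hdfs:///tmp/images")
val sizes = images.mapValues(stream => stream.toArray().length)  // materializes each file's bytes
sizes.collect().foreach(println)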
@@ -774,6 +780,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * @return An RDD of data with values, represented as byte arrays
   */
  @Experimental
+  @RDDScope
  def binaryRecords(path: String, recordLength: Int, conf: Configuration = hadoopConfiguration)
      : RDD[Array[Byte]] = {
    assertNotStopped()
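Illustrative sketch of `binaryRecords` for fixed-length records (hypothetical path and record length):

val records = sc.binaryRecords("hdfs:///tmp/readings.bin", recordLength = 16)
println(records.count() + " records of 16 bytes each")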
@@ -811,6 +818,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
   * copy them using a `map` function.
   */
+  @RDDScope
  def hadoopRDD[K, V](
      conf: JobConf,
      inputFormatClass: Class[_ <: InputFormat[K, V]],
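Illustrative sketch of `hadoopRDD` with a pre-built `JobConf`, copying the reused Writables into plain Scala values before caching (hypothetical path):

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}

val jobConf = new JobConf(sc.hadoopConfiguration)
FileInputFormat.addInputPath(jobConf, new Path("hdfs:///tmp/logs"))
val raw = sc.hadoopRDD(jobConf, classOf[TextInputFormat], classOf[LongWritable], classOf[Text])
val cached = raw.map { case (offset, line) => (offset.get, line.toString) }.cache()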
@@ -832,6 +840,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
   * copy them using a `map` function.
   */
+  @RDDScope
  def hadoopFile[K, V](
      path: String,
      inputFormatClass: Class[_ <: InputFormat[K, V]],
@@ -850,7 +859,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
      inputFormatClass,
      keyClass,
      valueClass,
-      minPartitions).setName(path)
+      minPartitions).setName(s"HadoopRDD[$path]")
  }
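Illustrative sketch of the class-argument `hadoopFile` overload with the old `mapred` API (hypothetical path):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat

val pairs = sc.hadoopFile("hdfs:///tmp/logs", classOf[TextInputFormat],
  classOf[LongWritable], classOf[Text], minPartitions = 8)
val plainLines = pairs.map { case (offset, line) => line.toString }  // copy out of the reused Text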

  /**
@@ -867,6 +876,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
   * copy them using a `map` function.
   */
+  @RDDScope
  def hadoopFile[K, V, F <: InputFormat[K, V]]
      (path: String, minPartitions: Int)
      (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = {
@@ -891,11 +901,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
   * copy them using a `map` function.
   */
+  @RDDScope
  def hadoopFile[K, V, F <: InputFormat[K, V]](path: String)
      (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] =
    hadoopFile[K, V, F](path, defaultMinPartitions)
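Illustrative sketch of the type-parameter overloads, which derive the runtime classes from `K`, `V`, and `F` (same imports and path as the previous sketch):

val viaTypes = sc.hadoopFile[LongWritable, Text, TextInputFormat]("hdfs:///tmp/logs")
val viaTypesSplit = sc.hadoopFile[LongWritable, Text, TextInputFormat]("hdfs:///tmp/logs", 8)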

  /** Get an RDD for a Hadoop file with an arbitrary new API InputFormat. */
+  @RDDScope
  def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]]
      (path: String)
      (implicit km: ClassTag[K], vm: ClassTag[V], fm: ClassTag[F]): RDD[(K, V)] = {
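Illustrative sketch of `newAPIHadoopFile` with the new `mapreduce` API (hypothetical path):

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat}

val newApiLines = sc.newAPIHadoopFile[LongWritable, Text, NewTextInputFormat]("hdfs:///tmp/logs")
  .map { case (_, line) => line.toString }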
@@ -916,6 +928,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
   * copy them using a `map` function.
   */
+  @RDDScope
  def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]](
      path: String,
      fClass: Class[F],
@@ -949,6 +962,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
   * copy them using a `map` function.
   */
+  @RDDScope
  def newAPIHadoopRDD[K, V, F <: NewInputFormat[K, V]](
      conf: Configuration = hadoopConfiguration,
      fClass: Class[F],
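Illustrative sketch of `newAPIHadoopRDD` driven by an explicit Hadoop `Configuration`; the input-dir key is the standard new-API setting, the path is hypothetical:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => NewTextInputFormat}

val hadoopConf = new Configuration(sc.hadoopConfiguration)
hadoopConf.set("mapreduce.input.fileinputformat.inputdir", "hdfs:///tmp/logs")
val fromConf = sc.newAPIHadoopRDD(hadoopConf, classOf[NewTextInputFormat],
  classOf[LongWritable], classOf[Text])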
@@ -969,6 +983,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
   * copy them using a `map` function.
   */
+  @RDDScope
  def sequenceFile[K, V](path: String,
      keyClass: Class[K],
      valueClass: Class[V],
@@ -987,6 +1002,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
   * copy them using a `map` function.
   * */
+  @RDDScope
  def sequenceFile[K, V](path: String, keyClass: Class[K], valueClass: Class[V]): RDD[(K, V)] = {
    assertNotStopped()
    sequenceFile(path, keyClass, valueClass, defaultMinPartitions)
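Illustrative sketch of `sequenceFile`, converting the Writables to plain Scala values before any caching (hypothetical path):

import org.apache.hadoop.io.{IntWritable, Text}

val seq = sc.sequenceFile("hdfs:///tmp/pairs.seq", classOf[IntWritable], classOf[Text])
val plainPairs = seq.map { case (k, v) => (k.get, v.toString) }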
@@ -1014,6 +1030,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * If you plan to directly cache, sort, or aggregate Hadoop writable objects, you should first
   * copy them using a `map` function.
   */
+  @RDDScope
  def sequenceFile[K, V]
      (path: String, minPartitions: Int = defaultMinPartitions)
      (implicit km: ClassTag[K], vm: ClassTag[V],
@@ -1037,6 +1054,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
   * though the nice thing about it is that there's very little effort required to save arbitrary
   * objects.
   */
+  @RDDScope
  def objectFile[T: ClassTag](
      path: String,
      minPartitions: Int = defaultMinPartitions
@@ -1046,13 +1064,15 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
      .flatMap(x => Utils.deserialize[Array[T]](x._2.getBytes, Utils.getContextOrSparkClassLoader))
  }
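Illustrative sketch of a `saveAsObjectFile` / `objectFile` round trip (hypothetical element type and path):

case class Reading(sensor: String, value: Double)  // hypothetical element type

sc.parallelize(Seq(Reading("a", 1.0), Reading("b", 2.5))).saveAsObjectFile("hdfs:///tmp/readings.obj")
val restored = sc.objectFile[Reading]("hdfs:///tmp/readings.obj")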

+  @RDDScope
  protected[spark] def checkpointFile[T: ClassTag](
      path: String
    ): RDD[T] = {
    new CheckpointRDD[T](this, path)
  }

  /** Build the union of a list of RDDs. */
+  @RDDScope
  def union[T: ClassTag](rdds: Seq[RDD[T]]): RDD[T] = {
    val partitioners = rdds.flatMap(_.partitioner).toSet
    if (partitioners.size == 1) {
@@ -1063,6 +1083,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
  }

  /** Build the union of a list of RDDs passed as variable-length arguments. */
+  @RDDScope
  def union[T: ClassTag](first: RDD[T], rest: RDD[T]*): RDD[T] =
    union(Seq(first) ++ rest)
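Illustrative sketch of both `union` overloads (same `sc`):

val a = sc.parallelize(1 to 3)
val b = sc.parallelize(4 to 6)
val c = sc.parallelize(7 to 9)
val all = sc.union(Seq(a, b, c))   // Seq overload
val pair = sc.union(a, b)          // varargs overload
println(all.count())               // 9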