@@ -29,56 +29,69 @@ import org.apache.spark.sql.catalyst.plans.physical.{OrderedDistribution, Unspec
 import org.apache.spark.sql.catalyst.ScalaReflection

 case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryNode {
-  def output = projectList.map(_.toAttribute)
+  override def output = projectList.map(_.toAttribute)

-  def execute() = child.execute().mapPartitions { iter =>
+  override def execute() = child.execute().mapPartitions { iter =>
     @transient val reusableProjection = new MutableProjection(projectList)
     iter.map(reusableProjection)
   }
 }
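Project allocates one MutableProjection per partition and returns it for every row, rather than building a fresh output row per input row. A minimal standalone sketch of that reuse pattern, with an illustrative MutableTuple class standing in for Catalyst's mutable row machinery:

// Sketch only: MutableTuple is a stand-in, not a Catalyst class.
final class MutableTuple(var sum: Int, var product: Int)

def project(iter: Iterator[(Int, Int)]): Iterator[MutableTuple] = {
  val reused = new MutableTuple(0, 0)  // allocated once per partition
  iter.map { case (x, y) =>
    reused.sum = x + y                 // overwrite in place, no per-row allocation
    reused.product = x * y
    reused                             // every output element is the same object
  }
}

Downstream operators that buffer such rows must copy them first, which is why Sort and Limit below call _.copy().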

 case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode {
-  def output = child.output
+  override def output = child.output

-  def execute() = child.execute().mapPartitions { iter =>
+  override def execute() = child.execute().mapPartitions { iter =>
     iter.filter(condition.apply(_).asInstanceOf[Boolean])
   }
 }

 case class Sample(fraction: Double, withReplacement: Boolean, seed: Int, child: SparkPlan)
   extends UnaryNode {

-  def output = child.output
+  override def output = child.output

   // TODO: How to pick seed?
-  def execute() = child.execute().sample(withReplacement, fraction, seed)
+  override def execute() = child.execute().sample(withReplacement, fraction, seed)
 }

 case class Union(children: Seq[SparkPlan])(@transient sc: SparkContext) extends SparkPlan {
   // TODO: attributes output by union should be distinct for nullability purposes
-  def output = children.head.output
-  def execute() = sc.union(children.map(_.execute()))
+  override def output = children.head.output
+  override def execute() = sc.union(children.map(_.execute()))

   override def otherCopyArgs = sc :: Nil
 }
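sc.union concatenates the partitions of its inputs without a shuffle, so Union is a narrow, cheap operation. A quick sketch against the plain RDD API (assuming a local SparkContext named sc):

val a = sc.parallelize(1 to 4, 2)
val b = sc.parallelize(5 to 8, 2)
val u = sc.union(Seq(a, b))
println(u.partitions.length)  // 4: the parents' partitions, concatenated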

-case class StopAfter(limit: Int, child: SparkPlan)(@transient sc: SparkContext) extends UnaryNode {
+/**
+ * Take the first limit elements.
+ */
+case class Limit(limit: Int, child: SparkPlan)(@transient sc: SparkContext) extends UnaryNode {
   override def otherCopyArgs = sc :: Nil
+  // Note that the implementation is different depending on
+  // whether this is a terminal operator or not.

-  def output = child.output
+  override def output = child.output

   override def executeCollect() = child.execute().map(_.copy()).take(limit)

-  // TODO: Terminal split should be implemented differently from non-terminal split.
-  // TODO: Pick num splits based on |limit|.
-  def execute() = sc.makeRDD(executeCollect(), 1)
+  override def execute() = {
+    child.execute()
+      .mapPartitions(_.take(limit))
+      .coalesce(1, shuffle = true)
+      .mapPartitions(_.take(limit))
+  }
 }
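The interesting change is the non-terminal path: instead of collecting to the driver and re-parallelizing (sc.makeRDD(executeCollect(), 1)), the new execute() takes at most limit rows from each partition, shuffles only those survivors into a single partition, and truncates once more. A self-contained sketch of the same strategy on plain RDDs (the object name and local master are illustrative):

import org.apache.spark.{SparkConf, SparkContext}

object LimitSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[4]").setAppName("limit-sketch"))
    val limit = 3
    val rows = sc.parallelize(1 to 100, 8)
    val limited = rows
      .mapPartitions(_.take(limit))  // keep at most `limit` rows per partition
      .coalesce(1, shuffle = true)   // shuffle at most limit * 8 rows, not all 100
      .mapPartitions(_.take(limit))  // final truncation inside the single partition
    println(limited.count())         // 3
    sc.stop()
  }
}

The shuffle = true matters: with the default shuffle = false, coalesce(1) would create a narrow dependency and compute all parent partitions in a single task, losing the parallelism of the per-partition take.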

-case class TopK(limit: Int, sortOrder: Seq[SortOrder], child: SparkPlan)
-              (@transient sc: SparkContext) extends UnaryNode {
+/**
+ * Take the first limit elements as defined by the sortOrder. This is logically equivalent to
+ * having a [[Limit]] operator after a [[Sort]] operator. This could have been named TopK, but
+ * Spark's top operator does the opposite in ordering so we name it TakeOrdered to avoid confusion.
+ */
+case class TakeOrdered(limit: Int, sortOrder: Seq[SortOrder], child: SparkPlan)
+                      (@transient sc: SparkContext) extends UnaryNode {
   override def otherCopyArgs = sc :: Nil

-  def output = child.output
+  override def output = child.output

   @transient
   lazy val ordering = new RowOrdering(sortOrder)
@@ -87,7 +100,7 @@ case class TopK(limit: Int, sortOrder: Seq[SortOrder], child: SparkPlan)

   // TODO: Terminal split should be implemented differently from non-terminal split.
   // TODO: Pick num splits based on |limit|.
-  def execute() = sc.makeRDD(executeCollect(), 1)
+  override def execute() = sc.makeRDD(executeCollect(), 1)
 }
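The new scaladoc's naming point is easy to check against the RDD API: top(k) returns the k largest elements under an ordering, while takeOrdered(k) returns the k smallest, i.e. the first k rows of a full sort, which is what this operator computes. A small sketch (assuming a local SparkContext named sc):

val xs = sc.parallelize(Seq(5, 1, 4, 2, 3))
println(xs.top(2).mkString(", "))          // 5, 4: the largest elements, descending
println(xs.takeOrdered(2).mkString(", "))  // 1, 2: the smallest, matching Limit-after-Sort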


@@ -102,15 +115,15 @@ case class Sort(
   @transient
   lazy val ordering = new RowOrdering(sortOrder)

-  def execute() = attachTree(this, "sort") {
+  override def execute() = attachTree(this, "sort") {
     // TODO: Optimize sorting operation?
     child.execute()
       .mapPartitions(
         iterator => iterator.map(_.copy()).toArray.sorted(ordering).iterator,
         preservesPartitioning = true)
   }

-  def output = child.output
+  override def output = child.output
 }
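The _.copy() before toArray is load-bearing: upstream operators such as Project may hand back the same mutable row object for every element, so buffering the iterator without copying would leave an array of aliases to a single row. A standalone sketch of the failure and the fix (Cell is illustrative, not a Catalyst type):

// A mutable "row" that an upstream iterator reuses for every element.
final case class Cell(var value: Int)

def reusingIterator(values: Seq[Int]): Iterator[Cell] = {
  val reused = Cell(0)
  values.iterator.map { v => reused.value = v; reused }
}

val aliased = reusingIterator(Seq(3, 1, 2)).toArray
println(aliased.map(_.value).mkString(", "))  // 2, 2, 2: every slot is the same object

val copied = reusingIterator(Seq(3, 1, 2)).map(c => Cell(c.value)).toArray
println(copied.sortBy(_.value).map(_.value).mkString(", "))  // 1, 2, 3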

 object ExistingRdd {
@@ -131,6 +144,6 @@ object ExistingRdd {
 }

 case class ExistingRdd(output: Seq[Attribute], rdd: RDD[Row]) extends LeafNode {
-  def execute() = rdd
+  override def execute() = rdd
 }
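The change that recurs in every operator above is adding override to the concrete implementations of SparkPlan's abstract output and execute members. The keyword is optional in that position, but with it the compiler rejects the method if the parent member is ever renamed or its signature changes, instead of silently leaving a dead method alongside a newly unimplemented abstract one. A minimal illustration (Node and Good are hypothetical):

abstract class Node {
  def output: Seq[String]
}

class Good extends Node {
  // Compiles today; fails to compile if Node#output is renamed or retyped,
  // which is exactly the drift the added `override` modifiers guard against.
  override def output = Seq("a")
}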