
Commit c9497a3

shivaram authored and Davies Liu committed
Merge pull request #208 from lythesia/master
[SPARKR-188] Add profiling of R execution on worker side

Conflicts:
    pkg/inst/worker/worker.R
1 parent b317aa7 commit c9497a3

File tree (4 files changed, +132 -79 lines):

  R/pkg/R/serialize.R
  R/pkg/inst/worker/worker.R
  core/src/main/scala/org/apache/spark/api/r/RRDD.scala
  core/src/main/scala/org/apache/spark/api/r/SerDe.scala

R/pkg/R/serialize.R

Lines changed: 3 additions & 6 deletions
@@ -69,8 +69,9 @@ writeJobj <- function(con, value) {
 }
 
 writeString <- function(con, value) {
-  writeInt(con, as.integer(nchar(value) + 1))
-  writeBin(value, con, endian = "big")
+  utfVal <- enc2utf8(value)
+  writeInt(con, as.integer(nchar(utfVal, type = "bytes") + 1))
+  writeBin(utfVal, con, endian = "big")
 }
 
 writeInt <- function(con, value) {
@@ -189,7 +190,3 @@ writeArgs <- function(con, args) {
     }
   }
 }
-
-writeStrings <- function(con, stringList) {
-  writeLines(unlist(stringList), con)
-}
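
Note on the new framing: writeString now emits a 4-byte big-endian length equal to the UTF-8 byte count plus one, followed by the bytes; the extra byte is the terminating NUL that R's writeBin() appends to a character value, which readStringBytes() in SerDe.scala (further down) asserts on and strips. A minimal, self-contained Scala sketch of the same frame, for illustration only (encodeLikeWriteString is not part of this commit):

import java.io.{ByteArrayOutputStream, DataOutputStream}

// Illustrative sketch only: reproduce the frame the updated writeString emits,
// i.e. a 4-byte big-endian length (UTF-8 byte count + 1) followed by the UTF-8
// bytes and the trailing NUL that R's writeBin() appends to a character value.
def encodeLikeWriteString(value: String): Array[Byte] = {
  val utf8 = value.getBytes("UTF-8")
  val bos = new ByteArrayOutputStream()
  val out = new DataOutputStream(bos)
  out.writeInt(utf8.length + 1)  // nchar(utfVal, type = "bytes") + 1 on the R side
  out.write(utf8)
  out.write(0)                   // terminating NUL; readStringBytes asserts on it and drops it
  out.flush()
  bos.toByteArray
}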

R/pkg/inst/worker/worker.R

Lines changed: 53 additions & 4 deletions
@@ -17,6 +17,23 @@
 
 # Worker class
 
+# Get current system time
+currentTimeSecs <- function() {
+  as.numeric(Sys.time())
+}
+
+# Get elapsed time
+elapsedSecs <- function() {
+  proc.time()[3]
+}
+
+# Constants
+specialLengths <- list(END_OF_STERAM = 0L, TIMING_DATA = -1L)
+
+# Timing R process boot
+bootTime <- currentTimeSecs()
+bootElap <- elapsedSecs()
+
 rLibDir <- Sys.getenv("SPARKR_RLIBDIR")
 # Set libPaths to include SparkR package as loadNamespace needs this
 # TODO: Figure out if we can avoid this by not loading any objects that require
@@ -46,6 +63,9 @@ computeFunc <- unserialize(SparkR:::readRawLen(inputCon, funcLen))
 env <- environment(computeFunc)
 parent.env(env) <- .GlobalEnv  # Attach under global environment.
 
+# Timing init envs for computing
+initElap <- elapsedSecs()
+
 # Read and set broadcast variables
 numBroadcastVars <- SparkR:::readInt(inputCon)
 if (numBroadcastVars > 0) {
@@ -56,6 +76,9 @@ if (numBroadcastVars > 0) {
   }
 }
 
+# Timing broadcast
+broadcastElap <- elapsedSecs()
+
 # If -1: read as normal RDD; if >= 0, treat as pairwise RDD and treat the int
 # as number of partitions to create.
 numPartitions <- SparkR:::readInt(inputCon)
@@ -73,14 +96,23 @@ if (isEmpty != 0) {
     } else if (deserializer == "row") {
       data <- SparkR:::readDeserializeRows(inputCon)
     }
+    # Timing reading input data for execution
+    inputElap <- elapsedSecs()
+
     output <- computeFunc(partition, data)
+    # Timing computing
+    computeElap <- elapsedSecs()
+
     if (serializer == "byte") {
       SparkR:::writeRawSerialize(outputCon, output)
     } else if (serializer == "row") {
       SparkR:::writeRowSerialize(outputCon, output)
     } else {
-      SparkR:::writeStrings(outputCon, output)
+      # write lines one-by-one with flag
+      lapply(output, function(line) SparkR:::writeString(outputCon, line))
    }
+    # Timing output
+    outputElap <- elapsedSecs()
  } else {
    if (deserializer == "byte") {
      # Now read as many characters as described in funcLen
@@ -90,6 +122,8 @@ if (isEmpty != 0) {
    } else if (deserializer == "row") {
      data <- SparkR:::readDeserializeRows(inputCon)
    }
+    # Timing reading input data for execution
+    inputElap <- elapsedSecs()
 
    res <- new.env()
 
@@ -107,6 +141,8 @@ if (isEmpty != 0) {
      res[[bucket]] <- acc
    }
    invisible(lapply(data, hashTupleToEnvir))
+    # Timing computing
+    computeElap <- elapsedSecs()
 
    # Step 2: write out all of the environment as key-value pairs.
    for (name in ls(res)) {
@@ -116,13 +152,26 @@ if (isEmpty != 0) {
      length(res[[name]]$data) <- res[[name]]$counter
      SparkR:::writeRawSerialize(outputCon, res[[name]]$data)
    }
+    # Timing output
+    outputElap <- elapsedSecs()
  }
+} else {
+  inputElap <- broadcastElap
+  computeElap <- broadcastElap
+  outputElap <- broadcastElap
 }
 
+# Report timing
+SparkR:::writeInt(outputCon, specialLengths$TIMING_DATA)
+SparkR:::writeDouble(outputCon, bootTime)
+SparkR:::writeDouble(outputCon, initElap - bootElap)        # init
+SparkR:::writeDouble(outputCon, broadcastElap - initElap)   # broadcast
+SparkR:::writeDouble(outputCon, inputElap - broadcastElap)  # input
+SparkR:::writeDouble(outputCon, computeElap - inputElap)    # compute
+SparkR:::writeDouble(outputCon, outputElap - computeElap)   # output
+
 # End of output
-if (serializer %in% c("byte", "row")) {
-  SparkR:::writeInt(outputCon, 0L)
-}
+SparkR:::writeInt(outputCon, specialLengths$END_OF_STERAM)
 
 close(outputCon)
 close(inputCon)
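
The timing trailer added here has a fixed layout: an Int flag of -1 (TIMING_DATA), the worker boot time as a Double, then five Doubles for the init, broadcast, read-input, compute and write-output phases, followed by the end-of-stream flag (0). As a rough sketch, a stand-alone JVM-side reader for just that trailer might look like the following (WorkerTimings and readTimingTrailer are hypothetical names used only for illustration; the actual consumer is BaseRRDD.read() in RRDD.scala below):

import java.io.DataInputStream

// Hypothetical helper, for illustration only: parse the timing trailer emitted
// by worker.R. The commit itself consumes these fields inline in BaseRRDD.read().
case class WorkerTimings(boot: Double, init: Double, broadcast: Double,
                         input: Double, compute: Double, output: Double)

def readTimingTrailer(in: DataInputStream, jvmStartSecs: Double): WorkerTimings = {
  require(in.readInt() == -1, "expected TIMING_DATA flag")
  val boot = in.readDouble() - jvmStartSecs  // worker boot timestamp relative to JVM-side start
  WorkerTimings(boot, in.readDouble(), in.readDouble(),
    in.readDouble(), in.readDouble(), in.readDouble())
}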

core/src/main/scala/org/apache/spark/api/r/RRDD.scala

Lines changed: 67 additions & 64 deletions
@@ -42,10 +42,15 @@ private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
     rLibDir: String,
     broadcastVars: Array[Broadcast[Object]])
   extends RDD[U](parent) with Logging {
+  protected var dataStream: DataInputStream = _
+  private var bootTime: Double = _
   override def getPartitions: Array[Partition] = parent.partitions
 
   override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
 
+    // Timing start
+    bootTime = System.currentTimeMillis / 1000.0
+
     // The parent may be also an RRDD, so we should launch it first.
     val parentIterator = firstParent[T].iterator(partition, context)
 
@@ -69,7 +74,7 @@ private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
     // the socket used to receive the output of task
     val outSocket = serverSocket.accept()
     val inputStream = new BufferedInputStream(outSocket.getInputStream)
-    val dataStream = openDataStream(inputStream)
+    dataStream = new DataInputStream(inputStream)
     serverSocket.close()
 
     try {
@@ -155,6 +160,7 @@ private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
         } else if (deserializer == SerializationFormats.ROW) {
           dataOut.write(elem.asInstanceOf[Array[Byte]])
         } else if (deserializer == SerializationFormats.STRING) {
+          // write string(for StringRRDD)
          printOut.println(elem)
        }
      }
@@ -180,9 +186,41 @@ private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    }.start()
  }
 
-  protected def openDataStream(input: InputStream): Closeable
+  protected def readData(length: Int): U
 
-  protected def read(): U
+  protected def read(): U = {
+    try {
+      val length = dataStream.readInt()
+
+      length match {
+        case SpecialLengths.TIMING_DATA =>
+          // Timing data from R worker
+          val boot = dataStream.readDouble - bootTime
+          val init = dataStream.readDouble
+          val broadcast = dataStream.readDouble
+          val input = dataStream.readDouble
+          val compute = dataStream.readDouble
+          val output = dataStream.readDouble
+          logInfo(
+            ("Times: boot = %.3f s, init = %.3f s, broadcast = %.3f s, " +
+              "read-input = %.3f s, compute = %.3f s, write-output = %.3f s, " +
+              "total = %.3f s").format(
+                boot,
+                init,
+                broadcast,
+                input,
+                compute,
+                output,
+                boot + init + broadcast + input + compute + output))
+          read()
+        case length if length >= 0 =>
+          readData(length)
+      }
+    } catch {
+      case eof: EOFException =>
+        throw new SparkException("R worker exited unexpectedly (cranshed)", eof)
+    }
+  }
 }
 
 /**
@@ -202,31 +240,16 @@ private class PairwiseRRDD[T: ClassTag](
     SerializationFormats.BYTE, packageNames, rLibDir,
     broadcastVars.map(x => x.asInstanceOf[Broadcast[Object]])) {
 
-  private var dataStream: DataInputStream = _
-
-  override protected def openDataStream(input: InputStream): Closeable = {
-    dataStream = new DataInputStream(input)
-    dataStream
-  }
-
-  override protected def read(): (Int, Array[Byte]) = {
-    try {
-      val length = dataStream.readInt()
-
-      length match {
-        case length if length == 2 =>
-          val hashedKey = dataStream.readInt()
-          val contentPairsLength = dataStream.readInt()
-          val contentPairs = new Array[Byte](contentPairsLength)
-          dataStream.readFully(contentPairs)
-          (hashedKey, contentPairs)
-        case _ => null // End of input
-      }
-    } catch {
-      case eof: EOFException => {
-        throw new SparkException("R worker exited unexpectedly (crashed)", eof)
-      }
-    }
+  override protected def readData(length: Int): (Int, Array[Byte]) = {
+    length match {
+      case length if length == 2 =>
+        val hashedKey = dataStream.readInt()
+        val contentPairsLength = dataStream.readInt()
+        val contentPairs = new Array[Byte](contentPairsLength)
+        dataStream.readFully(contentPairs)
+        (hashedKey, contentPairs)
+      case _ => null
+    }
  }
 
  lazy val asJavaPairRDD : JavaPairRDD[Int, Array[Byte]] = JavaPairRDD.fromRDD(this)
@@ -247,28 +270,13 @@ private class RRDD[T: ClassTag](
     parent, -1, func, deserializer, serializer, packageNames, rLibDir,
     broadcastVars.map(x => x.asInstanceOf[Broadcast[Object]])) {
 
-  private var dataStream: DataInputStream = _
-
-  override protected def openDataStream(input: InputStream): Closeable = {
-    dataStream = new DataInputStream(input)
-    dataStream
-  }
-
-  override protected def read(): Array[Byte] = {
-    try {
-      val length = dataStream.readInt()
-
-      length match {
-        case length if length > 0 =>
-          val obj = new Array[Byte](length)
-          dataStream.readFully(obj, 0, length)
-          obj
-        case _ => null
-      }
-    } catch {
-      case eof: EOFException => {
-        throw new SparkException("R worker exited unexpectedly (crashed)", eof)
-      }
+  override protected def readData(length: Int): Array[Byte] = {
+    length match {
+      case length if length > 0 =>
+        val obj = new Array[Byte](length)
+        dataStream.readFully(obj)
+        obj
+      case _ => null
    }
  }
 
@@ -289,26 +297,21 @@ private class StringRRDD[T: ClassTag](
     parent, -1, func, deserializer, SerializationFormats.STRING, packageNames, rLibDir,
     broadcastVars.map(x => x.asInstanceOf[Broadcast[Object]])) {
 
-  private var dataStream: BufferedReader = _
-
-  override protected def openDataStream(input: InputStream): Closeable = {
-    dataStream = new BufferedReader(new InputStreamReader(input))
-    dataStream
-  }
-
-  override protected def read(): String = {
-    try {
-      dataStream.readLine()
-    } catch {
-      case e: IOException => {
-        throw new SparkException("R worker exited unexpectedly (crashed)", e)
-      }
+  override protected def readData(length: Int): String = {
+    length match {
+      case length if length > 0 =>
+        SerDe.readStringBytes(dataStream, length)
+      case _ => null
    }
  }
 
  lazy val asJavaRDD : JavaRDD[String] = JavaRDD.fromRDD(this)
 }
 
+private object SpecialLengths {
+  val TIMING_DATA = -1
+}
+
 private[r] class BufferedStreamThread(
     in: InputStream,
     name: String,
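
Taken together, the RRDD.scala change is a template-method refactor: the per-subclass openDataStream()/read() pairs collapse into a single BaseRRDD.read() that owns the length framing, intercepts the TIMING_DATA record, and converts EOF into a SparkException, while PairwiseRRDD, RRDD and StringRRDD each supply only readData(length). A simplified stand-alone sketch of that shape (not the actual BaseRRDD code):

import java.io.{DataInputStream, EOFException}

// Simplified sketch of the refactored reading path: the base class handles
// framing and error reporting; subclasses only decode a payload of known length.
abstract class FramedReader[T](dataStream: DataInputStream) {
  protected def readData(length: Int): T  // one decoder per output format

  def read(): T =
    try {
      dataStream.readInt() match {
        case -1     => skipTimingRecord(); read()  // TIMING_DATA, then keep reading
        case length => readData(length)
      }
    } catch {
      case eof: EOFException =>
        throw new RuntimeException("R worker exited unexpectedly", eof)
    }

  // The timing record body is one boot timestamp plus five phase durations.
  private def skipTimingRecord(): Unit = (1 to 6).foreach(_ => dataStream.readDouble())
}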

core/src/main/scala/org/apache/spark/api/r/SerDe.scala

Lines changed: 9 additions & 5 deletions
@@ -85,13 +85,17 @@ private[spark] object SerDe {
     in.readDouble()
   }
 
+  def readStringBytes(in: DataInputStream, len: Int): String = {
+    val bytes = new Array[Byte](len)
+    in.readFully(bytes)
+    assert(bytes(len - 1) == 0)
+    val str = new String(bytes.dropRight(1), "UTF-8")
+    str
+  }
+
   def readString(in: DataInputStream): String = {
     val len = in.readInt()
-    val asciiBytes = new Array[Byte](len)
-    in.readFully(asciiBytes)
-    assert(asciiBytes(len - 1) == 0)
-    val str = new String(asciiBytes.dropRight(1).map(_.toChar))
-    str
+    readStringBytes(in, len)
   }
 
   def readBoolean(in: DataInputStream): Boolean = {
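
This SerDe change pairs with the writeString change above: the old readString built the result by casting each byte to a Char, which mangles multi-byte UTF-8 data coming from R, while the new readStringBytes decodes the buffer as UTF-8. A quick illustrative comparison (not part of the commit):

// Illustration only: why decoding bytes as UTF-8 matters for non-ASCII strings.
val utf8Bytes = "héllo".getBytes("UTF-8")           // 6 bytes for 5 characters

val oldStyle = new String(utf8Bytes.map(_.toChar))  // byte-per-char, like the old readString: garbled
val newStyle = new String(utf8Bytes, "UTF-8")       // like readStringBytes: "héllo"

println(oldStyle == "héllo")  // false
println(newStyle == "héllo")  // true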
