
Commit 1bb78b1

Author: Kostas Sakellis (committed)
[SPARK-4874] [CORE] Collect record count metrics
Collects record counts for both input/output and shuffle metrics. For the input/output metrics, the counter is simply incremented every time the iterators are accessed. For shuffle, on the write side we count records post-aggregation (after a map-side combine), and on the read side we count them pre-aggregation. This allows the bytes read/written metrics and the records read/written metrics to line up. For backwards compatibility, if we deserialize an older event that doesn't have record metrics, we set the metric to -1.
1 parent 6d3b7cb commit 1bb78b1

19 files changed (+435, -67 lines)
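At its core, the change wraps each data-producing iterator so a counter is bumped every time next() hands back a record; the diffs below do this with AfterNextInterceptingIterator / InterceptingIterator on the read paths and a plain local counter on the write paths. A minimal, self-contained sketch of the idea (CountingIterator and onRecord are illustrative names, not part of this patch):

// Sketch only: wrap any iterator and invoke a callback for each record returned.
// The patch itself uses AfterNextInterceptingIterator / InterceptingIterator
// from org.apache.spark.util for the same purpose.
class CountingIterator[T](underlying: Iterator[T], onRecord: () => Unit) extends Iterator[T] {
  override def hasNext: Boolean = underlying.hasNext
  override def next(): T = {
    val record = underlying.next()
    onRecord() // e.g. existingMetrics.addRecordsRead(1)
    record
  }
}

// Example: count records flowing out of an iterator.
// var recordsRead = 0L
// val counted = new CountingIterator(Iterator(1, 2, 3), () => recordsRead += 1)
// counted.foreach(_ => ())  // recordsRead is now 3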

core/src/main/scala/org/apache/spark/CacheManager.scala

Lines changed: 7 additions & 1 deletion
@@ -17,6 +17,8 @@
 
 package org.apache.spark
 
+import org.apache.spark.util.AfterNextInterceptingIterator
+
 import scala.collection.mutable
 import scala.collection.mutable.ArrayBuffer
 
@@ -49,7 +51,11 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
           .getInputMetricsForReadMethod(inputMetrics.readMethod)
         existingMetrics.addBytesRead(inputMetrics.bytesRead)
 
-        new InterruptibleIterator(context, blockResult.data.asInstanceOf[Iterator[T]])
+        val iter = blockResult.data.asInstanceOf[Iterator[T]]
+        new InterruptibleIterator(context, AfterNextInterceptingIterator(iter, (next: T) => {
+          existingMetrics.addRecordsRead(1)
+          next
+        }))
 
       case None =>
         // Acquire a lock for loading this partition

core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala

Lines changed: 25 additions & 1 deletion
@@ -201,6 +201,7 @@ class TaskMetrics extends Serializable {
         merged.incLocalBlocksFetched(depMetrics.localBlocksFetched)
         merged.incRemoteBlocksFetched(depMetrics.remoteBlocksFetched)
         merged.incRemoteBytesRead(depMetrics.remoteBytesRead)
+        merged.recordsRead += depMetrics.recordsRead
       }
       _shuffleReadMetrics = Some(merged)
     }
@@ -243,11 +244,17 @@ object DataWriteMethod extends Enumeration with Serializable {
 case class InputMetrics(readMethod: DataReadMethod.Value) {
 
   private val _bytesRead: AtomicLong = new AtomicLong()
+  private val _recordsRead: AtomicLong = new AtomicLong()
 
   /**
    * Total bytes read.
    */
   def bytesRead: Long = _bytesRead.get()
+
+  /**
+   * Total records read.
+   */
+  def recordsRead: Long = _recordsRead.get()
   @volatile @transient var bytesReadCallback: Option[() => Long] = None
 
   /**
@@ -257,6 +264,10 @@ case class InputMetrics(readMethod: DataReadMethod.Value) {
     _bytesRead.addAndGet(bytes)
   }
 
+  def addRecordsRead(records: Long) = {
+    _recordsRead.addAndGet(records)
+  }
+
   /**
    * Invoke the bytesReadCallback and mutate bytesRead.
   */
@@ -287,6 +298,11 @@ case class OutputMetrics(writeMethod: DataWriteMethod.Value) {
   private var _bytesWritten: Long = _
   def bytesWritten = _bytesWritten
   private[spark] def setBytesWritten(value : Long) = _bytesWritten = value
+
+  /**
+   * Total records written
+   */
+  var recordsWritten: Long = 0L
 }
 
 /**
@@ -334,6 +350,11 @@ class ShuffleReadMetrics extends Serializable {
   * Number of blocks fetched in this shuffle by this task (remote or local)
   */
  def totalBlocksFetched = _remoteBlocksFetched + _localBlocksFetched
+
+  /**
+   * Total number of records read from the shuffle by this task
+   */
+  var recordsRead: Long = _
 }
 
 /**
@@ -358,5 +379,8 @@ class ShuffleWriteMetrics extends Serializable {
   private[spark] def incShuffleWriteTime(value: Long) = _shuffleWriteTime += value
   private[spark] def decShuffleWriteTime(value: Long) = _shuffleWriteTime -= value
 
-
+  /**
+   * Total number of records written from the shuffle by this task
+   */
+  var recordsWritten: Long = _
 }
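Once TaskMetrics carries the new counters, they can be observed through the existing listener API in much the same way the UI changes below consume them per executor. A rough sketch, assuming the Spark 1.x listener interfaces (RecordCountListener is an illustrative name):

import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

// Sketch: sum the new record counters as tasks finish.
class RecordCountListener extends SparkListener {
  var inputRecords = 0L
  var outputRecords = 0L

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    val metrics = taskEnd.taskMetrics
    if (metrics != null) {
      metrics.inputMetrics.foreach(m => inputRecords += m.recordsRead)
      metrics.outputMetrics.foreach(m => outputRecords += m.recordsWritten)
    }
  }
}

// Registered before running a job, e.g. sc.addSparkListener(new RecordCountListener)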

core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala

Lines changed: 4 additions & 2 deletions
@@ -44,7 +44,7 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.executor.DataReadMethod
 import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD
-import org.apache.spark.util.{NextIterator, Utils}
+import org.apache.spark.util.{AfterNextInterceptingIterator, NextIterator, Utils}
 import org.apache.spark.scheduler.{HostTaskLocation, HDFSCacheTaskLocation}
 import org.apache.spark.storage.StorageLevel
 
@@ -247,7 +247,9 @@ class HadoopRDD[K, V](
          case eof: EOFException =>
            finished = true
        }
-
+       if (!finished) {
+         inputMetrics.addRecordsRead(1)
+       }
       (key, value)
     }
 

core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala

Lines changed: 3 additions & 1 deletion
@@ -151,7 +151,9 @@ class NewHadoopRDD[K, V](
           throw new java.util.NoSuchElementException("End of stream")
         }
         havePair = false
-
+        if (!finished) {
+          inputMetrics.addRecordsRead(1)
+        }
         (reader.getCurrentKey, reader.getCurrentValue)
       }
 

core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala

Lines changed: 8 additions & 8 deletions
@@ -34,7 +34,7 @@ import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.io.compress.CompressionCodec
 import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf, OutputFormat}
 import org.apache.hadoop.mapreduce.{Job => NewAPIHadoopJob, OutputFormat => NewOutputFormat,
-RecordWriter => NewRecordWriter}
+  RecordWriter => NewRecordWriter}
 
 import org.apache.spark._
 import org.apache.spark.Partitioner.defaultPartitioner
@@ -993,8 +993,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       val (outputMetrics, bytesWrittenCallback) = initHadoopOutputMetrics(context)
 
       val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K,V]]
+      var recordsWritten = 0L
       try {
-        var recordsWritten = 0L
         while (iter.hasNext) {
           val pair = iter.next()
           writer.write(pair._1, pair._2)
@@ -1008,6 +1008,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       }
       committer.commitTask(hadoopContext)
       bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) }
+      outputMetrics.recordsWritten = recordsWritten
       1
     } : Int
 
@@ -1065,8 +1066,8 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
 
       writer.setup(context.stageId, context.partitionId, taskAttemptId)
       writer.open()
+      var recordsWritten = 0L
       try {
-        var recordsWritten = 0L
         while (iter.hasNext) {
           val record = iter.next()
           writer.write(record._1.asInstanceOf[AnyRef], record._2.asInstanceOf[AnyRef])
@@ -1080,6 +1081,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       }
       writer.commit()
       bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) }
+      outputMetrics.recordsWritten = recordsWritten
     }
 
     self.context.runJob(self, writeToFile)
@@ -1089,17 +1091,15 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
   private def initHadoopOutputMetrics(context: TaskContext): (OutputMetrics, Option[() => Long]) = {
     val bytesWrittenCallback = SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback()
     val outputMetrics = new OutputMetrics(DataWriteMethod.Hadoop)
-    if (bytesWrittenCallback.isDefined) {
-      context.taskMetrics.outputMetrics = Some(outputMetrics)
-    }
+    context.taskMetrics.outputMetrics = Some(outputMetrics)
     (outputMetrics, bytesWrittenCallback)
   }
 
   private def maybeUpdateOutputMetrics(bytesWrittenCallback: Option[() => Long],
       outputMetrics: OutputMetrics, recordsWritten: Long): Unit = {
-    if (recordsWritten % PairRDDFunctions.RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES == 0
-        && bytesWrittenCallback.isDefined) {
+    if (recordsWritten % PairRDDFunctions.RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES == 0) {
       bytesWrittenCallback.foreach { fn => outputMetrics.setBytesWritten(fn()) }
+      outputMetrics.recordsWritten = recordsWritten
     }
   }
 

core/src/main/scala/org/apache/spark/shuffle/hash/BlockStoreShuffleFetcher.scala

Lines changed: 10 additions & 2 deletions
@@ -25,7 +25,7 @@ import org.apache.spark._
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.shuffle.FetchFailedException
 import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId}
-import org.apache.spark.util.CompletionIterator
+import org.apache.spark.util.{InterceptingIterator, CompletionIterator}
 
 private[hash] object BlockStoreShuffleFetcher extends Logging {
   def fetch[T](
@@ -82,7 +82,15 @@ private[hash] object BlockStoreShuffleFetcher extends Logging {
       SparkEnv.get.conf.getLong("spark.reducer.maxMbInFlight", 48) * 1024 * 1024)
     val itr = blockFetcherItr.flatMap(unpackBlock)
 
-    val completionIter = CompletionIterator[T, Iterator[T]](itr, {
+    val itr2 = new InterceptingIterator[T](itr) {
+      val readMetrics = context.taskMetrics().createShuffleReadMetricsForDependency()
+      override def afterNext(next: T) : T = {
+        readMetrics.recordsRead += 1
+        next
+      }
+    }
+
+    val completionIter = CompletionIterator[T, Iterator[T]](itr2, {
       context.taskMetrics.updateShuffleReadMetrics()
    })
 

core/src/main/scala/org/apache/spark/storage/BlockObjectWriter.scala

Lines changed: 5 additions & 5 deletions
@@ -117,7 +117,7 @@ private[spark] class DiskBlockObjectWriter(
 
   /** Calling channel.position() to update the write metrics can be a little bit expensive, so we
    * only call it every N writes */
-  private var writesSinceMetricsUpdate = 0
+  private var numRecordsWritten = 0
 
   override def open(): BlockObjectWriter = {
     fos = new FileOutputStream(file, true)
@@ -168,6 +168,7 @@ private[spark] class DiskBlockObjectWriter(
   override def revertPartialWritesAndClose() {
     try {
       writeMetrics.decShuffleBytesWritten(reportedPosition - initialPosition)
+      writeMetrics.recordsWritten -= numRecordsWritten
 
       if (initialized) {
         objOut.flush()
@@ -193,12 +194,11 @@ private[spark] class DiskBlockObjectWriter(
     }
 
     objOut.writeObject(value)
+    numRecordsWritten += 1
+    writeMetrics.recordsWritten += 1
 
-    if (writesSinceMetricsUpdate == 32) {
-      writesSinceMetricsUpdate = 0
+    if (numRecordsWritten % 32 == 0) {
       updateBytesWritten()
-    } else {
-      writesSinceMetricsUpdate += 1
     }
   }
 
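The writer now keeps a single numRecordsWritten counter: every record increments the metric immediately, while the relatively expensive byte-position check still runs only once every 32 writes. A small sketch of that throttled-update pattern in isolation (class and parameter names are illustrative; the 32-record interval matches the patch):

// Sketch: count every record, refresh the expensive byte metric only every N records.
class ThrottledWriteMetrics(updateInterval: Int = 32) {
  var recordsWritten = 0L
  var bytesWritten = 0L

  def recordWritten(currentPosition: => Long): Unit = {
    recordsWritten += 1
    if (recordsWritten % updateInterval == 0) {
      bytesWritten = currentPosition // e.g. channel.position() in the real writer
    }
  }
}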

core/src/main/scala/org/apache/spark/ui/ToolTips.scala

Lines changed: 2 additions & 0 deletions
@@ -30,8 +30,10 @@ private[spark] object ToolTips {
     "Time that the task spent blocked waiting for shuffle data to be read from remote machines."
 
   val INPUT = "Bytes read from Hadoop or from Spark storage."
+  val INPUT_RECORDS = "Number of records read from Hadoop or from Spark storage."
 
   val OUTPUT = "Bytes written to Hadoop."
+  val OUTPUT_RECORDS = "Number of records written to Hadoop."
 
   val SHUFFLE_WRITE = "Bytes written to disk in order to be read by a shuffle in a future stage."
 

core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala

Lines changed: 6 additions & 0 deletions
@@ -48,7 +48,9 @@ class ExecutorsListener(storageStatusListener: StorageStatusListener) extends Sp
   val executorToTasksFailed = HashMap[String, Int]()
   val executorToDuration = HashMap[String, Long]()
   val executorToInputBytes = HashMap[String, Long]()
+  val executorToInputRecords = HashMap[String, Long]()
   val executorToOutputBytes = HashMap[String, Long]()
+  val executorToOutputRecords = HashMap[String, Long]()
   val executorToShuffleRead = HashMap[String, Long]()
   val executorToShuffleWrite = HashMap[String, Long]()
 
@@ -78,10 +80,14 @@ class ExecutorsListener(storageStatusListener: StorageStatusListener) extends Sp
       metrics.inputMetrics.foreach { inputMetrics =>
         executorToInputBytes(eid) =
           executorToInputBytes.getOrElse(eid, 0L) + inputMetrics.bytesRead
+        executorToInputRecords(eid) =
+          executorToInputRecords.getOrElse(eid, 0L) + inputMetrics.recordsRead
       }
       metrics.outputMetrics.foreach { outputMetrics =>
         executorToOutputBytes(eid) =
           executorToOutputBytes.getOrElse(eid, 0L) + outputMetrics.bytesWritten
+        executorToOutputRecords(eid) =
+          executorToOutputRecords.getOrElse(eid, 0L) + outputMetrics.recordsWritten
       }
       metrics.shuffleReadMetrics.foreach { shuffleRead =>
         executorToShuffleRead(eid) =

core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala

Lines changed: 6 additions & 0 deletions
@@ -45,7 +45,9 @@ private[ui] class ExecutorTable(stageId: Int, stageAttemptId: Int, parent: Stage
         <th>Failed Tasks</th>
         <th>Succeeded Tasks</th>
         <th><span data-toggle="tooltip" title={ToolTips.INPUT}>Input</span></th>
+        <th><span data-toggle="tooltip" title={ToolTips.INPUT_RECORDS}>Input Records</span></th>
         <th><span data-toggle="tooltip" title={ToolTips.OUTPUT}>Output</span></th>
+        <th><span data-toggle="tooltip" title={ToolTips.OUTPUT_RECORDS}>Output Records</span></th>
         <th><span data-toggle="tooltip" title={ToolTips.SHUFFLE_READ}>Shuffle Read</span></th>
         <th><span data-toggle="tooltip" title={ToolTips.SHUFFLE_WRITE}>Shuffle Write</span></th>
         <th>Shuffle Spill (Memory)</th>
@@ -78,8 +80,12 @@ private[ui] class ExecutorTable(stageId: Int, stageAttemptId: Int, parent: Stage
         <td>{v.succeededTasks}</td>
         <td sorttable_customkey={v.inputBytes.toString}>
           {Utils.bytesToString(v.inputBytes)}</td>
+        <td sorttable_customkey={v.inputRecords.toString}>
+          {v.inputRecords}</td>
         <td sorttable_customkey={v.outputBytes.toString}>
           {Utils.bytesToString(v.outputBytes)}</td>
+        <td sorttable_customkey={v.outputRecords.toString}>
+          {v.outputRecords}</td>
         <td sorttable_customkey={v.shuffleRead.toString}>
          {Utils.bytesToString(v.shuffleRead)}</td>
         <td sorttable_customkey={v.shuffleWrite.toString}>

0 commit comments
