Skip to content

Commit e895e0c

Browse files
witgoankurdave
authored andcommitted
[SPARK-3623][GraphX] GraphX should support the checkpoint operation
Author: GuoQiang Li <[email protected]> Closes #2631 from witgo/SPARK-3623 and squashes the following commits: a70c500 [GuoQiang Li] Remove java related 4d1e249 [GuoQiang Li] Add comments e682724 [GuoQiang Li] Graph should support the checkpoint operation
1 parent 6eb1b6f commit e895e0c

File tree

3 files changed

+34
-0
lines changed

3 files changed

+34
-0
lines changed

graphx/src/main/scala/org/apache/spark/graphx/Graph.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,14 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab
9696
*/
9797
def cache(): Graph[VD, ED]
9898

99+
/**
100+
* Mark this Graph for checkpointing. It will be saved to a file inside the checkpoint
101+
* directory set with SparkContext.setCheckpointDir() and all references to its parent
102+
* RDDs will be removed. It is strongly recommended that this Graph is persisted in
103+
* memory, otherwise saving it on a file will require recomputation.
104+
*/
105+
def checkpoint(): Unit
106+
99107
/**
100108
* Uncaches only the vertices of this graph, leaving the edges alone. This is useful in iterative
101109
* algorithms that modify the vertex attributes but reuse the edges. This method can be used to

graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected (
6565
this
6666
}
6767

68+
override def checkpoint(): Unit = {
69+
vertices.checkpoint()
70+
replicatedVertexView.edges.checkpoint()
71+
}
72+
6873
override def unpersistVertices(blocking: Boolean = true): Graph[VD, ED] = {
6974
vertices.unpersist(blocking)
7075
// TODO: unpersist the replicated vertices in `replicatedVertexView` but leave the edges alone

graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ package org.apache.spark.graphx
1919

2020
import org.scalatest.FunSuite
2121

22+
import com.google.common.io.Files
23+
2224
import org.apache.spark.SparkContext
2325
import org.apache.spark.graphx.Graph._
2426
import org.apache.spark.graphx.PartitionStrategy._
@@ -365,4 +367,23 @@ class GraphSuite extends FunSuite with LocalSparkContext {
365367
}
366368
}
367369

370+
test("checkpoint") {
371+
val checkpointDir = Files.createTempDir()
372+
checkpointDir.deleteOnExit()
373+
withSpark { sc =>
374+
sc.setCheckpointDir(checkpointDir.getAbsolutePath)
375+
val ring = (0L to 100L).zip((1L to 99L) :+ 0L).map { case (a, b) => Edge(a, b, 1)}
376+
val rdd = sc.parallelize(ring)
377+
val graph = Graph.fromEdges(rdd, 1.0F)
378+
graph.checkpoint()
379+
graph.edges.map(_.attr).count()
380+
graph.vertices.map(_._2).count()
381+
382+
val edgesDependencies = graph.edges.partitionsRDD.dependencies
383+
val verticesDependencies = graph.vertices.partitionsRDD.dependencies
384+
assert(edgesDependencies.forall(_.rdd.isInstanceOf[CheckpointRDD[_]]))
385+
assert(verticesDependencies.forall(_.rdd.isInstanceOf[CheckpointRDD[_]]))
386+
}
387+
}
388+
368389
}

0 commit comments

Comments
 (0)