
Commit 928d2c0

cenzhaometa authored and facebook-github-bot committed
support cuda-graph mode
Summary:
Introduce `--graph-launches` (default: 0) as a knob to enable cuda-graph mode; when it is non-zero (e.g. `--graph-launches=10`), the captured graph is replayed that many times. In cuda-graph mode:

1. Warm up: run the collective `warm_iters` times on a separate stream and sync with the current stream.
2. Capture the graph: run the collective `iters` times while capturing.
3. Replay the graph `graph_launches` times on the current stream.

> Note: param-bench measures collective latency from the CPU side, which is not very accurate. See the test plan for a trace captured in graph mode (the graph launches are visible), etc.

> TODO: cuda-graph mode does not work with the `async_op=True` case; it produces the following error and needs a follow-up PTD fix:

```
[rank7]: Traceback (most recent call last):
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/__run_xar_main__.py", line 140, in <module>
[rank7]:     __invoke_main()
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/__run_xar_main__.py", line 87, in __invoke_main
[rank7]:     run_as_main(main_module, main_function)
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/__par__/meta_only/bootstrap.py", line 98, in run_as_main
[rank7]:     oss_run_as_main(
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/__par__/bootstrap.py", line 94, in run_as_main
[rank7]:     main()
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
[rank7]:     return f(*args, **kwargs)
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/fb/launcher.py", line 1226, in main
[rank7]:     remote_mpi_launcher(args, more_args)
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/fb/launcher.py", line 475, in remote_mpi_launcher
[rank7]:     local_launcher(args, more_args)
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/fb/launcher.py", line 368, in local_launcher
[rank7]:     commsBench()
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/fb/launcher.py", line 268, in commsBench
[rank7]:     comms_bench()
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/comms.py", line 1523, in main
[rank7]:     collBenchObj.runBench(commsParams)
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/comms.py", line 1458, in runBench
[rank7]:     self.backendFuncs.benchmark_comms(self.benchTime, commsParams)
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/pytorch_dist_backend.py", line 1206, in benchmark_comms
[rank7]:     benchTime(index, commsParams, self)
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/comms.py", line 1236, in benchTime
[rank7]:     self.benchComm(index, commsParams, backendFuncs)
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/comms.py", line 1310, in benchComm
[rank7]:     self.runColl(
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/comms.py", line 431, in runColl
[rank7]:     return self.run_coll_cuda_graph(comm_fn, dcheck)
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/param_bench/train/comms/pt/comms.py", line 377, in run_coll_cuda_graph
[rank7]:     with torch.cuda.graph(g):
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/torch/cuda/graphs.py", line 186, in __exit__
[rank7]:     self.cuda_graph.capture_end()
[rank7]:   File "/mnt/xarfuse/uid-0/5d817754-seed-nspid4026531836_cgpid202510957-ns-4026531841/torch/cuda/graphs.py", line 84, in capture_end
[rank7]:     super().capture_end()
[rank7]: RuntimeError: HIP error: capturing stream has unjoined work
[rank7]: HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
[rank7]: For debugging consider passing AMD_SERIALIZE_KERNEL=3
[rank7]: Compile with `TORCH_USE_HIP_DSA` to enable device-side assertions.
```

Reviewed By: kingchc, kwen2501

Differential Revision: D70544123

fbshipit-source-id: bb4a5ad8ad1e03a77e8d3528e17d26125b5fe355
1 parent a81194f commit 928d2c0

File tree

4 files changed (+88, -1 lines)


train/comms/pt/comms.py

Lines changed: 85 additions & 0 deletions
```diff
@@ -189,6 +189,12 @@ def readArgs(self, parser):
             default=False,
             help="use device time measurement",
         )
+        parser.add_argument(
+            "--graph-launches",
+            type=int,
+            default=0,
+            help="Number of graph launches for each data-size",
+        )
         return parser.parse_known_args()

     def _checkPt2Pt(self, args):
```
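The flag is registered with a dash, so argparse exposes it on the parsed namespace as `args.graph_launches`, which is the spelling the rest of the change (and the summary) uses. A tiny standalone illustration of that mapping (not the benchmark's parser):

```python
import argparse

# argparse turns dashes in long option names into underscores on the Namespace,
# so "--graph-launches" is read back as args.graph_launches.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--graph-launches",
    type=int,
    default=0,
    help="Number of graph launches for each data-size",
)

args = parser.parse_args(["--graph-launches", "10"])
print(args.graph_launches)  # -> 10
```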
```diff
@@ -315,6 +321,10 @@ def checkArgs(self, args):  # noqa: C901
             logger.error(f"wrong dst_ranks ({args.dst_ranks})")
             comms_utils.gracefulExit()

+        if args.graph_launches > 0 and args.device != "cuda":
+            logger.error("cuda graph is only supported for cuda or rocm device")
+            comms_utils.gracefulExit()
+
     # depnds on data type
     def checkArgsdataType(self, args):  # noqa: C901
         args.b = comms_utils.parsesize(args.b)
@@ -354,7 +364,81 @@ def checkArgsdataType(self, args):  # noqa: C901
         # run a few sanity checks
         self._check_bitwidth(args)

+    def run_coll_cuda_graph(self, comm_fn=None, dcheck=False):
+        self.backendFuncs.sync_barrier(
+            self.collectiveArgs, desc="run_coll_cuda_graph_begin"
+        )
+        elapsedTimeNS = 0.0
+
+        # 1. Warmup phase
+        # launch collective on a separate stream and sync with current_stream
+        s = torch.cuda.Stream()
+        s.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(s):
+            for _ in range(self.collectiveArgs.numWarmupIters):
+                comm_fn(self.collectiveArgs)
+        torch.cuda.current_stream().wait_stream(s)
+
+        # 2. capturing graph
+        # in cuda graph, we need to use sync mode
+        # TODO: this might need PTD fix (async_op=True won't work under cuda graph)
+        self.collectiveArgs.asyncOp = False
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            for _ in range(self.collectiveArgs.numIters):
+                if dcheck:
+                    # reset input tensor for data validation
+                    self.setTensorVal(self.collectiveArgs.ipTensor)
+                comm_fn(self.collectiveArgs)
+
+        # 3. Replay
+        start = time.monotonic()  # available only in py3
+        for _ in range(self.collectiveArgs.graph_launches):
+            if self.collectiveArgs.enable_profiler:
+                comms_utils.sampleProfiler()
+
+            # [optional] we can feed new input data to ipTensor for each replay
+            g.replay()
+
+        self.backendFuncs.complete_accel_ops(self.collectiveArgs)
+        end = time.monotonic()  # available only in py3
+
+        ensureTensorFlush(self.collectiveArgs.opTensor)
+
+        elapsedTimeNS += (
+            end - start
+        ) * 1e9  # keeping time in NS, helps in divising data by nanoseconds
+
+        memSize = self.backendFuncs.get_mem_size(self.collectiveArgs)
+
+        avgIterNS, algBW = comms_utils.getAlgBW(
+            elapsedTimeNS,
+            memSize,
+            self.collectiveArgs.numIters
+            * self.collectiveArgs.numCollPerIter
+            * self.collectiveArgs.graph_launches,
+        )
+        busBW = self.backendFuncs.getBusBW(
+            self.collectiveArgs.collective,
+            algBW,
+            self.collectiveArgs,
+        )
+
+        # reset group to sync among all global ranks
+        self.collectiveArgs.group = self.backendFuncs.get_default_group()
+        self.backendFuncs.sync_barrier(self.collectiveArgs, desc="runColl_end")
+
+        results = {
+            "timeUS": avgIterNS / 1e3,
+            "algBW": algBW,
+            "busBW": busBW,
+            "memSize": memSize,
+        }
+        return results
+
     def runColl(self, comm_fn=None, dcheck=False):
+        if self.collectiveArgs.graph_launches > 0:
+            return self.run_coll_cuda_graph(comm_fn, dcheck)
         self.backendFuncs.sync_barrier(self.collectiveArgs, desc="runColl_begin")

         elapsedCPUTimeNS = 0.0
@@ -801,6 +885,7 @@ def initCollectiveArgs(self, commsParams):
         self.collectiveArgs.numCollPerIter = commsParams.num_coll
         self.collectiveArgs.include_0B = commsParams.include_0B
         self.collectiveArgs.use_device_time = commsParams.use_device_time
+        self.collectiveArgs.graph_launches = commsParams.graph_launches

         if commsParams.bitwidth < 32:
             comms_utils.initQuantCommCtx(self.collectiveArgs, commsParams)
```
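In `run_coll_cuda_graph` above, the replay wall-clock time is divided by the total number of collectives issued (`numIters * numCollPerIter * graph_launches`). A rough worked illustration of that normalization, assuming `getAlgBW` returns the average time per collective and the corresponding algorithmic bandwidth (the numbers below are made up):

```python
# Hypothetical numbers, purely to illustrate the normalization above.
elapsed_time_ns = 2_000_000_000          # 2 s of graph-replay wall-clock time
mem_size_bytes = 64 * 1024 * 1024        # 64 MiB moved per collective
num_iters, num_coll_per_iter, graph_launches = 20, 1, 10

total_colls = num_iters * num_coll_per_iter * graph_launches   # 200 collectives replayed
avg_iter_ns = elapsed_time_ns / total_colls                    # 10,000,000 ns per collective
alg_bw = mem_size_bytes / avg_iter_ns                          # bytes/ns == GB/s, ~6.7 GB/s
print(avg_iter_ns, alg_bw)
```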

train/comms/pt/comms_utils.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -883,6 +883,7 @@ def __init__(
         self.groupRanks = groupRanks

         self.include_0B = args.include_0B
+        self.graph_launches = args.graph_launches
         self.num_coll = args.num_coll

```

train/comms/pt/pytorch_backend_utils.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -128,6 +128,7 @@ def __init__(self) -> None:

         self.include_0B = False
         self.use_device_time = False
+        self.graph_launches = 0


 class backendFunctions(ABC):
```

train/comms/pt/pytorch_dist_backend.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1023,7 +1023,7 @@ def get_current_stream(self, device: torch.device | None):
         return None

     def switch_stream(self, stream, device: torch.device | None):
-        """switch to a new stream and return the current stream"""
+        """switch to a new stream and return the old current stream"""
         if device is None:
             device = self.get_device()
         if stream is not None and device.type == "cuda":
```
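For context, a minimal sketch of the behavior the corrected docstring describes, i.e. switching streams while handing back the previously current one so the caller can restore it later (an illustration, not the backend's actual implementation):

```python
import torch


def switch_stream_sketch(stream: torch.cuda.Stream | None) -> torch.cuda.Stream:
    # Remember the stream that was current before switching, so the caller
    # can switch back once it is done.
    old_stream = torch.cuda.current_stream()
    if stream is not None:
        torch.cuda.set_stream(stream)
    return old_stream
```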
