
Commit f9c74a6

shengfukevin authored and facebook-github-bot committed
Sanshang: add support to metrics calculation (#197)
Summary: Add support for metrics calculation:
1. Iteration E2E time
2. Bandwidth

This is a copy of #195 for importing it into Meta.

Reviewed By: briancoutinho

Differential Revision: D69155629

Pulled By: shengfukevin
1 parent c5f8d06 commit f9c74a6
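The summary names two metrics: per-iteration end-to-end (E2E) time and bandwidth. The files that implement the metric calculation are not among the hunks reproduced below, so the following Python sketch is only an assumption about how such a bandwidth figure is typically derived; the function name and formula are illustrative, not taken from this commit.

```python
# Hypothetical illustration, not code from this commit: bandwidth is commonly
# reported as bytes moved per unit of end-to-end iteration time.
def bandwidth_gb_per_s(message_bytes: int, iteration_e2e_time_s: float) -> float:
    """Derive achieved bandwidth (GB/s) from message size and iteration E2E time."""
    if iteration_e2e_time_s <= 0:
        raise ValueError("iteration E2E time must be positive")
    return message_bytes / iteration_e2e_time_s / 1e9


# Example: a 1 GiB payload completing in 25 ms is roughly 43 GB/s.
print(bandwidth_gb_per_s(1 << 30, 0.025))
```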

File tree

7 files changed: +697, -114 lines


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 .venv/
 __pycache__/
+./et_replay/vendor_internal/

et_replay/comm/backend/pytorch_dist_backend.py

Lines changed: 8 additions & 1 deletion
@@ -623,6 +623,13 @@ def barrier(self, collectiveArgs, name="dummy", retFlag=False):
         if retFlag:
             return retObj
 
+    def barrier_all_ranks(self):
+        dist.barrier(
+            device_ids=[self.get_device().index]
+            if dist.get_backend() == "nccl"
+            else None
+        )
+
     def sync_barrier(self, collectiveArgs, desc="dummy"):
         # ensure all streams have finished outstanding events before calling barrier
         self.complete_accel_ops(collectiveArgs)
@@ -1031,7 +1038,7 @@ def initialize_groups(self, backend="gloo"):
         # even if they are not going to be members of the group.
         sync_store = dist.PrefixStore("pg_sync_r", self.tcp_store)
         sync_store.set(str(global_rank), json.dumps(self.commsParams.groupRanks))
-        torch.distributed.barrier()
+        self.barrier_all_ranks()
 
         idxed_group_ranks_to_pgId: dict[tuple[int], list[int]] = defaultdict(list)
         for i in range(self.get_world_size()):
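The new `barrier_all_ranks` helper passes `device_ids` to `dist.barrier` only when the backend is NCCL, so the barrier's GPU work runs on this rank's own device rather than defaulting to GPU 0; `initialize_groups` then calls the helper in place of a bare `torch.distributed.barrier()`. Below is a minimal standalone sketch of the same pattern; it assumes `torch.distributed` is already initialized and reads the device index from `LOCAL_RANK`, whereas the backend class above derives it from `self.get_device().index`.

```python
# Standalone sketch of the barrier_all_ranks pattern; assumes
# dist.init_process_group() has already been called. The LOCAL_RANK lookup is
# an assumption for this sketch, not how the class above obtains the index.
import os

import torch
import torch.distributed as dist


def barrier_all_ranks() -> None:
    """Synchronize all ranks; pin the barrier to this rank's GPU under NCCL."""
    if dist.get_backend() == "nccl":
        local_device = int(os.environ.get("LOCAL_RANK", torch.cuda.current_device()))
        dist.barrier(device_ids=[local_device])
    else:
        # Non-NCCL backends (e.g. gloo) do not need a device pinned.
        dist.barrier()
```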

et_replay/comm/comms_utils.py

Lines changed: 1 addition & 45 deletions
@@ -20,10 +20,7 @@
 from typing import Any
 
 try:
-    from param_bench.train.comms.pt.fb.internals import (
-        fbInitProfiler,
-        fbSampleProfiler,
-        fbStartProfiler,
+    from param_bench.et_replay.vendor_internals import (
         initialize_collectiveArgs_internal,
         remove_quantization_handlers,
     )
@@ -390,47 +387,6 @@ def ensureTensorFlush(tensors: list[torch.Tensor] | torch.Tensor) -> Any:
     return x
 
 
-def startProfiler(rank: int, device: str, numWarmupIters: int, numIters: int) -> bool:
-    """
-    Starts internal profiler with given parameters.
-
-    Args:
-        rank: Global rank.
-        device: Type of device "cuda", "cpu", etc.
-        numWarmupIters: Number of warmup iterations.
-        numIters: Number of real iterations.
-    Returns:
-        bool: Returns if internal profile was able to start or not.
-    """
-    if has_internal_libs:
-        fbInitProfiler(
-            rank=rank,
-            device=device,
-            warmup=numWarmupIters,
-            iters=numIters,
-        )
-        fbStartProfiler()
-        return True
-    else:
-        logger.debug("Internal profiler is not available, skip...")
-        return False
-
-
-def sampleProfiler(stop: bool = False) -> None:
-    """
-    Starts internal sample profiler.
-
-    Args:
-        stop: Bool to be passed into sample profiler.
-    Returns:
-        None
-    """
-    if has_internal_libs:
-        fbSampleProfiler(stop)
-    else:
-        logger.debug("Internal profiler is not available, skip...")
-
-
 class commsArgs:
     """
     This class contains all of the args that we can use to perform a single collective.
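This change repoints the optional internal dependency from `param_bench.train.comms.pt.fb.internals` to `param_bench.et_replay.vendor_internals` and drops the `startProfiler`/`sampleProfiler` wrappers around the fb profiler from this file. The surrounding `except` clause and flag handling sit outside the hunk, so the sketch below is only an assumption about the usual shape of such an optional-import guard (the `has_internal_libs` flag does appear in the removed code; `maybe_initialize_internal` is a hypothetical helper for illustration).

```python
# Assumed shape of the optional-import guard; the actual except clause in the
# updated file is outside the hunk shown above.
try:
    from param_bench.et_replay.vendor_internals import (
        initialize_collectiveArgs_internal,
        remove_quantization_handlers,
    )

    has_internal_libs = True
except ImportError:
    has_internal_libs = False


def maybe_initialize_internal(collective_args) -> None:
    """Run the vendor hook only when the internal package could be imported."""
    if has_internal_libs:
        initialize_collectiveArgs_internal(collective_args)
```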
