Commit 38ff8e4

Move get_world_size_and_rank to utils

1 parent: 9cfa288

17 files changed: +69, -48 lines
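The change is mechanical: every call site that previously used training.get_world_size_and_rank() now calls the same helper from the utils namespace. A minimal usage sketch of the new call site (the is_rank_zero pattern is the one used throughout the recipes below; the rest is illustrative, not taken from this diff):

from torchtune import utils

# Previously: from torchtune import training; training.get_world_size_and_rank()
world_size, rank = utils.get_world_size_and_rank()
is_rank_zero = rank == 0  # used to gate logging in the distributed recipes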

docs/source/api_ref_training.rst

Lines changed: 0 additions & 1 deletion

@@ -52,7 +52,6 @@ Utilities for enabling and working with distributed training.
 
     init_distributed
     is_distributed
-    get_world_size_and_rank
     gather_cpu_state_dict
 
 .. _ac_label:

docs/source/api_ref_utilities.rst

Lines changed: 1 addition & 0 deletions

@@ -18,3 +18,4 @@ Miscellaneous
     get_device
     get_logger
     torch_version_ge
+    get_world_size_and_rank

recipes/dev/early_exit_finetune_distributed.py

Lines changed: 3 additions & 3 deletions

@@ -183,7 +183,7 @@ def __init__(self, cfg: DictConfig) -> None:
 
         # _is_rank_zero is used primarily for logging. In the future, the logger
         # should directly take care of this
-        _, rank = training.get_world_size_and_rank()
+        _, rank = utils.get_world_size_and_rank()
         self._is_rank_zero = rank == 0
 
         # Training cfg
@@ -646,7 +646,7 @@ def _setup_data(
         DistributedSamplers with Map-style Datasets which fit into memory. Other samplers,
         iterable datasets and streaming datasets are not supported.
         """
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         if isinstance(cfg_dataset, ListConfig):
             datasets = [
@@ -826,7 +826,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         if not self._optimizer_in_bwd:

recipes/full_finetune_distributed.py

Lines changed: 3 additions & 3 deletions

@@ -133,7 +133,7 @@ def __init__(self, cfg: DictConfig) -> None:
             )
             self._log_peak_memory_stats = False
 
-        _, rank = training.get_world_size_and_rank()
+        _, rank = utils.get_world_size_and_rank()
         self._is_rank_zero = rank == 0
 
         # Training cfg
@@ -619,7 +619,7 @@ def _setup_data(
         DistributedSamplers with Map-style Datasets which fit into memory. Other samplers,
         iterable datasets and streaming datasets are not supported.
         """
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         if isinstance(cfg_dataset, ListConfig):
             datasets = [
@@ -757,7 +757,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         if not self._optimizer_in_bwd:

recipes/knowledge_distillation_distributed.py

Lines changed: 3 additions & 3 deletions

@@ -116,7 +116,7 @@ def __init__(self, cfg: DictConfig) -> None:
                 "fp16 precision is not supported in this recipe. Please use fp32 or bf16."
             )
 
-        _, rank = training.get_world_size_and_rank()
+        _, rank = utils.get_world_size_and_rank()
 
         self._is_rank_zero = rank == 0
 
@@ -646,7 +646,7 @@ def _setup_data(
         Map-style Datasets which fit into memory and an option for random shuffling.
         Samplers, iterable datasets, and streaming datasets are not supported.
         """
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         if isinstance(cfg_dataset, ListConfig):
             datasets = [
@@ -815,7 +815,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         self._optimizer.zero_grad()

recipes/lora_dpo_distributed.py

Lines changed: 3 additions & 3 deletions

@@ -131,7 +131,7 @@ def __init__(self, cfg: DictConfig) -> None:
                 "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead."
            )
 
-        _, rank = training.get_world_size_and_rank()
+        _, rank = utils.get_world_size_and_rank()
 
         self._is_rank_zero = rank == 0
 
@@ -492,7 +492,7 @@ def _setup_data(
         DistributedSamplers with Map-style Datasets which fit into memory. Other samplers,
         iterable datasets and streaming datasets are not supported.
         """
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         if isinstance(cfg_dataset, ListConfig):
             datasets = [
@@ -642,7 +642,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        _, rank = training.get_world_size_and_rank()
+        _, rank = utils.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         self._optimizer.zero_grad()

recipes/lora_finetune_distributed.py

Lines changed: 3 additions & 3 deletions

@@ -135,7 +135,7 @@ def __init__(self, cfg: DictConfig) -> None:
                 "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead."
             )
 
-        _, rank = training.get_world_size_and_rank()
+        _, rank = utils.get_world_size_and_rank()
 
         self._is_rank_zero = rank == 0
 
@@ -584,7 +584,7 @@ def _setup_data(
         DistributedSamplers with Map-style Datasets which fit into memory. Other samplers,
         iterable datasets and streaming datasets are not supported.
         """
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         if isinstance(cfg_dataset, ListConfig):
             datasets = [
@@ -746,7 +746,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         self._optimizer.zero_grad()

recipes/qat_distributed.py

Lines changed: 3 additions & 3 deletions

@@ -144,7 +144,7 @@ def __init__(self, cfg: DictConfig) -> None:
             )
             self._log_peak_memory_stats = False
 
-        _, rank = training.get_world_size_and_rank()
+        _, rank = utils.get_world_size_and_rank()
         self._is_rank_zero = rank == 0
 
         # Training cfg
@@ -591,7 +591,7 @@ def _setup_data(
         DistributedSamplers with Map-style Datasets which fit into memory. Other samplers,
         iterable datasets and streaming datasets are not supported.
         """
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         if isinstance(cfg_dataset, ListConfig):
             datasets = [
@@ -729,7 +729,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         if not self._optimizer_in_bwd:

recipes/qat_lora_finetune_distributed.py

Lines changed: 3 additions & 3 deletions

@@ -149,7 +149,7 @@ def __init__(self, cfg: DictConfig) -> None:
                 "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead."
             )
 
-        _, rank = training.get_world_size_and_rank()
+        _, rank = utils.get_world_size_and_rank()
 
         # _is_rank_zero is used primarily for logging. In the future, the logger
         # should directly take care of this
@@ -620,7 +620,7 @@ def _setup_data(
         DistributedSamplers with Map-style Datasets which fit into memory. Other samplers,
         iterable datasets and streaming datasets are not supported.
         """
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         if isinstance(cfg_dataset, ListConfig):
             datasets = [
@@ -784,7 +784,7 @@ def train(self) -> None:
         # clean up before training begins
         training.cleanup_before_training()
 
-        world_size, rank = training.get_world_size_and_rank()
+        world_size, rank = utils.get_world_size_and_rank()
 
         # zero out the gradients before starting training
         self._optimizer.zero_grad()

tests/torchtune/training/test_distributed.py

Lines changed: 0 additions & 16 deletions

@@ -56,15 +56,6 @@ def _test_worker_fn(init_pg_explicit: bool) -> None:
             pg_backend == "gloo"
         ), f"Expected 'gloo' backend, but received {pg_backend}"
 
-    @staticmethod
-    def _test_world_size_with_cpu_device(expected_world_size: int) -> None:
-        training.init_distributed(backend="gloo")
-        world_size, _ = training.get_world_size_and_rank()
-        if world_size != expected_world_size:
-            raise AssertionError(
-                f"Expected different world size: received {world_size}, expected {expected_world_size}"
-            )
-
     def _test_launch_worker(
         self,
         get_pet_launch_config,
@@ -84,13 +75,6 @@ def test_init_from_env_dup(self, get_pet_launch_config) -> None:
         # trivial test case to ensure test passes with no exceptions
         assert True
 
-    def test_world_size_with_cpu(self, get_pet_launch_config) -> None:
-        desired_world_size = 4
-        lc = get_pet_launch_config(desired_world_size)
-        launcher.elastic_launch(lc, entrypoint=self._test_world_size_with_cpu_device)(
-            desired_world_size
-        )
-
     def test_validate_no_params_on_meta_device(self) -> None:
         with torch.device("meta"):
             model = torch.nn.Linear(3, 3)
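The CPU world-size test above is removed outright rather than relocated. Purely as a hedged sketch (hypothetical function name, assuming the helper keeps the same (world_size, rank) return shape at its new home), an equivalent check against the utils namespace could look like:

from torchtune import training, utils


def _check_world_size_with_cpu_device(expected_world_size: int) -> None:
    # Same gloo process-group setup the deleted test used
    training.init_distributed(backend="gloo")
    world_size, _ = utils.get_world_size_and_rank()  # moved from training.*
    if world_size != expected_world_size:
        raise AssertionError(
            f"Expected world size {expected_world_size}, received {world_size}"
        )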
