Merged
166 commits
4f462b0
init
IlyasMoutawwakil Feb 4, 2025
7b51103
style
IlyasMoutawwakil Feb 4, 2025
9d7376e
is_hpu_available
IlyasMoutawwakil Feb 4, 2025
069b88a
fix
IlyasMoutawwakil Feb 4, 2025
cd3cbb9
import habana_frameworks.torch.distributed.hccl
IlyasMoutawwakil Feb 4, 2025
2493abe
style
IlyasMoutawwakil Feb 4, 2025
32cbc88
test
IlyasMoutawwakil Feb 4, 2025
5fd4de2
initialize dist proc group
IlyasMoutawwakil Feb 4, 2025
7f72745
revert
IlyasMoutawwakil Feb 5, 2025
f66c5df
set backend to hccl only if hccl initialization sets a local rank
IlyasMoutawwakil Feb 5, 2025
2a4130d
force backend hccl and multi_hpu type when sure of distributed launch
IlyasMoutawwakil Feb 5, 2025
fa1bc44
style
IlyasMoutawwakil Feb 5, 2025
d3e24c5
pass accelerator tests
IlyasMoutawwakil Feb 6, 2025
00cc283
pass big modeling tests with bigger atol/rtol for accelerators
IlyasMoutawwakil Feb 6, 2025
97081da
fix hpu device count and skip tests requiring hpu:x
IlyasMoutawwakil Feb 6, 2025
ddcb3ca
hpu autocast
IlyasMoutawwakil Feb 6, 2025
6de389c
hpu rng_state
IlyasMoutawwakil Feb 7, 2025
ae9a76b
hpu launch
IlyasMoutawwakil Feb 7, 2025
5b8b0b2
hpu special device placement
IlyasMoutawwakil Feb 7, 2025
a2f8040
hpu launch
IlyasMoutawwakil Feb 7, 2025
6abecdd
rng state
IlyasMoutawwakil Feb 7, 2025
7bc37dc
distributed data loop tests
IlyasMoutawwakil Feb 7, 2025
ef1de61
enforce non contiguity after device memory allocation
IlyasMoutawwakil Feb 7, 2025
1b6905e
pass fsdp tests
IlyasMoutawwakil Feb 7, 2025
defe3fa
enforce pt_hpu_lazy_mode=0 when fsdp testing
IlyasMoutawwakil Feb 7, 2025
9551ce3
pass cli tests
IlyasMoutawwakil Feb 10, 2025
9c84fe7
pass and document grad sync tests
IlyasMoutawwakil Feb 10, 2025
6f00591
pass kwargs handler and autocast tests
IlyasMoutawwakil Feb 10, 2025
c94bfbd
memory utils
IlyasMoutawwakil Feb 10, 2025
61235d3
found source of int64 errors
IlyasMoutawwakil Feb 10, 2025
0896a50
skip some modeling utils tests
IlyasMoutawwakil Feb 10, 2025
e974758
enable int64
IlyasMoutawwakil Feb 10, 2025
ee08748
skip optimizer tests
IlyasMoutawwakil Feb 10, 2025
6f0fbe4
pass checkpointing tests
IlyasMoutawwakil Feb 10, 2025
c5c50c6
pass accelerator tests with safetensors main
IlyasMoutawwakil Feb 10, 2025
34010c9
more hpu stuff
IlyasMoutawwakil Feb 10, 2025
9f75a6e
Merge branch 'main' into hpu-support
IlyasMoutawwakil Feb 10, 2025
e80b484
style
IlyasMoutawwakil Feb 10, 2025
5cacc31
remove PT_HPU_LAZY_MODE and PT_ENABLE_INT64_SUPPORT as they should be…
IlyasMoutawwakil Feb 15, 2025
f006c4e
start testing on gaudi2
IlyasMoutawwakil Feb 17, 2025
19e652a
support fp16 on gaudi2
IlyasMoutawwakil Feb 17, 2025
40d22b1
add testing order
IlyasMoutawwakil Feb 17, 2025
eb37c43
custom hpu fsdp env dict
IlyasMoutawwakil Feb 17, 2025
dc4ca51
fix torch trace malloc
IlyasMoutawwakil Feb 17, 2025
74b307a
test ddp half precision comm hooks
IlyasMoutawwakil Feb 17, 2025
5a6d5ef
fix
IlyasMoutawwakil Feb 17, 2025
5a1c0c9
fix
IlyasMoutawwakil Feb 17, 2025
50d9e71
remove lower bound for hpu
IlyasMoutawwakil Feb 17, 2025
f0579e8
use 0.72 as lower bound
IlyasMoutawwakil Feb 17, 2025
dfc82ec
lower lower bound
IlyasMoutawwakil Feb 17, 2025
176e3d2
order deepspeed tests
IlyasMoutawwakil Feb 17, 2025
6c688d0
fix
IlyasMoutawwakil Feb 17, 2025
b078e90
deepspeed_use_hpu
IlyasMoutawwakil Feb 17, 2025
0dcb46a
assert non lazy mode with offloaded optimizer
IlyasMoutawwakil Feb 18, 2025
5abb1a4
make patching torch with habana frameworks the default
IlyasMoutawwakil Feb 18, 2025
b63a6fa
less of require_non_hpu
IlyasMoutawwakil Feb 18, 2025
36f8794
skip test_multi_device_merge_fsdp_weights for now as it halts
IlyasMoutawwakil Feb 18, 2025
ab5cbb0
skip another flaky test
IlyasMoutawwakil Feb 18, 2025
e318161
format
IlyasMoutawwakil Feb 18, 2025
0c040c3
use habana_visible_modules
IlyasMoutawwakil Feb 18, 2025
6f5977e
patch torch hpu device count
IlyasMoutawwakil Feb 18, 2025
f1e196f
avoid setting HABANA_VISIBLE_MODULES
IlyasMoutawwakil Feb 18, 2025
2772b68
don't play with habana visible devices/modules
IlyasMoutawwakil Feb 18, 2025
7d1ef62
only with hpu
IlyasMoutawwakil Feb 18, 2025
427c313
fixes and skips
IlyasMoutawwakil Feb 18, 2025
be91183
skip
IlyasMoutawwakil Feb 18, 2025
5c0cd84
fix device ids and add some todos
IlyasMoutawwakil Feb 19, 2025
ae1431a
skip offloading with generate()
IlyasMoutawwakil Feb 19, 2025
d383ea5
fix
IlyasMoutawwakil Feb 19, 2025
0b62d52
reduced atol/rtol for hpu
IlyasMoutawwakil Feb 19, 2025
f2504a5
fix
IlyasMoutawwakil Feb 19, 2025
f5cf0d5
tag deepspeed tests that should run first
IlyasMoutawwakil Feb 19, 2025
ac434c2
enable a test path that was skipped
IlyasMoutawwakil Feb 19, 2025
1501105
revert a test that was customized for gaudi1
IlyasMoutawwakil Feb 19, 2025
8b5708e
some patching to enable HABANA_VISIBLE_MODULES
IlyasMoutawwakil Feb 19, 2025
8935766
fix zero3 test
IlyasMoutawwakil Feb 19, 2025
d8301cd
misc
IlyasMoutawwakil Feb 19, 2025
6ce9e3a
test DTensor TP
IlyasMoutawwakil Feb 19, 2025
42775d2
remove gaudi1
IlyasMoutawwakil Feb 19, 2025
788e95f
test
IlyasMoutawwakil Feb 20, 2025
03b391e
style
IlyasMoutawwakil Feb 20, 2025
2247739
comment
IlyasMoutawwakil Feb 20, 2025
07ba582
pass pad_across_processes
IlyasMoutawwakil Feb 20, 2025
647dfab
require_fp16
IlyasMoutawwakil Feb 20, 2025
8e63b29
pass memory utils test
IlyasMoutawwakil Feb 20, 2025
6b1d131
test_ddp_comm_hook
IlyasMoutawwakil Feb 20, 2025
7803291
skip half precision comm hooks on hpu
IlyasMoutawwakil Feb 20, 2025
2883ca1
fix
IlyasMoutawwakil Feb 20, 2025
007d4a8
is_fp16_available
IlyasMoutawwakil Feb 20, 2025
9c12fae
fp16
IlyasMoutawwakil Feb 20, 2025
324d6df
tp as part of integration tests
IlyasMoutawwakil Feb 20, 2025
839c6be
fix
IlyasMoutawwakil Feb 20, 2025
3e548f4
write_basic_config
IlyasMoutawwakil Feb 20, 2025
f67a898
safetensors
IlyasMoutawwakil Feb 20, 2025
f449d3f
local sgd and masked_fill_fwd_i64
IlyasMoutawwakil Feb 20, 2025
79ef8a5
fix num_processes in test_load_states_by_steps
IlyasMoutawwakil Feb 20, 2025
f772b76
fp8 support
IlyasMoutawwakil Feb 24, 2025
6218cec
test
IlyasMoutawwakil Feb 24, 2025
31872f6
Merge branch 'main' into hpu-support
IlyasMoutawwakil Feb 24, 2025
610c68b
fix
IlyasMoutawwakil Feb 24, 2025
347db07
add a workflow
IlyasMoutawwakil Feb 25, 2025
5fc5a2a
Update src/accelerate/accelerator.py
IlyasMoutawwakil Feb 25, 2025
dc7a773
review comments
IlyasMoutawwakil Feb 25, 2025
9606f0d
ci
IlyasMoutawwakil Feb 25, 2025
6b77bc4
style
IlyasMoutawwakil Feb 25, 2025
d556021
comments
IlyasMoutawwakil Feb 26, 2025
e2fe2cc
test
IlyasMoutawwakil Feb 26, 2025
05e6861
habana_frameworks.torch
IlyasMoutawwakil Feb 26, 2025
ef6192c
patch device count
IlyasMoutawwakil Feb 26, 2025
59b51e5
fix
IlyasMoutawwakil Feb 26, 2025
c6731f5
fix
IlyasMoutawwakil Feb 26, 2025
66ec449
require_fp8
IlyasMoutawwakil Feb 26, 2025
28dae91
fix
IlyasMoutawwakil Feb 27, 2025
ec9c562
fix
IlyasMoutawwakil Feb 27, 2025
53f99c3
gaudi 1
IlyasMoutawwakil Feb 27, 2025
5f9928d
remove unnecessary
IlyasMoutawwakil Feb 27, 2025
ddbece5
fixed masked fill error in transformers
IlyasMoutawwakil Feb 28, 2025
72bd312
style
IlyasMoutawwakil Feb 28, 2025
506d07e
balanced_memory pass on hpu
IlyasMoutawwakil Mar 3, 2025
ae67bcc
remove for now
IlyasMoutawwakil Mar 3, 2025
405b857
run first
IlyasMoutawwakil Mar 4, 2025
27be94c
Apply suggestions from code review
IlyasMoutawwakil Mar 5, 2025
4e0e966
Merge branch 'main' into hpu-support
IlyasMoutawwakil Mar 5, 2025
e2a8d85
style after merge
IlyasMoutawwakil Mar 5, 2025
03e2646
Update src/accelerate/accelerator.py
IlyasMoutawwakil Mar 6, 2025
3ed87c1
Update src/accelerate/utils/transformer_engine.py
IlyasMoutawwakil Mar 6, 2025
2dcab3e
Merge branch 'main' into hpu-support
IlyasMoutawwakil Mar 6, 2025
55b0d3c
empty cache review comments
IlyasMoutawwakil Mar 6, 2025
bd2afc3
test_script.py error messages
IlyasMoutawwakil Mar 6, 2025
75e5b81
AccelerateTestCase for accelerator state cleanup
IlyasMoutawwakil Mar 6, 2025
e5dfad4
test
IlyasMoutawwakil Mar 7, 2025
ed84e7b
add gaudi1 workflow
IlyasMoutawwakil Mar 7, 2025
a05e54a
fp8 availability
IlyasMoutawwakil Mar 7, 2025
eb0b3a3
fix
IlyasMoutawwakil Mar 7, 2025
7b2650a
reduce batch size
IlyasMoutawwakil Mar 7, 2025
9b227d8
concurrency
IlyasMoutawwakil Mar 7, 2025
8cf20cd
check cuda as well
IlyasMoutawwakil Mar 7, 2025
7c4897b
nits and comments
IlyasMoutawwakil Mar 7, 2025
d0485f1
mark fsdp tests that require_fp16
IlyasMoutawwakil Mar 7, 2025
c37aefd
style
IlyasMoutawwakil Mar 7, 2025
bdae68d
mark deepspeed fp16 tests
IlyasMoutawwakil Mar 7, 2025
d919931
update image
IlyasMoutawwakil Mar 7, 2025
efd2a27
fix
IlyasMoutawwakil Mar 9, 2025
394b687
updated
IlyasMoutawwakil Mar 9, 2025
4f76d2c
better msgs
IlyasMoutawwakil Mar 9, 2025
b3dd375
skip pippy
IlyasMoutawwakil Mar 9, 2025
17d43ab
test
IlyasMoutawwakil Mar 9, 2025
db16287
test on 2 device
IlyasMoutawwakil Mar 9, 2025
e359c01
support up to 1% relative error in test_accelerate
IlyasMoutawwakil Mar 9, 2025
e9cfca4
skip hpu fp16
IlyasMoutawwakil Mar 9, 2025
ac41600
allow for 1 byte difference
IlyasMoutawwakil Mar 9, 2025
8571ef4
revert torch_device change
IlyasMoutawwakil Mar 9, 2025
3115ee4
style
IlyasMoutawwakil Mar 9, 2025
7c6a44a
skip memory release since it's flaky
IlyasMoutawwakil Mar 9, 2025
e8f9a48
add accelerator state cleanup to fixture
IlyasMoutawwakil Mar 9, 2025
3face36
fix
IlyasMoutawwakil Mar 9, 2025
06c1f53
atol
IlyasMoutawwakil Mar 9, 2025
75aaabd
fix
IlyasMoutawwakil Mar 9, 2025
21fca86
more rtol
IlyasMoutawwakil Mar 10, 2025
a99c297
equal grad test
IlyasMoutawwakil Mar 10, 2025
81a37be
revert
IlyasMoutawwakil Mar 10, 2025
92775af
pass pippy on gaudi2 and skip on gaudi1
IlyasMoutawwakil Mar 10, 2025
ce13eeb
enable sd 1.5 test with require fp16
IlyasMoutawwakil Mar 10, 2025
04983cc
added warning on memory release
IlyasMoutawwakil Mar 10, 2025
5efbe8c
don't log warning in memory release as it requires PartialState to be…
IlyasMoutawwakil Mar 10, 2025
4847474
Apply suggestions from code review
IlyasMoutawwakil Mar 11, 2025
5 changes: 3 additions & 2 deletions src/accelerate/accelerator.py
@@ -168,7 +168,8 @@

class Accelerator:
"""
Creates an instance of an accelerator for distributed training (on multi-GPU, TPU) or mixed precision training.
Creates an instance of an accelerator for distributed training (on multi-GPU, TPU, HPU) or mixed precision
training.

Args:
device_placement (`bool`, *optional*, defaults to `True`):
@@ -529,7 +530,7 @@ def __init__(
DistributedType.DEEPSPEED,
DistributedType.MEGATRON_LM,
):
if self.device.type in ["cpu", "xpu"]:
if self.device.type in ["cpu", "xpu", "hpu"]:
self.native_amp = True
else:
self.native_amp = is_bf16_available(True)
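
The hunk above extends the native-AMP check so that an `hpu` device, like `cpu` and `xpu`, takes the `self.native_amp = True` path for mixed precision. A minimal sketch of what this enables from user code, assuming a machine with habana_frameworks installed (the tiny model and data below are illustrative placeholders, not part of this PR):

import torch
from accelerate import Accelerator

# bf16 on HPU goes through torch's native autocast, per the cpu/xpu/hpu branch above.
accelerator = Accelerator(mixed_precision="bf16")
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)

x = torch.randn(4, 8, device=accelerator.device)
with accelerator.autocast():
    loss = model(x).sum()
accelerator.backward(loss)
optimizer.step()
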
16 changes: 15 additions & 1 deletion src/accelerate/state.py
@@ -38,6 +38,7 @@
is_datasets_available,
is_deepspeed_available,
is_fp8_available,
is_hpu_available,
is_ipex_available,
is_mlu_available,
is_mps_available,
@@ -64,6 +65,9 @@
if is_npu_available(check_device=False):
import torch_npu # noqa: F401

if is_hpu_available(check_device=False):
import habana_frameworks.torch # noqa: F401

logger = logging.getLogger(__name__)


@@ -210,6 +214,7 @@ def __init__(self, cpu: bool = False, **kwargs):
and not torch.distributed.is_initialized()
):
torch.distributed.init_process_group(backend=self.backend, **kwargs)

# XPU and CPU require special env configs to be set
if self.distributed_type in (DistributedType.MULTI_XPU, DistributedType.MULTI_CPU):
dist_information = get_cpu_distributed_information()
@@ -290,6 +295,7 @@ def __init__(self, cpu: bool = False, **kwargs):
'Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1" or use `accelerate launch` which '
"will do this automatically."
)

# Important: This should be the *only* code outside of `self.initialized!`
self.fork_launched = parse_flag_from_env("FORK_LAUNCHED", 0)

@@ -369,6 +375,7 @@ def wait_for_everyone(self):
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
DistributedType.MULTI_CPU,
DistributedType.MULTI_HPU,
DistributedType.DEEPSPEED,
DistributedType.FSDP,
):
@@ -704,6 +711,8 @@ def default_device(self) -> torch.device:
return torch.device("cuda")
elif is_xpu_available():
return torch.device("xpu")
elif is_hpu_available():
return torch.device("hpu")
else:
return torch.device("cpu")

@@ -720,6 +729,7 @@ def _prepare_backend(
elif is_torch_xla_available():
backend = "xla"
distributed_type = DistributedType.XLA

elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
if is_mlu_available():
backend = "cncl"
@@ -732,6 +742,9 @@
elif is_npu_available():
backend = "hccl"
distributed_type = DistributedType.MULTI_NPU
elif is_hpu_available():
backend = "hccl"
distributed_type = DistributedType.MULTI_HPU
elif torch.cuda.is_available():
if backend is None:
backend = "nccl"
@@ -776,7 +789,7 @@ def set_device(self):
self.device = torch.device("cpu") if self._cpu else self.default_device
return
device = str(self.distributed_type).split(".")[-1].replace("MULTI_", "").lower()
if device not in ("cpu", "gpu", "mlu", "musa", "npu", "xpu", "xla"):
if device not in ("cpu", "gpu", "mlu", "musa", "npu", "xpu", "xla", "hpu"):
raise ValueError(
f"Can't set device for {self.distributed_type} ({device}), verify we should be calling `_set_device()` for it!"
)
@@ -911,6 +924,7 @@ def __init__(
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
DistributedType.MULTI_HPU,
]:
if os.environ.get("ACCELERATE_USE_FSDP", "false") == "true" or fsdp_plugin is not None:
self.distributed_type = DistributedType.FSDP
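
Taken together, the state.py hunks make device and backend resolution HPU-aware: `default_device` now falls back cuda -> xpu -> hpu -> cpu, and a launch with `LOCAL_RANK` set on a machine where `is_hpu_available()` is true resolves to the `hccl` backend and `DistributedType.MULTI_HPU`. A sketch of how this surfaces to user code, assuming a Gaudi machine and processes started via `accelerate launch` or `torchrun`:

from accelerate import PartialState
from accelerate.utils import DistributedType

state = PartialState()
print(state.device)            # hpu on a Gaudi box; cuda/xpu/cpu elsewhere
print(state.distributed_type)  # DistributedType.MULTI_HPU under a multi-process launch
if state.distributed_type == DistributedType.MULTI_HPU:
    print(state.backend)       # "hccl", as picked in _prepare_backend above
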
1 change: 1 addition & 0 deletions src/accelerate/utils/__init__.py
@@ -94,6 +94,7 @@
is_deepspeed_available,
is_dvclive_available,
is_fp8_available,
is_hpu_available,
is_import_timer_available,
is_ipex_available,
is_lomo_available,
1 change: 1 addition & 0 deletions src/accelerate/utils/constants.py
@@ -85,4 +85,5 @@
"MULTI_MUSA",
"MULTI_XPU",
"MULTI_CPU",
"MULTI_HPU",
]
6 changes: 6 additions & 0 deletions src/accelerate/utils/dataclasses.py
@@ -40,6 +40,7 @@
from .environment import parse_flag_from_env, str_to_bool
from .imports import (
is_cuda_available,
is_hpu_available,
is_mlu_available,
is_msamp_available,
is_musa_available,
@@ -528,6 +529,7 @@ class DistributedType(str, enum.Enum):
- **MULTI_MUSA** -- Distributed on multiple MUSAs.
- **MULTI_NPU** -- Distributed on multiple NPUs.
- **MULTI_XPU** -- Distributed on multiple XPUs.
- **MULTI_HPU** -- Distributed on multiple HPUs.
- **DEEPSPEED** -- Using DeepSpeed.
- **XLA** -- Using TorchXLA.
"""
@@ -545,6 +547,7 @@ class DistributedType(str, enum.Enum):
TP = "TP"
XLA = "XLA"
MEGATRON_LM = "MEGATRON_LM"
MULTI_HPU = "MULTI_HPU"


class SageMakerDistributedType(str, enum.Enum):
@@ -646,6 +649,7 @@ class DynamoBackend(str, BaseEnum):
TORCHXLA_TRACE_ONCE = "TORCHXLA_TRACE_ONCE"
IPEX = "IPEX"
TVM = "TVM"
HPU_BACKEND = "HPU_BACKEND"


class LoggerType(BaseEnum):
@@ -1695,6 +1699,8 @@ def __post_init__(self):
device = torch.cuda.current_device()
elif is_xpu_available():
device = torch.xpu.current_device()
elif is_hpu_available():
device = torch.hpu.current_device()
else:
raise RuntimeError(
"There are currently no available devices found, must be one of 'XPU', 'CUDA', or 'NPU'."
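
The dataclasses.py changes are additive enum and bookkeeping updates: `DistributedType.MULTI_HPU`, `DynamoBackend.HPU_BACKEND`, and an `is_hpu_available()` branch in the `__post_init__` device lookup. A small illustration of the new members; the last line is an assumption about how the dynamo backend string is routed to torch.compile, not something this diff shows:

from accelerate import Accelerator
from accelerate.utils import DistributedType, DynamoBackend

# Both enums subclass str, so members compare equal to their string values.
assert DistributedType.MULTI_HPU == "MULTI_HPU"
assert DynamoBackend.HPU_BACKEND == "HPU_BACKEND"

# Assumption: this routes torch.compile to the Gaudi backend shipped with habana_frameworks.
accelerator = Accelerator(dynamo_backend="hpu_backend")
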
22 changes: 22 additions & 0 deletions src/accelerate/utils/imports.py
@@ -378,6 +378,28 @@ def is_npu_available(check_device=False):
return hasattr(torch, "npu") and torch.npu.is_available()


@lru_cache
def is_hpu_available(check_device=False):
"Checks if `torch_hpu` is installed and potentially if a HPU is in the environment"
if importlib.util.find_spec("habana_frameworks") is None:
return False

import habana_frameworks.torch # noqa: F401
import habana_frameworks.torch.distributed.hccl as hccl # noqa: F401

if check_device:
try:
import habana_frameworks.torch.utils.experimental as htexp

if htexp.hpu.is_available():
_ = htexp.hpu.device_count()
return True
return False
except RuntimeError:
return False
return True


@lru_cache
def is_xpu_available(check_device=False):
"""
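
`is_hpu_available` first checks that habana_frameworks can be found, then imports habana_frameworks.torch (which patches torch with the `torch.hpu` namespace) and, with `check_device=True`, also verifies that a device actually responds. A short usage sketch (the CPU fallback message is illustrative only):

import torch

from accelerate.utils import is_hpu_available

if is_hpu_available(check_device=True):
    # habana_frameworks.torch was imported inside the helper, so torch.hpu exists here.
    device = torch.device("hpu")
    print(f"Found {torch.hpu.device_count()} HPU device(s)")
else:
    device = torch.device("cpu")
    print("No usable HPU found; falling back to CPU.")
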