Merged
166 commits
4f462b0
init
IlyasMoutawwakil Feb 4, 2025
7b51103
style
IlyasMoutawwakil Feb 4, 2025
9d7376e
is_hpu_available
IlyasMoutawwakil Feb 4, 2025
069b88a
fix
IlyasMoutawwakil Feb 4, 2025
cd3cbb9
import habana_frameworks.torch.distributed.hccl
IlyasMoutawwakil Feb 4, 2025
2493abe
style
IlyasMoutawwakil Feb 4, 2025
32cbc88
test
IlyasMoutawwakil Feb 4, 2025
5fd4de2
initialize dist proc group
IlyasMoutawwakil Feb 4, 2025
7f72745
revert
IlyasMoutawwakil Feb 5, 2025
f66c5df
set backend to hccl only if hccl initialization sets a local rank
IlyasMoutawwakil Feb 5, 2025
2a4130d
force backend hccl and multi_hpu type when sure of distributed launch
IlyasMoutawwakil Feb 5, 2025
fa1bc44
style
IlyasMoutawwakil Feb 5, 2025
d3e24c5
pass accelerator tests
IlyasMoutawwakil Feb 6, 2025
00cc283
pass big modeling tests with bigger atol/rtol for accelerators
IlyasMoutawwakil Feb 6, 2025
97081da
fix hpu device count and skip tests requiring hpu:x
IlyasMoutawwakil Feb 6, 2025
ddcb3ca
hpu autocast
IlyasMoutawwakil Feb 6, 2025
6de389c
hpu rng_state
IlyasMoutawwakil Feb 7, 2025
ae9a76b
hpu launch
IlyasMoutawwakil Feb 7, 2025
5b8b0b2
hpu special device placement
IlyasMoutawwakil Feb 7, 2025
a2f8040
hpu launch
IlyasMoutawwakil Feb 7, 2025
6abecdd
rng state
IlyasMoutawwakil Feb 7, 2025
7bc37dc
distributed data loop tests
IlyasMoutawwakil Feb 7, 2025
ef1de61
enforce non contiguity after device memory allocation
IlyasMoutawwakil Feb 7, 2025
1b6905e
pass fsdp tests
IlyasMoutawwakil Feb 7, 2025
defe3fa
enforce pt_hpu_lazy_mode=0 when fsdp testing
IlyasMoutawwakil Feb 7, 2025
9551ce3
pass cli tests
IlyasMoutawwakil Feb 10, 2025
9c84fe7
pass and document grad sync tests
IlyasMoutawwakil Feb 10, 2025
6f00591
pass kwargs handler and autocast tests
IlyasMoutawwakil Feb 10, 2025
c94bfbd
memory utils
IlyasMoutawwakil Feb 10, 2025
61235d3
found source of int64 errors
IlyasMoutawwakil Feb 10, 2025
0896a50
skip some modeling utils tests
IlyasMoutawwakil Feb 10, 2025
e974758
enable int64
IlyasMoutawwakil Feb 10, 2025
ee08748
skip optimizer tests
IlyasMoutawwakil Feb 10, 2025
6f0fbe4
pass checkpointing tests
IlyasMoutawwakil Feb 10, 2025
c5c50c6
pass accelerator tests with safetensors main
IlyasMoutawwakil Feb 10, 2025
34010c9
more hpu stuff
IlyasMoutawwakil Feb 10, 2025
9f75a6e
Merge branch 'main' into hpu-support
IlyasMoutawwakil Feb 10, 2025
e80b484
style
IlyasMoutawwakil Feb 10, 2025
5cacc31
remove PT_HPU_LAZY_MODE and PT_ENABLE_INT64_SUPPORT as they should be…
IlyasMoutawwakil Feb 15, 2025
f006c4e
start testing on gaudi2
IlyasMoutawwakil Feb 17, 2025
19e652a
support fp16 on gaudi2
IlyasMoutawwakil Feb 17, 2025
40d22b1
add testing order
IlyasMoutawwakil Feb 17, 2025
eb37c43
custom hpu fsdp env dict
IlyasMoutawwakil Feb 17, 2025
dc4ca51
fix torch trace malloc
IlyasMoutawwakil Feb 17, 2025
74b307a
test ddp half precision comm hooks
IlyasMoutawwakil Feb 17, 2025
5a6d5ef
fix
IlyasMoutawwakil Feb 17, 2025
5a1c0c9
fix
IlyasMoutawwakil Feb 17, 2025
50d9e71
remove lower bound for hpu
IlyasMoutawwakil Feb 17, 2025
f0579e8
use 0.72 as lower bound
IlyasMoutawwakil Feb 17, 2025
dfc82ec
lower lower bound
IlyasMoutawwakil Feb 17, 2025
176e3d2
order deepspeed tests
IlyasMoutawwakil Feb 17, 2025
6c688d0
fix
IlyasMoutawwakil Feb 17, 2025
b078e90
deepspeed_use_hpu
IlyasMoutawwakil Feb 17, 2025
0dcb46a
assert non lazy mode with offloaded optimizer
IlyasMoutawwakil Feb 18, 2025
5abb1a4
make patching torch with habana frameworks the default
IlyasMoutawwakil Feb 18, 2025
b63a6fa
less of require_non_hpu
IlyasMoutawwakil Feb 18, 2025
36f8794
skip test_multi_device_merge_fsdp_weights for now as it halts
IlyasMoutawwakil Feb 18, 2025
ab5cbb0
skip another flaky test
IlyasMoutawwakil Feb 18, 2025
e318161
format
IlyasMoutawwakil Feb 18, 2025
0c040c3
use habana_visible_modules
IlyasMoutawwakil Feb 18, 2025
6f5977e
patch torch hpu device count
IlyasMoutawwakil Feb 18, 2025
f1e196f
avoid setting HABANA_VISIBLE_MODULES
IlyasMoutawwakil Feb 18, 2025
2772b68
don't play with habana visible devices/modules
IlyasMoutawwakil Feb 18, 2025
7d1ef62
only with hpu
IlyasMoutawwakil Feb 18, 2025
427c313
fixes and skips
IlyasMoutawwakil Feb 18, 2025
be91183
skip
IlyasMoutawwakil Feb 18, 2025
5c0cd84
fix device ids and add some todos
IlyasMoutawwakil Feb 19, 2025
ae1431a
skip offloading with generate()
IlyasMoutawwakil Feb 19, 2025
d383ea5
fix
IlyasMoutawwakil Feb 19, 2025
0b62d52
reduced atol/rtol for hpu
IlyasMoutawwakil Feb 19, 2025
f2504a5
fix
IlyasMoutawwakil Feb 19, 2025
f5cf0d5
tag deepspeed tests that should run first
IlyasMoutawwakil Feb 19, 2025
ac434c2
enable a test path that was skipped
IlyasMoutawwakil Feb 19, 2025
1501105
revert a test that was customized for gaudi1
IlyasMoutawwakil Feb 19, 2025
8b5708e
some patching to enable HABANA_VISIBLE_MODULES
IlyasMoutawwakil Feb 19, 2025
8935766
fix zero3 test
IlyasMoutawwakil Feb 19, 2025
d8301cd
misc
IlyasMoutawwakil Feb 19, 2025
6ce9e3a
test DTensor TP
IlyasMoutawwakil Feb 19, 2025
42775d2
remove gaudi1
IlyasMoutawwakil Feb 19, 2025
788e95f
test
IlyasMoutawwakil Feb 20, 2025
03b391e
style
IlyasMoutawwakil Feb 20, 2025
2247739
comment
IlyasMoutawwakil Feb 20, 2025
07ba582
pass pad_across_processes
IlyasMoutawwakil Feb 20, 2025
647dfab
require_fp16
IlyasMoutawwakil Feb 20, 2025
8e63b29
pass memory utils test
IlyasMoutawwakil Feb 20, 2025
6b1d131
test_ddp_comm_hook
IlyasMoutawwakil Feb 20, 2025
7803291
skip half precision comm hooks on hpu
IlyasMoutawwakil Feb 20, 2025
2883ca1
fix
IlyasMoutawwakil Feb 20, 2025
007d4a8
is_fp16_available
IlyasMoutawwakil Feb 20, 2025
9c12fae
fp16
IlyasMoutawwakil Feb 20, 2025
324d6df
tp as part of integration tests
IlyasMoutawwakil Feb 20, 2025
839c6be
fix
IlyasMoutawwakil Feb 20, 2025
3e548f4
write_basic_config
IlyasMoutawwakil Feb 20, 2025
f67a898
safetensors
IlyasMoutawwakil Feb 20, 2025
f449d3f
local sgd and masked_fill_fwd_i64
IlyasMoutawwakil Feb 20, 2025
79ef8a5
fix num_processes in test_load_states_by_steps
IlyasMoutawwakil Feb 20, 2025
f772b76
fp8 support
IlyasMoutawwakil Feb 24, 2025
6218cec
test
IlyasMoutawwakil Feb 24, 2025
31872f6
Merge branch 'main' into hpu-support
IlyasMoutawwakil Feb 24, 2025
610c68b
fix
IlyasMoutawwakil Feb 24, 2025
347db07
add a workflow
IlyasMoutawwakil Feb 25, 2025
5fc5a2a
Update src/accelerate/accelerator.py
IlyasMoutawwakil Feb 25, 2025
dc7a773
review comments
IlyasMoutawwakil Feb 25, 2025
9606f0d
ci
IlyasMoutawwakil Feb 25, 2025
6b77bc4
style
IlyasMoutawwakil Feb 25, 2025
d556021
comments
IlyasMoutawwakil Feb 26, 2025
e2fe2cc
test
IlyasMoutawwakil Feb 26, 2025
05e6861
habana_frameworks.torch
IlyasMoutawwakil Feb 26, 2025
ef6192c
patch device count
IlyasMoutawwakil Feb 26, 2025
59b51e5
fix
IlyasMoutawwakil Feb 26, 2025
c6731f5
fix
IlyasMoutawwakil Feb 26, 2025
66ec449
require_fp8
IlyasMoutawwakil Feb 26, 2025
28dae91
fix
IlyasMoutawwakil Feb 27, 2025
ec9c562
fix
IlyasMoutawwakil Feb 27, 2025
53f99c3
gaudi 1
IlyasMoutawwakil Feb 27, 2025
5f9928d
remove unnecessary
IlyasMoutawwakil Feb 27, 2025
ddbece5
fixed masked fill error in transformers
IlyasMoutawwakil Feb 28, 2025
72bd312
style
IlyasMoutawwakil Feb 28, 2025
506d07e
balanced_memory pass on hpu
IlyasMoutawwakil Mar 3, 2025
ae67bcc
remove for now
IlyasMoutawwakil Mar 3, 2025
405b857
run first
IlyasMoutawwakil Mar 4, 2025
27be94c
Apply suggestions from code review
IlyasMoutawwakil Mar 5, 2025
4e0e966
Merge branch 'main' into hpu-support
IlyasMoutawwakil Mar 5, 2025
e2a8d85
style after merge
IlyasMoutawwakil Mar 5, 2025
03e2646
Update src/accelerate/accelerator.py
IlyasMoutawwakil Mar 6, 2025
3ed87c1
Update src/accelerate/utils/transformer_engine.py
IlyasMoutawwakil Mar 6, 2025
2dcab3e
Merge branch 'main' into hpu-support
IlyasMoutawwakil Mar 6, 2025
55b0d3c
empty cache review comments
IlyasMoutawwakil Mar 6, 2025
bd2afc3
test_script.py error messages
IlyasMoutawwakil Mar 6, 2025
75e5b81
AccelerateTestCase for accelerator state cleanup
IlyasMoutawwakil Mar 6, 2025
e5dfad4
test
IlyasMoutawwakil Mar 7, 2025
ed84e7b
add gaudi1 workflow
IlyasMoutawwakil Mar 7, 2025
a05e54a
fp8 availability
IlyasMoutawwakil Mar 7, 2025
eb0b3a3
fix
IlyasMoutawwakil Mar 7, 2025
7b2650a
reduce batch size
IlyasMoutawwakil Mar 7, 2025
9b227d8
concurrency
IlyasMoutawwakil Mar 7, 2025
8cf20cd
check cuda as well
IlyasMoutawwakil Mar 7, 2025
7c4897b
nits and comments
IlyasMoutawwakil Mar 7, 2025
d0485f1
mark fsdp tests that require_fp16
IlyasMoutawwakil Mar 7, 2025
c37aefd
style
IlyasMoutawwakil Mar 7, 2025
bdae68d
mark deepspeed fp16 tests
IlyasMoutawwakil Mar 7, 2025
d919931
update image
IlyasMoutawwakil Mar 7, 2025
efd2a27
fix
IlyasMoutawwakil Mar 9, 2025
394b687
updated
IlyasMoutawwakil Mar 9, 2025
4f76d2c
better msgs
IlyasMoutawwakil Mar 9, 2025
b3dd375
skip pippy
IlyasMoutawwakil Mar 9, 2025
17d43ab
test
IlyasMoutawwakil Mar 9, 2025
db16287
test on 2 device
IlyasMoutawwakil Mar 9, 2025
e359c01
support up to 1% relative error in test_accelerate
IlyasMoutawwakil Mar 9, 2025
e9cfca4
skip hpu fp16
IlyasMoutawwakil Mar 9, 2025
ac41600
allow for 1 byte difference
IlyasMoutawwakil Mar 9, 2025
8571ef4
revert torch_device change
IlyasMoutawwakil Mar 9, 2025
3115ee4
style
IlyasMoutawwakil Mar 9, 2025
7c6a44a
skip memory release since it's flaky
IlyasMoutawwakil Mar 9, 2025
e8f9a48
add accelerator state cleanup to fixture
IlyasMoutawwakil Mar 9, 2025
3face36
fix
IlyasMoutawwakil Mar 9, 2025
06c1f53
atol
IlyasMoutawwakil Mar 9, 2025
75aaabd
fix
IlyasMoutawwakil Mar 9, 2025
21fca86
more rtol
IlyasMoutawwakil Mar 10, 2025
a99c297
equal grad test
IlyasMoutawwakil Mar 10, 2025
81a37be
revert
IlyasMoutawwakil Mar 10, 2025
92775af
pass pippy on gaudi2 and skip on gaudi1
IlyasMoutawwakil Mar 10, 2025
ce13eeb
enable sd 1.5 test with require fp16
IlyasMoutawwakil Mar 10, 2025
04983cc
added warning on memory release
IlyasMoutawwakil Mar 10, 2025
5efbe8c
don't log warning in memory release as it requires PartialState to be…
IlyasMoutawwakil Mar 10, 2025
4847474
Apply suggestions from code review
IlyasMoutawwakil Mar 11, 2025
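Several of the commits above ("is_hpu_available", "import habana_frameworks.torch.distributed.hccl", "patch torch hpu device count") deal with detecting Gaudi hardware before any torch.hpu API is touched. A rough sketch of what such a check typically looks like — not necessarily the exact helper merged here, and torch.hpu.is_available() is assumed to be provided by the Habana plugin:

import importlib.util


def is_hpu_available() -> bool:
    # torch.hpu only exists once the Habana plugin has been imported;
    # importing habana_frameworks.torch registers the "hpu" device with PyTorch.
    if importlib.util.find_spec("habana_frameworks") is None:
        return False
    import habana_frameworks.torch  # noqa: F401

    import torch

    return hasattr(torch, "hpu") and torch.hpu.is_available()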
77 changes: 77 additions & 0 deletions .github/workflows/gaudi1.yml
@@ -0,0 +1,77 @@
name: Gaudi1 tests (scheduled)

on:
  workflow_dispatch:
  schedule:
    - cron: "0 2 * * *"

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  run_gaudi1_tests:
    name: Test on Gaudi1
    runs-on:
      group: aws-dl1-24xlarge

    container:
      image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES=0,1
      env:
        OMPI_MCA_btl_vader_single_copy_mechanism: none
        PT_ENABLE_INT64_SUPPORT: 1
        PT_HPU_LAZY_MODE: 0
        RUN_SLOW: 1

    steps:
      - name: HL-SMI (1)
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Extract HPU visible modules
        id: add-modules
        run: |
          export HABANA_VISIBLE_MODULES=$(hl-smi -Q module_id -f csv,noheader | tr '\n' ',' | sed 's/,$//')
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" >> $GITHUB_ENV

      - name: HL-SMI (2)
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Checkout to Accelerate
        uses: actions/checkout@v4

      - name: Install Accelerate with Transformers & DeepSpeed
        run: |
          pip install -e .[testing] \
            git+https://github.com/HabanaAI/[email protected] \
            git+https://github.com/huggingface/transformers.git@hpu-support

      - name: Run CLI tests
        run: |
          make test_cli

      - name: Run Core tests
        run: |
          make test_core

      - name: Run Big Modeling tests
        run: |
          make test_big_modeling

      - name: Run FSDP integration tests
        run: |
          make test_fsdp

      - name: Run DeepSpeed integration tests
        run: |
          make test_deepspeed

      - name: Run Examples tests
        run: |
          make test_examples
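The "Extract HPU visible modules" step above derives HABANA_VISIBLE_MODULES from hl-smi. For local debugging, the same derivation could be done from Python; a hedged sketch (the helper name is made up, the hl-smi flags are the ones used in the workflow):

import os
import subprocess


def visible_hpu_modules() -> str:
    # Query the module id of every visible Gaudi card and join them into the
    # comma-separated form HABANA_VISIBLE_MODULES expects, e.g. "0,1".
    out = subprocess.run(
        ["hl-smi", "-Q", "module_id", "-f", "csv,noheader"],
        capture_output=True,
        text=True,
        check=True,
    ).stdout
    return ",".join(line.strip() for line in out.splitlines() if line.strip())


if __name__ == "__main__":
    os.environ["HABANA_VISIBLE_MODULES"] = visible_hpu_modules()
    print(os.environ["HABANA_VISIBLE_MODULES"])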
8 changes: 6 additions & 2 deletions Makefile
@@ -28,7 +28,7 @@ test_big_modeling:

test_core:
python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py --ignore=./tests/deepspeed --ignore=./tests/test_big_modeling.py \
--ignore=./tests/fsdp --ignore=./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_core.log",)
--ignore=./tests/fsdp --ignore=./tests/tp --ignore=./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_core.log",)
Member (Author): not sure TP should be part of test_core, tell me if you want me to revert this.

Member: yeah i don't think we want that cc @muellerzr

Contributor: Agreed

test_cli:
python -m pytest -s -v ./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_cli.log",)
@@ -39,6 +39,9 @@ test_deepspeed:
test_fsdp:
python -m pytest -s -v ./tests/fsdp $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_fsdp.log",)

test_tp:
python -m pytest -s -v ./tests/tp $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_tp.log",)

# Since the new version of pytest will *change* how things are collected, we need `deepspeed` to
# run after test_core and test_cli
test:
@@ -47,13 +50,14 @@ test:
$(MAKE) test_big_modeling
$(MAKE) test_deepspeed
$(MAKE) test_fsdp
$(MAKE) test_tp

test_examples:
python -m pytest -s -v ./tests/test_examples.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_examples.log",)

# Broken down example tests for the CI runners
test_integrations:
python -m pytest -s -v ./tests/deepspeed ./tests/fsdp $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_integrations.log",)
python -m pytest -s -v ./tests/deepspeed ./tests/fsdp ./tests/tp $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_integrations.log",)

test_example_differences:
python -m pytest -s -v ./tests/test_examples.py::ExampleDifferenceTests $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_example_diff.log",)
2 changes: 1 addition & 1 deletion examples/inference/distributed/stable_diffusion.py
@@ -18,7 +18,7 @@
from accelerate import PartialState # Can also be Accelerator or AcceleratorState


pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
distributed_state = PartialState()
pipe.to(distributed_state.device)

18 changes: 12 additions & 6 deletions examples/inference/pippy/bert.py
@@ -17,9 +17,15 @@
from transformers import AutoModelForMaskedLM

from accelerate import PartialState, prepare_pippy
from accelerate.test_utils import torch_device
from accelerate.utils import set_seed


if torch_device == "hpu":
synchronize_func = torch.hpu.synchronize
else:
synchronize_func = torch.cuda.synchronize

# Set the random seed to have reproducable outputs
set_seed(42)

@@ -60,25 +66,25 @@
)

# Move the inputs to the first device
input = input.to("cuda:0")
input = input.to(torch_device)

# Take an average of 5 times
# Measure first batch
torch.cuda.synchronize()
synchronize_func()
start_time = time.time()
with torch.no_grad():
output = model(input)
torch.cuda.synchronize()
synchronize_func()
end_time = time.time()
first_batch = end_time - start_time

# Now that CUDA is init, measure after
torch.cuda.synchronize()
# Now that hpu is init, measure after
synchronize_func()
start_time = time.time()
for i in range(5):
with torch.no_grad():
output = model(input)
torch.cuda.synchronize()
synchronize_func()
end_time = time.time()

# The outputs are only on the final process by default
19 changes: 13 additions & 6 deletions examples/inference/pippy/gpt2.py
@@ -17,9 +17,16 @@
from transformers import AutoModelForSequenceClassification

from accelerate import PartialState, prepare_pippy
from accelerate.test_utils import torch_device
from accelerate.utils import set_seed


if torch_device == "hpu":
synchronize_func = torch.hpu.synchronize
else:
synchronize_func = torch.cuda.synchronize


# Set the random seed to have reproducable outputs
set_seed(42)

@@ -59,25 +66,25 @@
)

# Move the inputs to the first device
input = input.to("cuda:0")
input = input.to(torch_device)

# Take an average of 5 times
# Measure first batch
torch.cuda.synchronize()
synchronize_func()
start_time = time.time()
with torch.no_grad():
output = model(input)
torch.cuda.synchronize()
synchronize_func()
end_time = time.time()
first_batch = end_time - start_time

# Now that CUDA is init, measure after
torch.cuda.synchronize()
# Now that device/backend is init, measure after
synchronize_func()
start_time = time.time()
for i in range(5):
with torch.no_grad():
output = model(input)
torch.cuda.synchronize()
synchronize_func()
end_time = time.time()

# The outputs are only on the final process by default
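Both pippy examples now pick synchronize_func from torch_device instead of hard-coding torch.cuda.synchronize. A hedged, more general variant of the same idea, looking the backend module up by device type (PartialState and the per-backend synchronize() methods are the only assumptions):

import torch
from accelerate import PartialState

state = PartialState()


def synchronize() -> None:
    # torch.cuda, torch.hpu, torch.xpu, ... all expose a synchronize();
    # fall back to a no-op for device types that don't.
    backend = getattr(torch, state.device.type, None)
    if backend is not None and hasattr(backend, "synchronize"):
        backend.synchronize()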
2 changes: 1 addition & 1 deletion setup.py
@@ -22,7 +22,7 @@
"ruff ~= 0.6.4",
]
extras["docs"] = []
extras["test_prod"] = ["pytest>=7.2.0,<=8.0.0", "pytest-xdist", "pytest-subtests", "parameterized"]
extras["test_prod"] = ["pytest>=7.2.0,<=8.0.0", "pytest-xdist", "pytest-subtests", "parameterized", "pytest-order"]
Contributor: TIL 👀

extras["test_dev"] = [
"datasets",
"diffusers",
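pytest-order is what the "add testing order" / "tag deepspeed tests that should run first" commits rely on to pin collection order. A minimal, hedged illustration of the marker it provides (the test names here are made up):

import pytest


@pytest.mark.order("first")  # runs before any unordered test in the session
def test_deepspeed_smoke():
    assert True


@pytest.mark.order(2)  # ordered tests run in ascending index after the "first" group
def test_deepspeed_training_step():
    assert True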
38 changes: 32 additions & 6 deletions src/accelerate/accelerator.py
@@ -174,7 +174,7 @@

class Accelerator:
"""
Creates an instance of an accelerator for distributed training (on multi-GPU, TPU) or mixed precision training.
Creates an instance of an accelerator for distributed training or mixed precision training.

Args:
device_placement (`bool`, *optional*, defaults to `True`):
@@ -534,9 +534,16 @@ def __init__(
and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
):
self.native_amp = True
if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu", "musa", "sdaa") or is_torch_xla_available(
check_is_tpu=True
):
if self.device.type not in (
"xpu",
"cuda",
"npu",
"xla",
"mlu",
"musa",
"hpu",
"sdaa",
) or is_torch_xla_available(check_is_tpu=True):
raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}
self.scaler = get_grad_scaler(self.distributed_type, **kwargs)
@@ -545,7 +552,7 @@ def __init__(
DistributedType.DEEPSPEED,
DistributedType.MEGATRON_LM,
):
if self.device.type in ["cpu", "xpu"]:
if self.device.type in ["cpu", "xpu", "hpu"]:
self.native_amp = True
else:
self.native_amp = is_bf16_available(True)
@@ -1202,6 +1209,7 @@ def join_uneven_inputs(self, joinables, even_batches=None):
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_XPU,
DistributedType.MULTI_HPU,
):
dl_even_batches_values = []

@@ -1437,6 +1445,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
"""
if device_placement is None:
device_placement = self.device_placement and self.distributed_type != DistributedType.FSDP

self._models.append(model)

# TODO: Look at enabling native TP training directly with a proper config
@@ -1515,12 +1524,16 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
DistributedType.MULTI_HPU,
):
if any(p.requires_grad for p in model.parameters()):
kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
# TODO: Look at enabling native TP training directly with a proper config
if os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true":
device_ids, output_device = [self.local_process_index], self.local_process_index
if self.device.type == "hpu":
device_ids, output_device = [self.device.index], self.device.index
else:
device_ids, output_device = [self.local_process_index], self.local_process_index
else:
device_ids, output_device = None, None

@@ -1920,13 +1933,25 @@ def _prepare_deepspeed(self, *args):
if self.deepspeed_config["zero_optimization"].get("offload_optimizer", {}).get(
"device", "none"
) != "none" and self.deepspeed_config.get("zero_force_ds_cpu_optimizer", True):
if self.device.type == "hpu" and os.environ.get("PT_HPU_LAZY_MODE", "1") == "1":
raise ValueError(
"You can't use an Offload Optimizer with HPU in Lazy Mode. "
"Please set the environment variable `PT_HPU_LAZY_MODE` to `0`."
)

optimizer = map_pytorch_optim_to_deepspeed(optimizer)
kwargs["optimizer"] = optimizer
if scheduler is not None:
if type(scheduler).__name__ in deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES:
kwargs["lr_scheduler"] = scheduler

if self.device.type == "hpu":
# This env variable is initialized here to make sure it is set to "true"
# It should be done by the launcher but it does not work for multi-node runs
os.environ["DEEPSPEED_USE_HPU"] = "true"

engine, optimizer, _, lr_scheduler = ds_initialize(**kwargs)

if compare_versions("deepspeed", ">=", "0.14.4") and self.state.dynamo_plugin.backend != DynamoBackend.NO:
compile_kwargs = self.state.dynamo_plugin.to_kwargs()
engine.compile(backend=compile_kwargs.pop("backend"), compile_kwargs=compile_kwargs)
@@ -3318,6 +3343,7 @@ def _inner(folder):
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_HPU,
):
map_location = "on_device"
else:
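From a user's perspective, the new branches above (native bf16 autocast on hpu, HPU-specific DDP device ids, and the lazy-mode guard in _prepare_deepspeed) are opted into roughly like this; a hedged sketch that falls back to CPU/CUDA when no Gaudi card is present:

import os

# Eager mode; the _prepare_deepspeed guard above rejects an offloaded optimizer
# while PT_HPU_LAZY_MODE=1 (set this before habana_frameworks.torch is imported).
os.environ.setdefault("PT_HPU_LAZY_MODE", "0")

import torch
from accelerate import Accelerator

accelerator = Accelerator(mixed_precision="bf16")  # native bf16 autocast path for cpu/xpu/hpu
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)
print(accelerator.device)  # hpu:0 on Gaudi, otherwise cuda:0 / cpu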
6 changes: 5 additions & 1 deletion src/accelerate/checkpointing.py
@@ -32,6 +32,8 @@
SCHEDULER_NAME,
WEIGHTS_NAME,
get_pretty_name,
is_cuda_available,
is_hpu_available,
is_mlu_available,
is_musa_available,
is_sdaa_available,
@@ -158,7 +160,9 @@ def save_accelerator_state(
states["torch_sdaa_manual_seed"] = torch.sdaa.get_rng_state_all()
elif is_musa_available():
states["torch_musa_manual_seed"] = torch.musa.get_rng_state_all()
else:
if is_hpu_available():
states["torch_hpu_manual_seed"] = torch.hpu.get_rng_state_all()
if is_cuda_available():
states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
if is_torch_xla_available():
states["xm_seed"] = xm.get_rng_state()
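The change above turns the tail of the elif chain into independent checks, so HPU, CUDA and XLA RNG states can all be captured in a single save. A hedged sketch of the resulting shape (restoring with matching set_rng_state_all calls is an assumption based on the CUDA API):

import torch
from accelerate.utils import is_cuda_available, is_hpu_available

states = {}
if is_hpu_available():
    states["torch_hpu_manual_seed"] = torch.hpu.get_rng_state_all()
if is_cuda_available():
    states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
# reload later with the corresponding torch.hpu / torch.cuda set_rng_state_all(states[...])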
17 changes: 16 additions & 1 deletion src/accelerate/commands/config/default.py
@@ -18,7 +18,14 @@

import torch

from ...utils import is_mlu_available, is_musa_available, is_npu_available, is_sdaa_available, is_xpu_available
from ...utils import (
is_hpu_available,
is_mlu_available,
is_musa_available,
is_npu_available,
is_sdaa_available,
is_xpu_available,
)
from .config_args import ClusterConfig, default_json_config_file
from .config_utils import SubcommandHelpFormatter

@@ -81,6 +88,14 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
config["distributed_type"] = "MULTI_MUSA"
else:
config["distributed_type"] = "NO"
elif is_hpu_available():
num_hpus = torch.hpu.device_count()
config["num_processes"] = num_hpus
config["use_cpu"] = False
if num_hpus > 1:
config["distributed_type"] = "MULTI_HPU"
else:
config["distributed_type"] = "NO"
elif torch.cuda.is_available():
num_gpus = torch.cuda.device_count()
config["num_processes"] = num_gpus
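The same default config can be produced from the CLI with "accelerate config default" or programmatically; a hedged usage sketch, assuming write_basic_config stays re-exported under accelerate.utils:

from accelerate.utils import write_basic_config

# On a multi-card Gaudi host the new branch above should write
# distributed_type: MULTI_HPU with num_processes = torch.hpu.device_count();
# elsewhere it falls through to the CUDA/NPU/... branches as before.
write_basic_config(mixed_precision="no")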