Merged
166 commits
4f462b0
init
IlyasMoutawwakil Feb 4, 2025
7b51103
style
IlyasMoutawwakil Feb 4, 2025
9d7376e
is_hpu_available
IlyasMoutawwakil Feb 4, 2025
069b88a
fix
IlyasMoutawwakil Feb 4, 2025
cd3cbb9
import habana_frameworks.torch.distributed.hccl
IlyasMoutawwakil Feb 4, 2025
2493abe
style
IlyasMoutawwakil Feb 4, 2025
32cbc88
test
IlyasMoutawwakil Feb 4, 2025
5fd4de2
initialize dist proc group
IlyasMoutawwakil Feb 4, 2025
7f72745
revert
IlyasMoutawwakil Feb 5, 2025
f66c5df
set backend to hccl only if hccl initialization sets a local rank
IlyasMoutawwakil Feb 5, 2025
2a4130d
force backend hccl and multi_hpu type when sure of distributed launch
IlyasMoutawwakil Feb 5, 2025
fa1bc44
style
IlyasMoutawwakil Feb 5, 2025
d3e24c5
pass accelerator tests
IlyasMoutawwakil Feb 6, 2025
00cc283
pass big modeling tests with bigger atol/rtol for accelerators
IlyasMoutawwakil Feb 6, 2025
97081da
fix hpu device count and skip tests requiring hpu:x
IlyasMoutawwakil Feb 6, 2025
ddcb3ca
hpu autocast
IlyasMoutawwakil Feb 6, 2025
6de389c
hpu rng_state
IlyasMoutawwakil Feb 7, 2025
ae9a76b
hpu launch
IlyasMoutawwakil Feb 7, 2025
5b8b0b2
hpu special device placement
IlyasMoutawwakil Feb 7, 2025
a2f8040
hpu launch
IlyasMoutawwakil Feb 7, 2025
6abecdd
rng state
IlyasMoutawwakil Feb 7, 2025
7bc37dc
distributed data loop tests
IlyasMoutawwakil Feb 7, 2025
ef1de61
enforce non contiguity after device memory allocation
IlyasMoutawwakil Feb 7, 2025
1b6905e
pass fsdp tests
IlyasMoutawwakil Feb 7, 2025
defe3fa
enforce pt_hpu_lazy_mode=0 when fsdp testing
IlyasMoutawwakil Feb 7, 2025
9551ce3
pass cli tests
IlyasMoutawwakil Feb 10, 2025
9c84fe7
pass and document grad sync tests
IlyasMoutawwakil Feb 10, 2025
6f00591
pass kwargs handler and autocast tests
IlyasMoutawwakil Feb 10, 2025
c94bfbd
memory utils
IlyasMoutawwakil Feb 10, 2025
61235d3
found source of int64 errors
IlyasMoutawwakil Feb 10, 2025
0896a50
skip some modeling utils tests
IlyasMoutawwakil Feb 10, 2025
e974758
enable int64
IlyasMoutawwakil Feb 10, 2025
ee08748
skip optimizer tests
IlyasMoutawwakil Feb 10, 2025
6f0fbe4
pass checkpointing tests
IlyasMoutawwakil Feb 10, 2025
c5c50c6
pass accelerator tests with safetensors main
IlyasMoutawwakil Feb 10, 2025
34010c9
more hpu stuff
IlyasMoutawwakil Feb 10, 2025
9f75a6e
Merge branch 'main' into hpu-support
IlyasMoutawwakil Feb 10, 2025
e80b484
style
IlyasMoutawwakil Feb 10, 2025
5cacc31
remove PT_HPU_LAZY_MODE and PT_ENABLE_INT64_SUPPORT as they should be…
IlyasMoutawwakil Feb 15, 2025
f006c4e
start testing on gaudi2
IlyasMoutawwakil Feb 17, 2025
19e652a
support fp16 on gaudi2
IlyasMoutawwakil Feb 17, 2025
40d22b1
add testing order
IlyasMoutawwakil Feb 17, 2025
eb37c43
custom hpu fsdp env dict
IlyasMoutawwakil Feb 17, 2025
dc4ca51
fix torch trace malloc
IlyasMoutawwakil Feb 17, 2025
74b307a
test ddp half precision comm hooks
IlyasMoutawwakil Feb 17, 2025
5a6d5ef
fix
IlyasMoutawwakil Feb 17, 2025
5a1c0c9
fix
IlyasMoutawwakil Feb 17, 2025
50d9e71
remove lower bound for hpu
IlyasMoutawwakil Feb 17, 2025
f0579e8
use 0.72 as lower bound
IlyasMoutawwakil Feb 17, 2025
dfc82ec
lower lower bound
IlyasMoutawwakil Feb 17, 2025
176e3d2
order deepspeed tests
IlyasMoutawwakil Feb 17, 2025
6c688d0
fix
IlyasMoutawwakil Feb 17, 2025
b078e90
deepspeed_use_hpu
IlyasMoutawwakil Feb 17, 2025
0dcb46a
assert non lazy mode with offloaded optimizer
IlyasMoutawwakil Feb 18, 2025
5abb1a4
make patching torch with habana frameworks the default
IlyasMoutawwakil Feb 18, 2025
b63a6fa
less of require_non_hpu
IlyasMoutawwakil Feb 18, 2025
36f8794
skip test_multi_device_merge_fsdp_weights for now as it halts
IlyasMoutawwakil Feb 18, 2025
ab5cbb0
skip another flaky test
IlyasMoutawwakil Feb 18, 2025
e318161
format
IlyasMoutawwakil Feb 18, 2025
0c040c3
use habana_visible_modules
IlyasMoutawwakil Feb 18, 2025
6f5977e
patch torch hpu device count
IlyasMoutawwakil Feb 18, 2025
f1e196f
avoid setting HABANA_VISIBLE_MODULES
IlyasMoutawwakil Feb 18, 2025
2772b68
don't play with habana visible devices/modules
IlyasMoutawwakil Feb 18, 2025
7d1ef62
only with hpu
IlyasMoutawwakil Feb 18, 2025
427c313
fixes and skips
IlyasMoutawwakil Feb 18, 2025
be91183
skip
IlyasMoutawwakil Feb 18, 2025
5c0cd84
fix device ids and add some todos
IlyasMoutawwakil Feb 19, 2025
ae1431a
skip offloading with generate()
IlyasMoutawwakil Feb 19, 2025
d383ea5
fix
IlyasMoutawwakil Feb 19, 2025
0b62d52
reduced atol/rtol for hpu
IlyasMoutawwakil Feb 19, 2025
f2504a5
fix
IlyasMoutawwakil Feb 19, 2025
f5cf0d5
tag deepspeed tests that should run first
IlyasMoutawwakil Feb 19, 2025
ac434c2
enable a test path that was skipped
IlyasMoutawwakil Feb 19, 2025
1501105
revert a test that was customized for gaudi1
IlyasMoutawwakil Feb 19, 2025
8b5708e
some patching to enable HABANA_VISIBLE_MODULES
IlyasMoutawwakil Feb 19, 2025
8935766
fix zero3 test
IlyasMoutawwakil Feb 19, 2025
d8301cd
misc
IlyasMoutawwakil Feb 19, 2025
6ce9e3a
test DTensor TP
IlyasMoutawwakil Feb 19, 2025
42775d2
remove gaudi1
IlyasMoutawwakil Feb 19, 2025
788e95f
test
IlyasMoutawwakil Feb 20, 2025
03b391e
style
IlyasMoutawwakil Feb 20, 2025
2247739
comment
IlyasMoutawwakil Feb 20, 2025
07ba582
pass pad_across_processes
IlyasMoutawwakil Feb 20, 2025
647dfab
require_fp16
IlyasMoutawwakil Feb 20, 2025
8e63b29
pass memory utils test
IlyasMoutawwakil Feb 20, 2025
6b1d131
test_ddp_comm_hook
IlyasMoutawwakil Feb 20, 2025
7803291
skip half precision comm hooks on hpu
IlyasMoutawwakil Feb 20, 2025
2883ca1
fix
IlyasMoutawwakil Feb 20, 2025
007d4a8
is_fp16_available
IlyasMoutawwakil Feb 20, 2025
9c12fae
fp16
IlyasMoutawwakil Feb 20, 2025
324d6df
tp as part of integration tests
IlyasMoutawwakil Feb 20, 2025
839c6be
fix
IlyasMoutawwakil Feb 20, 2025
3e548f4
write_basic_config
IlyasMoutawwakil Feb 20, 2025
f67a898
safetensors
IlyasMoutawwakil Feb 20, 2025
f449d3f
local sgd and masked_fill_fwd_i64
IlyasMoutawwakil Feb 20, 2025
79ef8a5
fix num_processes in test_load_states_by_steps
IlyasMoutawwakil Feb 20, 2025
f772b76
fp8 support
IlyasMoutawwakil Feb 24, 2025
6218cec
test
IlyasMoutawwakil Feb 24, 2025
31872f6
Merge branch 'main' into hpu-support
IlyasMoutawwakil Feb 24, 2025
610c68b
fix
IlyasMoutawwakil Feb 24, 2025
347db07
add a workflow
IlyasMoutawwakil Feb 25, 2025
5fc5a2a
Update src/accelerate/accelerator.py
IlyasMoutawwakil Feb 25, 2025
dc7a773
review comments
IlyasMoutawwakil Feb 25, 2025
9606f0d
ci
IlyasMoutawwakil Feb 25, 2025
6b77bc4
style
IlyasMoutawwakil Feb 25, 2025
d556021
comments
IlyasMoutawwakil Feb 26, 2025
e2fe2cc
test
IlyasMoutawwakil Feb 26, 2025
05e6861
habana_frameworks.torch
IlyasMoutawwakil Feb 26, 2025
ef6192c
patch device count
IlyasMoutawwakil Feb 26, 2025
59b51e5
fix
IlyasMoutawwakil Feb 26, 2025
c6731f5
fix
IlyasMoutawwakil Feb 26, 2025
66ec449
require_fp8
IlyasMoutawwakil Feb 26, 2025
28dae91
fix
IlyasMoutawwakil Feb 27, 2025
ec9c562
fix
IlyasMoutawwakil Feb 27, 2025
53f99c3
gaudi 1
IlyasMoutawwakil Feb 27, 2025
5f9928d
remove unnecessary
IlyasMoutawwakil Feb 27, 2025
ddbece5
fixed masked fill error in transformers
IlyasMoutawwakil Feb 28, 2025
72bd312
style
IlyasMoutawwakil Feb 28, 2025
506d07e
balanced_memory pass on hpu
IlyasMoutawwakil Mar 3, 2025
ae67bcc
remove for now
IlyasMoutawwakil Mar 3, 2025
405b857
run first
IlyasMoutawwakil Mar 4, 2025
27be94c
Apply suggestions from code review
IlyasMoutawwakil Mar 5, 2025
4e0e966
Merge branch 'main' into hpu-support
IlyasMoutawwakil Mar 5, 2025
e2a8d85
style after merge
IlyasMoutawwakil Mar 5, 2025
03e2646
Update src/accelerate/accelerator.py
IlyasMoutawwakil Mar 6, 2025
3ed87c1
Update src/accelerate/utils/transformer_engine.py
IlyasMoutawwakil Mar 6, 2025
2dcab3e
Merge branch 'main' into hpu-support
IlyasMoutawwakil Mar 6, 2025
55b0d3c
empty cache review comments
IlyasMoutawwakil Mar 6, 2025
bd2afc3
test_script.py error messages
IlyasMoutawwakil Mar 6, 2025
75e5b81
AccelerateTestCase for accelerator state cleanup
IlyasMoutawwakil Mar 6, 2025
e5dfad4
test
IlyasMoutawwakil Mar 7, 2025
ed84e7b
add gaudi1 workflow
IlyasMoutawwakil Mar 7, 2025
a05e54a
fp8 availability
IlyasMoutawwakil Mar 7, 2025
eb0b3a3
fix
IlyasMoutawwakil Mar 7, 2025
7b2650a
reduce batch size
IlyasMoutawwakil Mar 7, 2025
9b227d8
concurrency
IlyasMoutawwakil Mar 7, 2025
8cf20cd
check cuda as well
IlyasMoutawwakil Mar 7, 2025
7c4897b
nits and comments
IlyasMoutawwakil Mar 7, 2025
d0485f1
mark fsdp tests that require_fp16
IlyasMoutawwakil Mar 7, 2025
c37aefd
style
IlyasMoutawwakil Mar 7, 2025
bdae68d
mark deepspeed fp16 tests
IlyasMoutawwakil Mar 7, 2025
d919931
update image
IlyasMoutawwakil Mar 7, 2025
efd2a27
fix
IlyasMoutawwakil Mar 9, 2025
394b687
updated
IlyasMoutawwakil Mar 9, 2025
4f76d2c
better msgs
IlyasMoutawwakil Mar 9, 2025
b3dd375
skip pippy
IlyasMoutawwakil Mar 9, 2025
17d43ab
test
IlyasMoutawwakil Mar 9, 2025
db16287
test on 2 device
IlyasMoutawwakil Mar 9, 2025
e359c01
support up to 1% relative error in test_accelerate
IlyasMoutawwakil Mar 9, 2025
e9cfca4
skip hpu fp16
IlyasMoutawwakil Mar 9, 2025
ac41600
allow for 1 byte difference
IlyasMoutawwakil Mar 9, 2025
8571ef4
revert torch_device change
IlyasMoutawwakil Mar 9, 2025
3115ee4
style
IlyasMoutawwakil Mar 9, 2025
7c6a44a
skip memory release since it's flaky
IlyasMoutawwakil Mar 9, 2025
e8f9a48
add accelerator state cleanup to fixture
IlyasMoutawwakil Mar 9, 2025
3face36
fix
IlyasMoutawwakil Mar 9, 2025
06c1f53
atol
IlyasMoutawwakil Mar 9, 2025
75aaabd
fix
IlyasMoutawwakil Mar 9, 2025
21fca86
more rtol
IlyasMoutawwakil Mar 10, 2025
a99c297
equal grad test
IlyasMoutawwakil Mar 10, 2025
81a37be
revert
IlyasMoutawwakil Mar 10, 2025
92775af
pass pippy on gaudi2 and skip on gaudi1
IlyasMoutawwakil Mar 10, 2025
ce13eeb
enable sd 1.5 test with require fp16
IlyasMoutawwakil Mar 10, 2025
04983cc
added warning on memory release
IlyasMoutawwakil Mar 10, 2025
5efbe8c
don't log warning in memory release as it requires PartialState to be…
IlyasMoutawwakil Mar 10, 2025
4847474
Apply suggestions from code review
IlyasMoutawwakil Mar 11, 2025
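Several of the commits above ("is_hpu_available", "import habana_frameworks.torch.distributed.hccl", "patch torch hpu device count") deal with detecting Gaudi hardware before any torch.hpu API is touched. A rough sketch of what such a check typically looks like — not necessarily the exact helper merged here, and torch.hpu.is_available() is assumed to be provided by the Habana plugin:

import importlib.util


def is_hpu_available() -> bool:
    # torch.hpu only exists once the Habana plugin has been imported;
    # importing habana_frameworks.torch registers the "hpu" device with PyTorch.
    if importlib.util.find_spec("habana_frameworks") is None:
        return False
    import habana_frameworks.torch  # noqa: F401

    import torch

    return hasattr(torch, "hpu") and torch.hpu.is_available()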
77 changes: 77 additions & 0 deletions .github/workflows/gaudi1.yml
@@ -0,0 +1,77 @@
name: Gaudi1 tests (scheduled)

on:
  workflow_dispatch:
  schedule:
    - cron: "0 2 * * *"

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  run_gaudi1_tests:
    name: Test on Gaudi1
    runs-on:
      group: aws-dl1-24xlarge

    container:
      image: docker://vault.habana.ai/gaudi-docker/1.20.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
      options: --runtime=habana --shm-size=64G --cap-add=sys_nice --env HABANA_VISIBLE_DEVICES=0,1
      env:
        OMPI_MCA_btl_vader_single_copy_mechanism: none
        PT_ENABLE_INT64_SUPPORT: 1
        PT_HPU_LAZY_MODE: 0
        RUN_SLOW: 1

    steps:
      - name: HL-SMI (1)
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Extract HPU visible modules
        id: add-modules
        run: |
          export HABANA_VISIBLE_MODULES=$(hl-smi -Q module_id -f csv,noheader | tr '\n' ',' | sed 's/,$//')
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}" >> $GITHUB_ENV

      - name: HL-SMI (2)
        run: |
          hl-smi
          echo "HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES}"
          echo "HABANA_VISIBLE_MODULES=${HABANA_VISIBLE_MODULES}"

      - name: Checkout to Accelerate
        uses: actions/checkout@v4

      - name: Install Accelerate with Transformers & DeepSpeed
        run: |
          pip install -e .[testing] \
            git+https://github.com/HabanaAI/[email protected] \
            git+https://github.com/huggingface/transformers.git@hpu-support

      - name: Run CLI tests
        run: |
          make test_cli

      - name: Run Core tests
        run: |
          make test_core

      - name: Run Big Modeling tests
        run: |
          make test_big_modeling

      - name: Run FSDP integration tests
        run: |
          make test_fsdp

      - name: Run DeepSpeed integration tests
        run: |
          make test_deepspeed

      - name: Run Examples tests
        run: |
          make test_examples
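The "Extract HPU visible modules" step above derives HABANA_VISIBLE_MODULES from hl-smi. For local debugging, the same derivation could be done from Python; a hedged sketch (the helper name is made up, the hl-smi flags are the ones used in the workflow):

import os
import subprocess


def visible_hpu_modules() -> str:
    # Query the module id of every visible Gaudi card and join them into the
    # comma-separated form HABANA_VISIBLE_MODULES expects, e.g. "0,1".
    out = subprocess.run(
        ["hl-smi", "-Q", "module_id", "-f", "csv,noheader"],
        capture_output=True,
        text=True,
        check=True,
    ).stdout
    return ",".join(line.strip() for line in out.splitlines() if line.strip())


if __name__ == "__main__":
    os.environ["HABANA_VISIBLE_MODULES"] = visible_hpu_modules()
    print(os.environ["HABANA_VISIBLE_MODULES"])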
8 changes: 6 additions & 2 deletions Makefile
@@ -28,7 +28,7 @@ test_big_modeling:

test_core:
python -m pytest -s -v ./tests/ --ignore=./tests/test_examples.py --ignore=./tests/deepspeed --ignore=./tests/test_big_modeling.py \
--ignore=./tests/fsdp --ignore=./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_core.log",)
--ignore=./tests/fsdp --ignore=./tests/tp --ignore=./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_core.log",)
Member (Author): not sure TP should be part of test_core, tell me if you want me to revert this.

Member: yeah i don't think we want that cc @muellerzr

Contributor: Agreed

test_cli:
python -m pytest -s -v ./tests/test_cli.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_cli.log",)
@@ -39,6 +39,9 @@ test_deepspeed:
test_fsdp:
python -m pytest -s -v ./tests/fsdp $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_fsdp.log",)

test_tp:
python -m pytest -s -v ./tests/tp $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_tp.log",)

# Since the new version of pytest will *change* how things are collected, we need `deepspeed` to
# run after test_core and test_cli
test:
@@ -47,13 +50,14 @@ test:
$(MAKE) test_big_modeling
$(MAKE) test_deepspeed
$(MAKE) test_fsdp
$(MAKE) test_tp

test_examples:
python -m pytest -s -v ./tests/test_examples.py $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_examples.log",)

# Broken down example tests for the CI runners
test_integrations:
python -m pytest -s -v ./tests/deepspeed ./tests/fsdp $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_integrations.log",)
python -m pytest -s -v ./tests/deepspeed ./tests/fsdp ./tests/tp $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_integrations.log",)

test_example_differences:
python -m pytest -s -v ./tests/test_examples.py::ExampleDifferenceTests $(if $(IS_GITHUB_CI),--report-log "$(PYTORCH_VERSION)_example_diff.log",)
2 changes: 1 addition & 1 deletion examples/inference/distributed/stable_diffusion.py
@@ -18,7 +18,7 @@
from accelerate import PartialState # Can also be Accelerator or AcceleratorState


pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
distributed_state = PartialState()
pipe.to(distributed_state.device)

18 changes: 12 additions & 6 deletions examples/inference/pippy/bert.py
@@ -17,9 +17,15 @@
from transformers import AutoModelForMaskedLM

from accelerate import PartialState, prepare_pippy
from accelerate.test_utils import torch_device
from accelerate.utils import set_seed


if torch_device == "hpu":
synchronize_func = torch.hpu.synchronize
else:
synchronize_func = torch.cuda.synchronize

# Set the random seed to have reproducable outputs
set_seed(42)

@@ -60,25 +66,25 @@
)

# Move the inputs to the first device
input = input.to("cuda:0")
input = input.to(torch_device)

# Take an average of 5 times
# Measure first batch
torch.cuda.synchronize()
synchronize_func()
start_time = time.time()
with torch.no_grad():
output = model(input)
torch.cuda.synchronize()
synchronize_func()
end_time = time.time()
first_batch = end_time - start_time

# Now that CUDA is init, measure after
torch.cuda.synchronize()
# Now that hpu is init, measure after
synchronize_func()
start_time = time.time()
for i in range(5):
with torch.no_grad():
output = model(input)
torch.cuda.synchronize()
synchronize_func()
end_time = time.time()

# The outputs are only on the final process by default
19 changes: 13 additions & 6 deletions examples/inference/pippy/gpt2.py
@@ -17,9 +17,16 @@
from transformers import AutoModelForSequenceClassification

from accelerate import PartialState, prepare_pippy
from accelerate.test_utils import torch_device
from accelerate.utils import set_seed


if torch_device == "hpu":
synchronize_func = torch.hpu.synchronize
else:
synchronize_func = torch.cuda.synchronize


# Set the random seed to have reproducable outputs
set_seed(42)

@@ -59,25 +66,25 @@
)

# Move the inputs to the first device
input = input.to("cuda:0")
input = input.to(torch_device)

# Take an average of 5 times
# Measure first batch
torch.cuda.synchronize()
synchronize_func()
start_time = time.time()
with torch.no_grad():
output = model(input)
torch.cuda.synchronize()
synchronize_func()
end_time = time.time()
first_batch = end_time - start_time

# Now that CUDA is init, measure after
torch.cuda.synchronize()
# Now that device/backend is init, measure after
synchronize_func()
start_time = time.time()
for i in range(5):
with torch.no_grad():
output = model(input)
torch.cuda.synchronize()
synchronize_func()
end_time = time.time()

# The outputs are only on the final process by default
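Both pippy examples now pick synchronize_func from torch_device instead of hard-coding torch.cuda.synchronize. A hedged, more general variant of the same idea, looking the backend module up by device type (PartialState and the per-backend synchronize() methods are the only assumptions):

import torch
from accelerate import PartialState

state = PartialState()


def synchronize() -> None:
    # torch.cuda, torch.hpu, torch.xpu, ... all expose a synchronize();
    # fall back to a no-op for device types that don't.
    backend = getattr(torch, state.device.type, None)
    if backend is not None and hasattr(backend, "synchronize"):
        backend.synchronize()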
2 changes: 1 addition & 1 deletion setup.py
@@ -22,7 +22,7 @@
"ruff ~= 0.6.4",
]
extras["docs"] = []
extras["test_prod"] = ["pytest>=7.2.0,<=8.0.0", "pytest-xdist", "pytest-subtests", "parameterized"]
extras["test_prod"] = ["pytest>=7.2.0,<=8.0.0", "pytest-xdist", "pytest-subtests", "parameterized", "pytest-order"]
Contributor: TIL 👀

extras["test_dev"] = [
"datasets",
"diffusers",
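pytest-order is what the "add testing order" / "tag deepspeed tests that should run first" commits rely on to pin collection order. A minimal, hedged illustration of the marker it provides (the test names here are made up):

import pytest


@pytest.mark.order("first")  # runs before any unordered test in the session
def test_deepspeed_smoke():
    assert True


@pytest.mark.order(2)  # ordered tests run in ascending index after the "first" group
def test_deepspeed_training_step():
    assert True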
38 changes: 32 additions & 6 deletions src/accelerate/accelerator.py
@@ -174,7 +174,7 @@

class Accelerator:
"""
Creates an instance of an accelerator for distributed training (on multi-GPU, TPU) or mixed precision training.
Creates an instance of an accelerator for distributed training or mixed precision training.

Args:
device_placement (`bool`, *optional*, defaults to `True`):
@@ -534,9 +534,16 @@ def __init__(
and self.distributed_type not in (DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM)
):
self.native_amp = True
if self.device.type not in ("xpu", "cuda", "npu", "xla", "mlu", "musa", "sdaa") or is_torch_xla_available(
check_is_tpu=True
):
if self.device.type not in (
"xpu",
"cuda",
"npu",
"xla",
"mlu",
"musa",
"hpu",
"sdaa",
) or is_torch_xla_available(check_is_tpu=True):
raise ValueError(f"fp16 mixed precision requires a GPU (not {self.device.type!r}).")
kwargs = self.scaler_handler.to_kwargs() if self.scaler_handler is not None else {}
self.scaler = get_grad_scaler(self.distributed_type, **kwargs)
@@ -545,7 +552,7 @@ def __init__(
DistributedType.DEEPSPEED,
DistributedType.MEGATRON_LM,
):
if self.device.type in ["cpu", "xpu"]:
if self.device.type in ["cpu", "xpu", "hpu"]:
self.native_amp = True
else:
self.native_amp = is_bf16_available(True)
@@ -1202,6 +1209,7 @@ def join_uneven_inputs(self, joinables, even_batches=None):
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_XPU,
DistributedType.MULTI_HPU,
):
dl_even_batches_values = []

@@ -1437,6 +1445,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
"""
if device_placement is None:
device_placement = self.device_placement and self.distributed_type != DistributedType.FSDP

self._models.append(model)

# TODO: Look at enabling native TP training directly with a proper config
@@ -1515,12 +1524,16 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_XPU,
DistributedType.MULTI_HPU,
):
if any(p.requires_grad for p in model.parameters()):
kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
# TODO: Look at enabling native TP training directly with a proper config
if os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true":
device_ids, output_device = [self.local_process_index], self.local_process_index
if self.device.type == "hpu":
device_ids, output_device = [self.device.index], self.device.index
else:
device_ids, output_device = [self.local_process_index], self.local_process_index
else:
device_ids, output_device = None, None

@@ -1920,13 +1933,25 @@ def _prepare_deepspeed(self, *args):
if self.deepspeed_config["zero_optimization"].get("offload_optimizer", {}).get(
"device", "none"
) != "none" and self.deepspeed_config.get("zero_force_ds_cpu_optimizer", True):
if self.device.type == "hpu" and os.environ.get("PT_HPU_LAZY_MODE", "1") == "1":
raise ValueError(
"You can't use an Offload Optimizer with HPU in Lazy Mode. "
"Please set the environment variable `PT_HPU_LAZY_MODE` to `0`."
)

optimizer = map_pytorch_optim_to_deepspeed(optimizer)
kwargs["optimizer"] = optimizer
if scheduler is not None:
if type(scheduler).__name__ in deepspeed.runtime.lr_schedules.VALID_LR_SCHEDULES:
kwargs["lr_scheduler"] = scheduler

if self.device.type == "hpu":
# This env variable is initialized here to make sure it is set to "true"
# It should be done by the launcher but it does not work for multi-node runs
os.environ["DEEPSPEED_USE_HPU"] = "true"

engine, optimizer, _, lr_scheduler = ds_initialize(**kwargs)

if compare_versions("deepspeed", ">=", "0.14.4") and self.state.dynamo_plugin.backend != DynamoBackend.NO:
compile_kwargs = self.state.dynamo_plugin.to_kwargs()
engine.compile(backend=compile_kwargs.pop("backend"), compile_kwargs=compile_kwargs)
@@ -3318,6 +3343,7 @@ def _inner(folder):
DistributedType.MULTI_SDAA,
DistributedType.MULTI_MUSA,
DistributedType.MULTI_NPU,
DistributedType.MULTI_HPU,
):
map_location = "on_device"
else:
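From a user's perspective, the new branches above (native bf16 autocast on hpu, HPU-specific DDP device ids, and the lazy-mode guard in _prepare_deepspeed) are opted into roughly like this; a hedged sketch that falls back to CPU/CUDA when no Gaudi card is present:

import os

# Eager mode; the _prepare_deepspeed guard above rejects an offloaded optimizer
# while PT_HPU_LAZY_MODE=1 (set this before habana_frameworks.torch is imported).
os.environ.setdefault("PT_HPU_LAZY_MODE", "0")

import torch
from accelerate import Accelerator

accelerator = Accelerator(mixed_precision="bf16")  # native bf16 autocast path for cpu/xpu/hpu
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
model, optimizer = accelerator.prepare(model, optimizer)
print(accelerator.device)  # hpu:0 on Gaudi, otherwise cuda:0 / cpu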
6 changes: 5 additions & 1 deletion src/accelerate/checkpointing.py
@@ -32,6 +32,8 @@
SCHEDULER_NAME,
WEIGHTS_NAME,
get_pretty_name,
is_cuda_available,
is_hpu_available,
is_mlu_available,
is_musa_available,
is_sdaa_available,
@@ -158,7 +160,9 @@ def save_accelerator_state(
states["torch_sdaa_manual_seed"] = torch.sdaa.get_rng_state_all()
elif is_musa_available():
states["torch_musa_manual_seed"] = torch.musa.get_rng_state_all()
else:
if is_hpu_available():
states["torch_hpu_manual_seed"] = torch.hpu.get_rng_state_all()
if is_cuda_available():
states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
if is_torch_xla_available():
states["xm_seed"] = xm.get_rng_state()
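The change above turns the tail of the elif chain into independent checks, so HPU, CUDA and XLA RNG states can all be captured in a single save. A hedged sketch of the resulting shape (restoring with matching set_rng_state_all calls is an assumption based on the CUDA API):

import torch
from accelerate.utils import is_cuda_available, is_hpu_available

states = {}
if is_hpu_available():
    states["torch_hpu_manual_seed"] = torch.hpu.get_rng_state_all()
if is_cuda_available():
    states["torch_cuda_manual_seed"] = torch.cuda.get_rng_state_all()
# reload later with the corresponding torch.hpu / torch.cuda set_rng_state_all(states[...])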
17 changes: 16 additions & 1 deletion src/accelerate/commands/config/default.py
@@ -18,7 +18,14 @@

import torch

from ...utils import is_mlu_available, is_musa_available, is_npu_available, is_sdaa_available, is_xpu_available
from ...utils import (
is_hpu_available,
is_mlu_available,
is_musa_available,
is_npu_available,
is_sdaa_available,
is_xpu_available,
)
from .config_args import ClusterConfig, default_json_config_file
from .config_utils import SubcommandHelpFormatter

@@ -81,6 +88,14 @@ def write_basic_config(mixed_precision="no", save_location: str = default_json_c
config["distributed_type"] = "MULTI_MUSA"
else:
config["distributed_type"] = "NO"
elif is_hpu_available():
num_hpus = torch.hpu.device_count()
config["num_processes"] = num_hpus
config["use_cpu"] = False
if num_hpus > 1:
config["distributed_type"] = "MULTI_HPU"
else:
config["distributed_type"] = "NO"
elif torch.cuda.is_available():
num_gpus = torch.cuda.device_count()
config["num_processes"] = num_gpus
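The same default config can be produced from the CLI with "accelerate config default" or programmatically; a hedged usage sketch, assuming write_basic_config stays re-exported under accelerate.utils:

from accelerate.utils import write_basic_config

# On a multi-card Gaudi host the new branch above should write
# distributed_type: MULTI_HPU with num_processes = torch.hpu.device_count();
# elsewhere it falls through to the CUDA/NPU/... branches as before.
write_basic_config(mixed_precision="no")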