
Commit 422f307

rahulsingh-intel authored and pytorchmergebot committed

General Changes for multi accelerators

1 parent b30bad7, commit 422f307

17 files changed: +743 -601 lines
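The common thread across the files below is replacing hard-coded CUDA calls with a device type resolved once at import time. Here is a minimal sketch of that pattern, assuming a hypothetical pick_device() helper; the actual tests use get_devtype() from torch.testing._internal.common_fsdp, whose exact fallback order may differ.

# Sketch only: pick_device() is a hypothetical stand-in for get_devtype().
import torch

def pick_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    # HPU builds register a torch.hpu module; guard with hasattr for portability.
    if hasattr(torch, "hpu") and torch.hpu.is_available():
        return "hpu"
    return "cpu"

device_type = torch.device(pick_device())

# Device-neutral tensor placement replaces the old hard-coded .cuda() calls:
x = torch.rand(16, 8).to(device_type)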

test/distributed/fsdp/test_fsdp_comm_hooks.py

Lines changed: 24 additions & 18 deletions

@@ -12,35 +12,42 @@
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, MixedPrecision
 from torch.distributed.fsdp.fully_sharded_data_parallel import ShardingStrategy
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
+from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_distributed import (
     requires_nccl,
     requires_nccl_version,
     skip_but_pass_in_sandcastle_if,
     skip_if_lt_x_gpu,
 )
-from torch.testing._internal.common_fsdp import FSDPTest
+from torch.testing._internal.common_fsdp import FSDPTest, get_devtype
 from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TEST_CUDA,
+    TEST_HPU,
 )
 
 
+device_type = torch.device(get_devtype())
+
+
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
 
 # bfloat16 is only supported by CUDA 11+
-BFLOAT16_AVAILABLE = torch.cuda.is_available() and (
-    torch.version.cuda is not None or torch.version.hip is not None
+BFLOAT16_AVAILABLE = (
+    True
+    if (TEST_CUDA and (torch.version.cuda is not None or torch.version.hip is not None))
+    or TEST_HPU
+    else False
 )
 
 
 class Net(nn.Module):
     def __init__(self, has_wrapping, sharding_strategy, mixed_precision=None):
         # to ensure determinism
         torch.manual_seed(0)
-        torch.cuda.manual_seed(0)
         super().__init__()
 
         if has_wrapping:
@@ -50,12 +57,12 @@ def __init__(self, has_wrapping, sharding_strategy, mixed_precision=None):
                     nn.ReLU(),
                     FSDP(
                         nn.Linear(16, 8),
-                        device_id=torch.cuda.current_device(),
+                        device_id=device_type,
                         sharding_strategy=sharding_strategy,
                         mixed_precision=mixed_precision,
                     ),
                 ),
-                device_id=torch.cuda.current_device(),
+                device_id=device_type,
                 sharding_strategy=sharding_strategy,
                 mixed_precision=mixed_precision,
             )
@@ -134,13 +141,13 @@ def test_default_communication_hook_behavior(
         """
         out_dim = self.world_size
         net = torch.nn.Linear(1, out_dim, bias=False)
-        inpt = torch.tensor([self.rank]).float().cuda(self.rank)
+        inpt = torch.tensor([self.rank]).float().to(device_type.type)
 
         net_default_hook = FSDP(
             net,
-            device_id=torch.cuda.current_device(),
+            device_id=device_type,
             sharding_strategy=sharding_strategy,
-        ).to(self.rank)
+        ).to(device_type.type)
 
         # Check that by default, `_comm_hook` is None
         for entry in FSDP.fsdp_modules(net_default_hook):
@@ -172,13 +179,12 @@ def _get_submodules(self, fsdp_net):
         ]
 
     def _init_model(self, core, sharding_strategy, mixed_precision=None):
-        device = torch.device("cuda")
         return FSDP(
             core,
-            device_id=torch.cuda.current_device(),
+            device_id=device_type,
             sharding_strategy=sharding_strategy,
             mixed_precision=mixed_precision,
-        ).to(device)
+        ).to(device_type)
 
     @skip_if_lt_x_gpu(2)
     @parametrize("has_wrapping", [True, False])
@@ -277,9 +283,10 @@ def test_registering_hook_hybrid_strategy(self):
             ShardingStrategy.HYBRID_SHARD,
             ShardingStrategy._HYBRID_SHARD_ZERO2,
         ):
-            model = Net(False, None, None).cuda()
+            model = Net(False, None, None).to(device_type)
             fsdp_model = FSDP(
                 model,
+                device_id=device_type,
                 auto_wrap_policy=ModuleWrapPolicy({nn.Linear}),
                 sharding_strategy=sharding_strategy,
             )
@@ -337,7 +344,6 @@ def _check_low_precision_hook(
     ):
         # keep everything deterministic for input data
         torch.manual_seed(0)
-        torch.cuda.manual_seed(0)
 
         fsdp_with_hook = self._init_model(
             Net(has_wrapping=has_wrapping, sharding_strategy=sharding_strategy),
@@ -359,7 +365,7 @@ def _check_low_precision_hook(
         optim_hook = torch.optim.SGD(fsdp_with_hook.parameters(), lr=0.1)
         optim_mp = torch.optim.SGD(fsdp_with_mp.parameters(), lr=0.1)
 
-        in_data = torch.rand(16, 8).cuda()
+        in_data = torch.rand(16, 8).to(device_type)
         fsdp_with_hook.train()
         fsdp_with_mp.train()
         loss_hook = fsdp_with_hook(in_data).sum()
@@ -426,7 +432,7 @@ def test_bf16_hook(
         )
 
 
-instantiate_parametrized_tests(TestCommunicationHooks)
-
+devices = ("cuda", "hpu")
+instantiate_device_type_tests(TestCommunicationHooks, globals(), only_for=devices)
 if __name__ == "__main__":
     run_tests()
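The closing lines above swap instantiate_parametrized_tests for instantiate_device_type_tests so the suite is generated per backend. A minimal sketch of how that helper is used, with a toy TestExample class standing in for TestCommunicationHooks:

# Sketch only: TestExample is a toy class for illustration.
import torch
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests, TestCase

class TestExample(TestCase):
    def test_add(self, device):
        # `device` is injected per backend ("cuda", "hpu", ...) by the instantiation below.
        x = torch.ones(4, device=device)
        self.assertEqual((x + x).sum().item(), 8.0)

# Creates per-backend copies (e.g. TestExampleCUDA, TestExampleHPU) for whichever
# of the listed backends is actually available in this build.
devices = ("cuda", "hpu")
instantiate_device_type_tests(TestExample, globals(), only_for=devices)

if __name__ == "__main__":
    run_tests()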

test/distributed/fsdp/test_fsdp_flatten_params.py

Lines changed: 15 additions & 13 deletions

@@ -11,16 +11,18 @@
     FlatParamShardMetadata,
     HandleShardingStrategy,
 )
+from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import FSDPTest
+from torch.testing._internal.common_fsdp import FSDPTest, get_devtype
 from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
     parametrize,
     run_tests,
     TEST_WITH_DEV_DBG_ASAN,
 )
 
 
+device_type = torch.device(get_devtype())
+
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
@@ -45,7 +47,7 @@ def world_size(self) -> int:
 
     def _get_default_config(self):
         return {
-            "device": torch.device("cuda"),
+            "device": torch.device(device_type),
             "sharding_strategy": HandleShardingStrategy.FULL_SHARD,
             "offload_params": False,
             "mp_param_dtype": None,
@@ -103,8 +105,8 @@ def _test_partial_flattening(self, half: bool):
         params_to_flatten = encoder_1_params + decoder_0_params
         num_params = [len(encoder_1_params), len(decoder_0_params)]
         numel_to_flatten = sum(p.numel() for p in params_to_flatten)
-        module.encoder.layers[1] = FSDP(module.encoder.layers[1])
-        module.decoder.layers[0] = FSDP(module.decoder.layers[0])
+        module.encoder.layers[1] = FSDP(module.encoder.layers[1], device_id=device_type)
+        module.decoder.layers[0] = FSDP(module.decoder.layers[0], device_id=device_type)
         flat_params = [
             module.encoder.layers[1]._flat_param,
             module.decoder.layers[0]._flat_param,
@@ -173,7 +175,7 @@ def test_empty_module(self):
         module = self._get_empty_module()
         in_data = torch.rand(1)
         ref_out = module(in_data)
-        fsdp_module = FSDP(module)
+        fsdp_module = FSDP(module, device_id=device_type)
         self.assertEqual(len(list(fsdp_module.parameters())), 0)
         self.assertIsNone(fsdp_module._flat_param)
         fsdp_out = fsdp_module(in_data)
@@ -270,9 +272,9 @@ def _test_output_with_shared_params(self, half: bool):
         self._test_output(module)
 
     def _test_output(self, module: nn.Module):
-        module = module.to(self.rank)
+        module = module.to(device_type)
         ref_output = self._get_output(module)
-        fsdp_module = FSDP(module)
+        fsdp_module = FSDP(module, device_id=device_type)
         fsdp_output = self._get_output(fsdp_module)
         self.assertEqual(ref_output, fsdp_output)
 
@@ -295,14 +297,14 @@ def test_pnorm_after_step_with_shared_params(self):
         )
 
     def _test_pnorm_after_step_with_shared_params(self, half: bool):
-        module = self._get_shared_params_transformer().to(self.rank)
+        module = self._get_shared_params_transformer().to(device_type)
         if half:
             module = module.half()
         ref_pnorm_after_step = self._get_pnorm_after_step(module)
-        module = self._get_shared_params_transformer().to(self.rank)  # recreate
+        module = self._get_shared_params_transformer().to(device_type)  # recreate
         if half:
             module = module.half()
-        fsdp_module = FSDP(module)
+        fsdp_module = FSDP(module, device_id=device_type)
         fsdp_pnorm_after_step = self._get_pnorm_after_step(fsdp_module)
         self.assertEqual(ref_pnorm_after_step, fsdp_pnorm_after_step)
 
@@ -648,7 +650,7 @@ def test_flat_param_shard_metadata_with_memory_format(self, memory_format):
         )
 
 
-instantiate_parametrized_tests(TestFlattenParams)
-
+devices = ("cuda", "hpu")
+instantiate_device_type_tests(TestFlattenParams, globals(), only_for=devices)
 if __name__ == "__main__":
     run_tests()
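The recurring edit in this file adds device_id=device_type to each FSDP(...) call so parameter materialization and sharding land on the selected accelerator rather than on an implicit torch.cuda.current_device(). A single-process sketch of that call pattern, assuming a CUDA device and a one-rank NCCL process group purely for illustration:

# Sketch only: a one-rank process group stands in for the multi-rank test setup.
import os
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def main() -> None:
    if not (dist.is_available() and torch.cuda.is_available()):
        print("Needs torch.distributed and a CUDA device; skipping sketch.")
        return
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=0, world_size=1)

    device_type = torch.device("cuda")  # stand-in for torch.device(get_devtype())
    # device_id tells FSDP where to move and shard the CPU-constructed module,
    # instead of assuming torch.cuda.current_device().
    model = FSDP(nn.Linear(8, 8), device_id=device_type)
    loss = model(torch.rand(4, 8, device=device_type)).sum()
    loss.backward()

    dist.destroy_process_group()

if __name__ == "__main__":
    main()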

test/distributed/fsdp/test_fsdp_freezing_weights.py

Lines changed: 13 additions & 10 deletions

@@ -10,16 +10,18 @@
 from torch import distributed as dist
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.nn.parallel import DistributedDataParallel
+from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import FSDPTest, get_full_params
+from torch.testing._internal.common_fsdp import FSDPTest, get_devtype, get_full_params
 from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
     parametrize,
     run_tests,
     TEST_WITH_DEV_DBG_ASAN,
 )
 
 
+device_type = torch.device(get_devtype())
+
 if not dist.is_available():
     print("Distributed not available, skipping tests", file=sys.stderr)
     sys.exit(0)
@@ -47,7 +49,6 @@ def __init__(
             nn.AdaptiveAvgPool2d(output_size=(1, 1)),
             nn.Flatten(),
         )
-        self.device = torch.cuda.current_device()
         self.head = nn.Linear(64, 10)
         if with_fsdp and freeze_after_wrap_fsdp:
             self.fsdp_wrap(fsdp_kwargs)
@@ -56,6 +57,7 @@ def __init__(
         )
 
     def fsdp_wrap(self, fsdp_kwargs):
+        fsdp_kwargs = {"device_id": device_type}
         self.trunk = FSDP(self.trunk, **fsdp_kwargs)
         self.head = FSDP(self.head, **fsdp_kwargs)
 
@@ -90,6 +92,7 @@ def __init__(
         )
 
     def fsdp_wrap(self, fsdp_kwargs):
+        fsdp_kwargs = {"device_id": device_type}
         for name, child in self.trunk.named_children():
             wrapped_child = FSDP(child, **fsdp_kwargs)
             setattr(self.trunk, name, wrapped_child)
@@ -145,15 +148,15 @@ def _dist_train(
         forward_prefetch,
     ):
         torch.manual_seed(0)
-        batch = torch.randn(size=(2, 3, 224, 224)).cuda()
+        batch = torch.randn(size=(2, 3, 224, 224)).to(device_type)
 
         fsdp_kwargs = {
-            "device_id": self.rank,
             "forward_prefetch": forward_prefetch,
+            "device_id": device_type,
         }
 
         ddp_kwargs = {
-            "device_ids": [self.rank],
+            "device_ids": [device_type],
             "find_unused_parameters": True if disable_autograd else False,
         }
 
@@ -164,7 +167,7 @@ def _dist_train(
             disable_autograd,
             fsdp_kwargs,
         )
-        model = model.cuda()
+        model = model.to(device_type)
 
         # freezing the trunk using requires_grad.
         if freezing_method == FreezingMethod.RequiresGrad:
@@ -178,7 +181,7 @@ def _dist_train(
         else:
            model = DistributedDataParallel(model, **ddp_kwargs)
 
-        target = torch.tensor([0, 1], dtype=torch.long).cuda()
+        target = torch.tensor([0, 1], dtype=torch.long).to(device_type)
         criterion = nn.CrossEntropyLoss()
         optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
 
@@ -245,7 +248,7 @@ def test_freezing_weights(
             self.assertEqual(ddp_param.requires_grad, fsdp_param.requires_grad)
 
 
-instantiate_parametrized_tests(TestFreezingWeights)
-
+devices = ("cuda", "hpu")
+instantiate_device_type_tests(TestFreezingWeights, globals(), only_for=devices)
 if __name__ == "__main__":
     run_tests()
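For context on what this file exercises, here is a minimal sketch of the requires_grad freezing method that TestFreezingWeights compares between DDP and FSDP; the ConvNet below is a toy stand-in for the test's trunk/head model, not its actual network.

# Sketch only: freeze the trunk via requires_grad and train just the head.
import torch
import torch.nn as nn

class ConvNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.trunk = nn.Sequential(
            nn.Conv2d(3, 8, 3), nn.ReLU(), nn.AdaptiveAvgPool2d(1), nn.Flatten()
        )
        self.head = nn.Linear(8, 10)

    def forward(self, x):
        return self.head(self.trunk(x))

model = ConvNet()
# Freeze the trunk: its parameters stop receiving gradients, only the head trains.
for p in model.trunk.parameters():
    p.requires_grad = False

opt = torch.optim.SGD(
    (p for p in model.parameters() if p.requires_grad), lr=0.1, momentum=0.9
)
loss = nn.CrossEntropyLoss()(model(torch.randn(2, 3, 32, 32)), torch.tensor([0, 1]))
loss.backward()
opt.step()
assert all(p.grad is None for p in model.trunk.parameters())  # trunk stayed frozen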
