Fix FSDP2 and distributed checkpointing imports for older PyTorch versions (huggingface#46141)

ryota-komatsu · 3outeille · web-flow · commit 634500b9a9ce · 2026-05-26T16:41:36.000Z
* Fix PyTorch requirement for FSDP2 to >=2.6

* Fix PyTorch requirement for distributed checkpoint saving to >=2.7

---------

Co-authored-by: Ferdinand Mom <47445085+3outeille@users.noreply.github.com>
diff --git a/src/transformers/distributed/fsdp.py b/src/transformers/distributed/fsdp.py
@@ -28,7 +28,7 @@
 if is_torch_available():
     import torch
 
-if is_torch_available() and is_torch_greater_or_equal("2.5"):
+if is_torch_available() and is_torch_greater_or_equal("2.6"):
     import torch.distributed as dist
     from torch.distributed._composable.fsdp import fully_shard
     from torch.distributed.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy, OffloadPolicy
@@ -91,8 +91,8 @@ def initialize_fsdp(
     if fsdp_plan is None:
         return device_map, device_mesh, None
 
-    if not is_torch_greater_or_equal("2.5"):
-        raise OSError("FSDP2 is only supported for `torch>=2.5`.")
+    if not is_torch_greater_or_equal("2.6"):
+        raise OSError("FSDP2 is only supported for `torch>=2.6`.")
 
     if device_mesh is None:
         # Detect the accelerator on the machine
@@ -338,8 +338,8 @@ def apply_fully_shard_data_parallel(
     if not is_torch_available():
         raise ImportError("PyTorch is required for FSDP support")
 
-    if not is_torch_greater_or_equal("2.5"):
-        raise OSError("FSDP2 requires torch>=2.5")
+    if not is_torch_greater_or_equal("2.6"):
+        raise OSError("FSDP2 requires torch>=2.6")
 
     if fsdp_plan is None:
         fsdp_plan = {}
diff --git a/src/transformers/distributed/utils.py b/src/transformers/distributed/utils.py
@@ -39,14 +39,16 @@
 if is_torch_available():
     import torch
     import torch.distributed.checkpoint as dcp
-    from torch.distributed.checkpoint.hf_storage import HuggingFaceStorageWriter
     from torch.distributed.checkpoint.state_dict import (
         get_model_state_dict,
         get_optimizer_state_dict,
         set_optimizer_state_dict,
     )
     from torch.distributed.tensor import DTensor
 
+    if is_torch_greater_or_equal("2.7"):
+        from torch.distributed.checkpoint.hf_storage import HuggingFaceStorageWriter
+
 
 def _ensure_torch_distributed(device_type: str):
     """Initialize torch.distributed if not already initialized."""
@@ -103,6 +105,9 @@ def init_device_mesh(distributed_config: DistributedConfig) -> torch.distributed
     if not is_torch_greater_or_equal("2.5"):
         raise OSError("Distributed training with DistributedConfig requires `torch>=2.5`.")
 
+    if distributed_config.fsdp_size > 1 and not is_torch_greater_or_equal("2.6"):
+        raise OSError("FSDP2 requires `torch>=2.6`.")
+
     device_type = torch._C._get_accelerator().type
     _ensure_torch_distributed(device_type)
 
@@ -205,6 +210,9 @@ def save_model_checkpoint_distributed(model, checkpoint_dir: str) -> None:
     gate||up MoE weights) are replicated to a full tensor on every rank
     before the save, otherwise DCP cannot encode that placement.
     """
+    if not is_torch_greater_or_equal("2.7"):
+        raise OSError("Distributed checkpoint saving requires `torch>=2.7`.")
+
     state_dict = get_model_state_dict(model)
     for key, value in list(state_dict.items()):
         if (