refactor the total norm computation in grad clipping in APS (#3243)

Gavin Zhang · facebook-github-bot · commit 1305e2fdd97e · 2025-07-30T18:23:36.000-07:00
Summary: Pull Request resolved: #3243 Refactored the previous code for applying gradient clipping across DDP and FSDP parameters. Added a new function _compute_total_norm() that takes in the FSDP and DDP params provided in the GradientClippingOptimizer class and computes the total gradient norm of the given parameters. Differential Revision: D79128843
diff --git a/torchrec/optim/clipping.py b/torchrec/optim/clipping.py
@@ -135,98 +135,121 @@ def step(self, closure: Any = None) -> None:
         super().step(closure)
         self._step_num += 1
 
-    @torch.no_grad()
     def clip_grad_norm_(self) -> Optional[Union[float, torch.Tensor]]:
         """Clip the gradient norm of all parameters."""
         max_norm = self._max_gradient
         norm_type = float(self._norm_type)
         all_grads = []
         total_grad_norm = None
 
+        sharded_params = self._sharded_params
+        ddp_params = self._replicate_params
+
         # Process distributed parameters and gradients
-        for pgs, dist_params in self._sharded_params.items():
-            sharded_grads = [
-                p.grad._local_tensor if isinstance(p.grad, DTensor) else p.grad
-                for p in dist_params
-                if p.grad is not None and p.grad.numel() > 0
-            ]
-            if len(sharded_grads) == 0:
-                continue
+        for dist_params in sharded_params.values():
+            sharded_grads = get_grads(dist_params)
             all_grads.extend(sharded_grads)
 
-            sharded_grad_norm = _batch_cal_norm(
-                sharded_grads,
-                max_norm,
-                norm_type,
-                pgs,
-            )
-            total_grad_norm = (
-                sharded_grad_norm
-                if total_grad_norm is None
-                else (
-                    torch.maximum(total_grad_norm, sharded_grad_norm)
-                    if norm_type == torch.inf
-                    else total_grad_norm + sharded_grad_norm
-                )
-            )
-
-        square_sharded_grad_norm = total_grad_norm if total_grad_norm is not None else 0
-
         # Process replicated parameters and gradients
-        if self._replicate_params:
-            replicated_grads = [
-                p.grad._local_tensor if isinstance(p.grad, DTensor) else p.grad
-                for p in self._replicate_params
-                if p.grad is not None and p.grad.numel() > 0
-            ]
-            all_grads.extend(replicated_grads)
-
-            replicated_grad_norm = _batch_cal_norm(
-                replicated_grads,
-                max_norm,
-                norm_type,
-                None,
-            )
-            total_grad_norm = (
-                replicated_grad_norm
-                if total_grad_norm is None
-                else (
-                    torch.maximum(total_grad_norm, replicated_grad_norm)
-                    if norm_type == torch.inf
-                    else total_grad_norm + replicated_grad_norm
-                )
-            )
-            square_replicated_grad_norm = replicated_grad_norm
-        else:
-            square_replicated_grad_norm = 0
-
-        global log_grad_norm
-        if log_grad_norm:
-            if total_grad_norm is not None and norm_type != torch.inf:
-                # pyre-ignore[58]
-                grad_norm = total_grad_norm ** (1.0 / norm_type)
-            else:
-                grad_norm = total_grad_norm
+        if ddp_params:
+            ddp_grads = get_grads(ddp_params)
+            all_grads.extend(ddp_grads)
 
-            rank = dist.get_rank()
-            logger.info(
-                f"Clipping [rank={rank}, step={self._step_num}]: square_sharded_grad_norm = {square_sharded_grad_norm}, square_replicated_grad_norm = {square_replicated_grad_norm}, total_grad_norm = {grad_norm}"
-            )
-
-        # Aggregation
-        if total_grad_norm is None:
-            return
+        total_grad_norm = _compute_total_norm(
+            ddp_params, sharded_params, norm_type, max_norm
+        )
 
-        if norm_type != torch.inf:
-            # pyre-ignore [58]: ** is not supported for operand types torch._tensor.Tensor and float.
-            total_grad_norm = total_grad_norm ** (1.0 / norm_type)
         # pyre-ignore [58]: / is not supported for operand types float and Union[float, torch._tensor.Tensor].
         clip_coef = cast(torch.Tensor, max_norm / (total_grad_norm + 1e-6))
         clip_coef_clamped = torch.clamp(clip_coef, max=1.0)
         torch._foreach_mul_(all_grads, clip_coef_clamped)
         return total_grad_norm
 
 
+def get_grads(
+    param_list: List[torch.Tensor],
+) -> List[torch.Tensor]:
+    """Get the gradients of a list of parameters. Converts DTensors to local tensors if needed."""
+    grads = [
+        p.grad._local_tensor if isinstance(p.grad, DTensor) else p.grad
+        for p in param_list
+        if p.grad is not None and p.grad.numel() > 0
+    ]
+    return grads
+
+
+def _compute_total_norm(
+    ddp_params: Optional[List[torch.Tensor]] = None,
+    sharded_params: Optional[Dict[Tuple[dist.ProcessGroup], List[torch.Tensor]]] = None,
+    norm_type: float = 2.0,  # can be a normal float, or torch.inf
+    max_grad_norm: float = 1.0,
+) -> torch.Tensor:
+    """
+    Given both ddp params and sharded params, compute the total norm of the gradients of the full ddp params and the
+    full sharded param (parameters with a process group).
+
+    Args:
+        ddp_params (List[torch.Tensor]): list of ddp params
+        sharded_params (Dict[Tuple[dist.ProcessGroup], List[torch.Tensor]]): dict that maps each process group to a list of tensors
+        norm_type (float): type of the used p-norm. Can be ``torch.inf`` for infinity norm.
+        max_grad_norm (float): maximum gradient norm; passed through unchanged to ``_batch_cal_norm``.
+    Returns: the total gradient norm as a tensor (the 1/p root is already applied unless norm_type is inf).
+    """
+
+    ## compute |W|^p corresponding to all DDP params W
+
+    if ddp_params is None:
+        ddp_params = []
+    if sharded_params is None:
+        sharded_params = defaultdict(list)
+
+    def get_grad_norm_power(
+        param_list: List[torch.Tensor],
+        norm_type: float,
+        max_grad_norm: float,
+        pgs: Optional[Tuple[dist.ProcessGroup]] = None,
+    ) -> torch.Tensor:
+        """
+        Given a list of parameters, convert them to local tensors if they are DTensors,
+        and compute the squared (or p-th power) norm of the gradients of the parameters.
+        """
+        grad_list = get_grads(param_list)
+        return _batch_cal_norm(grad_list, max_grad_norm, norm_type, pgs)
+
+    ## compute the norm |W|^p corresponding to all sharded params W
+    sharded_grad_norm: torch.Tensor = torch.tensor(0.0)
+    if sharded_params:
+        combine_sharded_norm_operator = (
+            torch.maximum if norm_type == torch.inf else torch.add
+        )
+
+        # We need to move sharded_grad_norm to the same device as the first shard so that we can do addition (or take max)
+        # this is specifically for the case where sharded_grad_norm is 0, and ddp_grad_norm is not,
+        # because by default torch.tensor(0.0) is on cpu, and ddp_grad_norm is on GPU. For MTIA
+        # specifically, adding a tensor on cpu and a tensor on GPU will result in an error.
+        for pgs, dist_params in sharded_params.items():
+            shard_norm = get_grad_norm_power(dist_params, norm_type, max_grad_norm, pgs)
+            sharded_grad_norm = combine_sharded_norm_operator(
+                sharded_grad_norm.to(shard_norm.device), shard_norm
+            )
+
+    # Similar to the case above, we move ddp_grad_norm to the same device as sharded_grad_norm so that we can do addition.
+    ddp_grad_norm: torch.Tensor = (
+        get_grad_norm_power(ddp_params, norm_type, max_grad_norm)
+        if ddp_params
+        else torch.tensor(0.0)
+    ).to(sharded_grad_norm.device)
+
+    combine_norm_operator = (
+        torch.maximum
+        if norm_type == torch.inf
+        else lambda a, b: torch.add(a, b).pow(1.0 / norm_type)
+    )
+
+    total_grad_norm = combine_norm_operator(ddp_grad_norm, sharded_grad_norm)
+    return total_grad_norm
+
+
 def _batch_cal_norm(
     grad_list: List[torch.Tensor],
     max_norm: float,