
Commit f47f633

Log grad norm aggregated over all ranks, not just rank zero (#2248)

1 parent 262122b · commit f47f633

6 files changed: +6 −6 lines changed
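All six recipes get the same one-line change: the norm returned by torch.nn.utils.clip_grad_norm_ is converted with .full_tensor() before it is logged. With FSDP2-style sharding the model parameters are DTensors, so the returned norm is itself a DTensor whose local value reflects only that rank's shard; .full_tensor() materializes the norm aggregated over all ranks, which is what the commit title asks to log. A minimal sketch of the pattern, not the recipes' actual code (the helper name and the DTensor import path are illustrative and may differ across PyTorch versions):

import torch
from torch.distributed.tensor import DTensor  # public path in recent PyTorch releases

def clip_and_log_grad_norm(model: torch.nn.Module, max_norm: float) -> float:
    # Clip gradients; with DTensor (sharded) parameters the returned norm is
    # itself a DTensor rather than a plain tensor.
    grad_norm = torch.nn.utils.clip_grad_norm_(
        model.parameters(),
        max_norm=float(max_norm),
    )
    if isinstance(grad_norm, DTensor):
        # Without this, rank zero would log only its local view of the norm;
        # full_tensor() gathers the sharded value into the global grad norm.
        grad_norm = grad_norm.full_tensor()
    return grad_norm.item()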

recipes/dev/early_exit_finetune_distributed.py

Lines changed: 1 addition & 1 deletion
@@ -951,7 +951,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)

recipes/full_finetune_distributed.py

Lines changed: 1 addition & 1 deletion
@@ -786,7 +786,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)

recipes/lora_finetune_distributed.py

Lines changed: 1 addition & 1 deletion
@@ -828,7 +828,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)
 self._lr_scheduler.step()

recipes/lora_finetune_distributed_multi_dataset.py

Lines changed: 1 addition & 1 deletion
@@ -857,7 +857,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)
 self._lr_scheduler.step()

recipes/qat_distributed.py

Lines changed: 1 addition & 1 deletion
@@ -857,7 +857,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)

recipes/qat_lora_finetune_distributed.py

Lines changed: 1 addition & 1 deletion
@@ -872,7 +872,7 @@ def train(self) -> None:
 grad_norm = torch.nn.utils.clip_grad_norm_(
     self._model.parameters(),
     max_norm=float(self._clip_grad_norm),
-)
+).full_tensor()
 self._optimizer.step()
 self._optimizer.zero_grad(set_to_none=True)
 self._lr_scheduler.step()
