
Commit 5ceb424

introduce intermediate scaling for numerical stability
Signed-off-by: Oleg S <[email protected]>
1 parent da836b0 commit 5ceb424

File tree: 1 file changed, +25 / -1 lines


src/instructlab/training/main_ds.py

Lines changed: 25 additions & 1 deletion
@@ -359,6 +359,8 @@ def train(
     tokenizer: PreTrainedTokenizer,
     train_loader: DataLoader,
     grad_accum,
+    metric_logger,
+    packing_max_batch_len: int,
 ):
     model.train()

@@ -442,6 +444,22 @@ def train(
         # loss = (
         #     loss / num_loss_counted_tokens * world_size
         # )  # dividing by the total number of non-padding tokens and multiplying by the number of GPUs so when accelerate averages by world_size, it will be the correct loss.
+
+        # XXX(osilkin): Since we are accumulating gradients and sharding across cards, we need to
+        # ensure that the loss is correctly scaled across the entire mini-batch.
+        #
+        # To achieve this, we do the following:
+        #   1. Sum up the loss across nodes in a step, then scale it by the inverse of
+        #      the maximum possible number of loss tokens we might see (N).
+        #   2. Since the scaling factor is a constant, the final gradients look like 1/N * g1 + 1/N * g2 + ... + 1/N * gn;
+        #      this means that for a given parameter, we can factor out the scalar: 1/N * (g1 + g2 + ... + gn).
+        #   3. At the end of the batch, we have counted the true number of loss tokens seen, T, so we create the
+        #      adjusted scalar C = N/T as a single number.
+        #   4. Multiply the final gradient by this scalar so everything cancels out correctly:
+        #      (N/T) * 1/N * (g1 + g2 + ... + gn) = 1/T * (g1 + g2 + ... + gn)
+        loss = (
+            loss / packing_max_batch_len
+        )  # dividing by the max possible number of loss tokens (N); the accumulated gradient is rescaled by N/T below, so the effective divisor is the true token count T.
         base_logger.info(
             f"Epoch: {epoch}, Step: {global_step}, Rank: {torch.distributed.get_rank()}, loss = {loss}"
         )
@@ -454,10 +472,14 @@ def train(
         # to numeric instability
         #
         # Also I'm not sure if this will work with FSDP Full Shard
+        # XXX(osilkin): not sure how this will work when doing full shard
+        corrected_scalar = float(packing_max_batch_len) / float(
+            total_minibatch_loss_tokens_seen
+        )
         for p in model.parameters():
             grad = p.grad
             assert grad is not None
-            grad.mul_(1.0 / total_minibatch_loss_tokens_seen)
+            grad.mul_(corrected_scalar)

         global_grad_norm = accelerator.clip_grad_norm_(model.parameters(), 1.0)
         optimizer.step()
@@ -700,6 +722,8 @@ def main(args):
         tokenizer,
         train_loader,
         grad_accum,
+        metric_logger,
+        packing_max_batch_len,
     )

     torch.distributed.barrier()
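
For readers following the scaling argument in the comment block added above, here is a minimal, self-contained sketch of the scheme in plain PyTorch. It is not the code from main_ds.py: the model call, the microbatch loop, the -100 label convention for counting loss tokens, and the local clip_grad_norm_ call are illustrative assumptions, while packing_max_batch_len (N) and total_minibatch_loss_tokens_seen (T) mirror the names used in the diff.

    import torch

    def accumulation_step(model, optimizer, microbatches, packing_max_batch_len):
        """One optimizer step over several microbatches using intermediate 1/N scaling."""
        optimizer.zero_grad()
        total_minibatch_loss_tokens_seen = 0

        for batch in microbatches:
            # Assumption: the model returns a summed (not averaged) token loss.
            out = model(**batch)
            # Assumption: labels use -100 for positions that do not contribute to the loss.
            total_minibatch_loss_tokens_seen += int(batch["labels"].ne(-100).sum())

            # Step 1: scale by the constant 1/N so each backward pass stays bounded.
            scaled_loss = out.loss / packing_max_batch_len
            scaled_loss.backward()  # grads accumulate as 1/N * (g1 + g2 + ...)

        # Steps 3-4: rescale by C = N/T so the effective divisor becomes the true count T.
        corrected_scalar = float(packing_max_batch_len) / float(total_minibatch_loss_tokens_seen)
        for p in model.parameters():
            if p.grad is not None:
                p.grad.mul_(corrected_scalar)  # grads now hold 1/T * (g1 + g2 + ...)

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

With, say, N = 60000 and T = 48321, each microbatch loss is divided by the constant 60000 and the accumulated gradient is multiplied by 60000 / 48321 ≈ 1.24; algebraically that equals dividing by 48321 directly, but the intermediate 1/N scaling keeps every backward pass bounded by a fixed constant, which is the numerical-stability point of the commit. In the actual training loop, clipping goes through accelerator.clip_grad_norm_, and per step 1 of the comment the token count is presumably accumulated across ranks rather than locally as in this sketch.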

0 commit comments