Commit 08b0f91

command-line override to forcibly untie embeddings for llama3.2 models
1 parent 5b3bac2 commit 08b0f91

File tree: 3 files changed (+63, -7 lines)

.github/workflows/ci_gpu.yml
test_llama3.cu
train_llama3.py
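
For context (not part of the diff): Llama 3.2 checkpoints ship with tied embeddings, meaning the token-embedding matrix is reused as the LM-head projection. The new --untie switch lets the converted model train those as two independent parameters instead. A minimal PyTorch sketch of the distinction, using toy sizes (Llama-3.2-1B itself uses a 128256-token vocabulary and width 2048):

    # Illustration only, not llm.c code.
    import torch.nn as nn

    vocab_size, n_embd = 1000, 64
    wte = nn.Embedding(vocab_size, n_embd)               # token embedding table
    lm_head = nn.Linear(n_embd, vocab_size, bias=False)  # projection to logits

    # Tied: the LM head reuses the embedding matrix, so there is one parameter tensor.
    lm_head.weight = wte.weight

    # Untied: the LM head starts as a copy and can drift away during training.
    lm_head.weight = nn.Parameter(wte.weight.detach().clone())

Untying adds a second vocab_size x n_embd matrix to the parameter count and the training dynamics differ slightly, which is why the test below carries separate expected-loss tables for the two modes.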

.github/workflows/ci_gpu.yml

Lines changed: 39 additions & 4 deletions
@@ -119,6 +119,7 @@ jobs:
        run: ./test_gpt2fp32cu

  build-and-test-llama3:
+    name: Build and test LLama3.2 1B
    runs-on: ubicloud-gpu-standard-1-latest
    env:
      HF_TOKEN: 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
@@ -154,18 +155,52 @@ jobs:
      - name: Build BF16 precision
        run: PRECISION=BF16 make train_llama3cu test_llama3cu

-      - name: Run default
+      - name: Run default (BF16)
        run: ./test_llama3cu

-      - name: Run no recompute GeLU
+      - name: Run no recompute GeLU (BF16)
        run: ./test_llama3cu -r 0

-      - name: Run no master weights
+      - name: Run no master weights (BF16)
        run: ./test_llama3cu -w 0

-      - name: Run recompute LN
+      - name: Run recompute LN (BF16)
        run: ./test_llama3cu -r 2

+  build-and-test-llama3-untied:
+    name: Build and test LLama3.2 1B with untie weights
+    runs-on: ubicloud-gpu-standard-1-latest
+    env:
+      HF_TOKEN: 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
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - run: echo "::add-mask::$HF_TOKEN"
+
+      - name: Install OpenMP
+        run: sudo apt-get update && sudo apt-get install -y libomp-dev
+
+      - name: Install dependencies
+        run: pip install -r requirements.txt
+
+      - name: Run preprocessing
+        run: python dev/data/tinyshakespeare.py --model_desc llama-3
+
+      - name: Train model
+        run: python train_llama3.py --write_tensors 1 --dtype float32 --untie 1
+
+      - name: Build FP32 precision
+        run: PRECISION=FP32 make test_llama3cu
+
+      - name: Run default
+        run: ./test_llama3cu
+
+      - name: Build BF16 precision
+        run: PRECISION=BF16 make train_llama3cu test_llama3cu
+
+      - name: Run default
+        run: ./test_llama3cu
+
  unit-tests-gpu:
    runs-on: ubicloud-gpu-standard-1-latest
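
If you want to approximate the new build-and-test-llama3-untied job locally, the steps boil down to the commands below (a rough sketch, not part of the commit; it assumes a CUDA GPU, the llm.c repository root as the working directory, and HF access to the gated meta-llama checkpoint):

    # Mirrors the workflow steps above; run from the llm.c repo root.
    import subprocess

    commands = [
        "pip install -r requirements.txt",
        "python dev/data/tinyshakespeare.py --model_desc llama-3",
        "python train_llama3.py --write_tensors 1 --dtype float32 --untie 1",
        "PRECISION=FP32 make test_llama3cu",
        "./test_llama3cu",
        "PRECISION=BF16 make train_llama3cu test_llama3cu",
        "./test_llama3cu",
    ]
    for cmd in commands:
        subprocess.run(cmd, shell=True, check=True)  # stop at the first failure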

test_llama3.cu

Lines changed: 15 additions & 1 deletion
@@ -301,7 +301,7 @@ int main(int argc, char *argv[]) {
    }

    // expected losses are as follows, from Python (without CPUOffload)
-    float expected_losses[10] = {
+    float expected_losses_untied[10] = {
        4.849688f,
        3.070303f,
        1.711614f,
@@ -313,6 +313,20 @@ int main(int argc, char *argv[]) {
        0.355562f,
        0.334824f
    };
+    float expected_losses_tied[10] = {
+        4.849688f,
+        3.072875f,
+        1.714160f,
+        1.060224f,
+        0.596433f,
+        0.431257f,
+        0.373330f,
+        0.361544f,
+        0.357920f,
+        0.336123f
+    };
+
+    float* expected_losses = model.config.tied_weights ? expected_losses_tied : expected_losses_untied;

    // compare
    for (int i = 0; i < 10; i++) {
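
The test now picks its reference table from the tied_weights flag stored in the model config, so the tied default and the new untied CI variant are covered by one binary. To check which mode a given HuggingFace checkpoint starts from, one option is the snippet below (a sketch, not part of this commit; the meta-llama repo is gated, so it needs an authenticated HF login):

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
    print(model.config.tie_word_embeddings)    # True for Llama-3.2-1B
    emb = model.get_input_embeddings().weight
    head = model.get_output_embeddings().weight
    print(emb.data_ptr() == head.data_ptr())   # tied weights share one storage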

train_llama3.py

Lines changed: 9 additions & 2 deletions
@@ -435,10 +435,16 @@ def unpermute(w, n_heads, dim1, dim2):
        return checkpoint

    @classmethod
-    def from_pretrained_llama3_hf(cls, model_id):
+    def from_pretrained_llama3_hf(cls, model_id, untie):
        """Loads pretrained LLaMA model weights from HuggingFace"""
        from transformers import AutoModelForCausalLM, AutoTokenizer
        model_args = MODEL_DICT[model_id]
+        if untie:
+            if not model_args.tied_embeddings:
+                print("Model embeddings are not tied, --untie has no effect.")
+            else:
+                print("Untying token embeddings and LM head.")
+                model_args.tied_embeddings = False

        model = AutoModelForCausalLM.from_pretrained(model_id)
        checkpoint = LLaMA.adapt_llama_state_dict_keys_hf(model.state_dict(), model_args)
@@ -1026,6 +1032,7 @@ def print0(*args, **kwargs):
    parser.add_argument("--input_val_bin", type=str, default="", help="input .bin to eval validation loss on")
    parser.add_argument("--output_dir", type=str, default="", help="output directory to which to write logs and checkpoints")
    parser.add_argument("--model", type=str, default="meta-llama/Llama-3.2-1B", help="chose the llama model")
+    parser.add_argument("--untie", type=int, default=False, help="Untie token embeddings and LM-head, even if they are tied in the checkpoint.")
    # token layout for each step of the optimization
    parser.add_argument("--batch_size", type=int, default=4, help="batch size, in units of #batch dimensions")
    parser.add_argument("--sequence_length", type=int, default=64, help="sequence length")
@@ -1131,7 +1138,7 @@ def print0(*args, **kwargs):

    # init the model
    if args.use_hf:
-        model = LLaMA.from_pretrained_llama3_hf(args.model)
+        model = LLaMA.from_pretrained_llama3_hf(args.model, args.untie)
    else: # use Meta's checkpoint
        assert args.ckpt_dir is not None and os.path.exists(args.ckpt_dir), f"llama3 ckpt dir {args.ckpt_dir} does not exist"
        assert args.tokenizer_path is not None and os.path.exists(args.tokenizer_path), f"llama3 tokenizer path {args.tokenizer_path} does not exist"
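
Note that from_pretrained_llama3_hf only flips model_args.tied_embeddings; since the HuggingFace checkpoint stores a single shared matrix, the now-independent LM head presumably has to be seeded with a copy of the token embedding somewhere downstream (e.g. in adapt_llama_state_dict_keys_hf or the model constructor). A hedged sketch of that step, with hypothetical key names:

    import torch

    def untie_lm_head(state_dict, tied_embeddings):
        # Hypothetical keys for illustration; not the commit's actual code.
        if not tied_embeddings and "lm_head.weight" not in state_dict:
            # Seed the independent head with a copy of the embedding matrix.
            state_dict["lm_head.weight"] = state_dict["model.embed_tokens.weight"].clone()
        return state_dict

    # Toy usage with a dummy tensor:
    sd = {"model.embed_tokens.weight": torch.zeros(8, 4)}
    sd = untie_lm_head(sd, tied_embeddings=False)
    assert sd["lm_head.weight"] is not sd["model.embed_tokens.weight"]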
