
Commit a226a58

added tie_word_embeddings to llama3_2 models (#2331)
1 parent 9f14fe9 · commit a226a58

File tree

torchtune/models/llama3_2/_component_builders.py
torchtune/models/llama3_2/_model_builders.py

2 files changed: +34 −4 lines

torchtune/models/llama3_2/_component_builders.py

Lines changed: 14 additions & 2 deletions
@@ -52,6 +52,7 @@ def llama3_2(
     intermediate_dim: Optional[int] = None,
     norm_eps: float = 1e-5,
     scale_factor: int = 32,
+    tie_word_embeddings: bool = True,
 ) -> TransformerDecoder:
     """
     Build the decoder associated with the Llama3.2 model. This includes:
@@ -78,6 +79,7 @@ def llama3_2(
             this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp`
         norm_eps (float): epsilon in RMS norms.
         scale_factor (int): scaling factor for RoPE. Default: 32
+        tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied.

     Returns:
         TransformerDecoder: Instantiation of Llama3.2 model.
@@ -112,7 +114,11 @@ def llama3_2(
         layers.append(layer)

     tok_embeddings = nn.Embedding(vocab_size, embed_dim)
-    output_proj = TiedLinear(tok_embeddings)
+    if tie_word_embeddings:
+        output_proj = TiedLinear(tok_embeddings)
+    else:
+        output_proj = nn.Linear(embed_dim, vocab_size, bias=False)
+
     return TransformerDecoder(
         tok_embeddings=tok_embeddings,
         layers=layers,
@@ -161,6 +167,7 @@ def lora_llama3_2(
     use_dora: bool = False,
     # Quantization args
     quantize_base: bool = False,
+    tie_word_embeddings: bool = True,
 ) -> TransformerDecoder:
     """
     Return a version of Llama3.2 (an instance of :func:`~torchtune.modules.TransformerDecoder`)
@@ -197,6 +204,7 @@ def lora_llama3_2(
         quantize_base: (bool): Whether to quantize base model weights or not. Only applied to base
             weights within linear layers LoRA is applied to. The final output linear projection is not
             supported for quantization currently.
+        tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied.

     Returns:
         TransformerDecoder: Instantiation of Llama3.2 model with LoRA applied to
@@ -254,7 +262,11 @@ def lora_llama3_2(
                 "apply_lora_to_output is currently not supporting in llama3.2 1b and 3b,"
                 "as the projection layer weights are tied to the embeddings"
             )
-        output_proj = TiedLinear(tok_embeddings)
+        if tie_word_embeddings:
+            output_proj = TiedLinear(tok_embeddings)
+        else:
+            output_proj = nn.Linear(embed_dim, vocab_size, bias=False)
+
     model = TransformerDecoder(
         tok_embeddings=tok_embeddings,
         layers=layers,
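
Note: the component builder above now switches between two output-projection strategies. The following is a minimal plain-PyTorch sketch of that distinction, not torchtune internals; the vocab and embedding sizes are Llama3.2-1B-style numbers assumed purely for illustration.

# Sketch of the two output projections the builder chooses between.
import torch
import torch.nn as nn
import torch.nn.functional as F

vocab_size, embed_dim = 128_256, 2048  # assumed, Llama3.2-1B-style sizes

tok_embeddings = nn.Embedding(vocab_size, embed_dim)

# tie_word_embeddings=True: the output head reuses the embedding weight,
# in the spirit of TiedLinear(tok_embeddings) in the diff above.
def tied_output(hidden: torch.Tensor) -> torch.Tensor:
    return F.linear(hidden, tok_embeddings.weight)

# tie_word_embeddings=False: an independent projection with its own weight.
untied_output = nn.Linear(embed_dim, vocab_size, bias=False)

hidden = torch.randn(2, 16, embed_dim)   # (batch, seq_len, embed_dim)
print(tied_output(hidden).shape)         # torch.Size([2, 16, 128256])
print(untied_output(hidden).shape)       # torch.Size([2, 16, 128256])

# The untied head carries vocab_size * embed_dim extra parameters (~263M with
# these sizes), consistent with tie_word_embeddings defaulting to True above.
print(sum(p.numel() for p in untied_output.parameters()))  # 262668288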

torchtune/models/llama3_2/_model_builders.py

Lines changed: 20 additions & 2 deletions
@@ -16,10 +16,15 @@
 the llama3_2_1b model builder uses the llama3_2 component builder to create the
 Llama3.2 1B model.
 """
-def llama3_2_1b() -> TransformerDecoder:
+def llama3_2_1b(
+    tie_word_embeddings: bool = True,
+) -> TransformerDecoder:
     """
     Builder for creating a Llama3.2 model initialized w/ the default 1b parameter values.

+    Args:
+        tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied.
+
     Returns:
         TransformerDecoder: Instantiation of Llama3.2 1B model
     """
@@ -35,11 +40,17 @@ def llama3_2_1b() -> TransformerDecoder:
         norm_eps=1e-5,
         rope_base=500_000,
         scale_factor=32,
+        tie_word_embeddings=tie_word_embeddings,
     )
-def llama3_2_3b() -> TransformerDecoder:
+def llama3_2_3b(
+    tie_word_embeddings: bool = True,
+) -> TransformerDecoder:
     """
     Builder for creating a Llama3.2 model initialized w/ the default 3b parameter values.

+    Args:
+        tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied.
+
     Returns:
         TransformerDecoder: Instantiation of Llama3.2 3B model
     """
@@ -55,6 +66,7 @@ def llama3_2_3b() -> TransformerDecoder:
         norm_eps=1e-5,
         rope_base=500_000,
         scale_factor=32,
+        tie_word_embeddings=tie_word_embeddings,
     )
 def lora_llama3_2_1b(
     lora_attn_modules: List[LORA_ATTN_MODULES],
@@ -65,6 +77,7 @@ def lora_llama3_2_1b(
     lora_dropout: float = 0.0,
     use_dora: bool = False,
     quantize_base: bool = False,
+    tie_word_embeddings: bool = True,
 ) -> TransformerDecoder:
     """
     Builder for creating a Llama3.2 1B model with LoRA enabled.
@@ -86,6 +99,7 @@ def lora_llama3_2_1b(
         use_dora (bool): Decompose the LoRA weight into magnitude and direction, as
            introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353).
         quantize_base (bool): Whether to quantize base model weights
+        tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied.

     Returns:
         TransformerDecoder: Instantiation of Llama3.2 1B model with LoRA applied
@@ -110,6 +124,7 @@ def lora_llama3_2_1b(
         lora_dropout=lora_dropout,
         use_dora=use_dora,
         quantize_base=quantize_base,
+        tie_word_embeddings=tie_word_embeddings,
     )
 def lora_llama3_2_3b(
     lora_attn_modules: List[LORA_ATTN_MODULES],
@@ -120,6 +135,7 @@ def lora_llama3_2_3b(
     lora_dropout: float = 0.0,
     use_dora: bool = False,
     quantize_base: bool = False,
+    tie_word_embeddings: bool = True,
 ) -> TransformerDecoder:
     """
     Builder for creating a Llama3.2 3B model with LoRA enabled.
@@ -141,6 +157,7 @@ def lora_llama3_2_3b(
         use_dora (bool): Decompose the LoRA weight into magnitude and direction, as
            introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353).
         quantize_base (bool): Whether to quantize base model weights
+        tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied.

     Returns:
         TransformerDecoder: Instantiation of Llama3.2 3B model with LoRA applied
@@ -166,6 +183,7 @@ def lora_llama3_2_3b(
         lora_dropout=lora_dropout,
         use_dora=use_dora,
         quantize_base=quantize_base,
+        tie_word_embeddings=tie_word_embeddings,
     )
 qlora_llama3_2_1b = partial(lora_llama3_2_1b, quantize_base=True)
 qlora_llama3_2_1b.__doc__ = """
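
With the model builders updated, the flag can be threaded through from user code. Below is a hedged usage sketch, assuming torchtune at this commit is installed; the `.output` attribute name on TransformerDecoder is an assumption made for illustration.

# Usage sketch for the updated builders (assumptions noted above).
from torchtune.models.llama3_2 import llama3_2_1b

tied = llama3_2_1b()                             # default: tie_word_embeddings=True
untied = llama3_2_1b(tie_word_embeddings=False)  # separate lm-head weight

print(type(tied.output).__name__)    # expected: TiedLinear
print(type(untied.output).__name__)  # expected: Linear

# The untied variant should report more parameters, since its output
# projection no longer shares storage with tok_embeddings.
n_tied = sum(p.numel() for p in tied.parameters())
n_untied = sum(p.numel() for p in untied.parameters())
print(n_untied - n_tied)

The LoRA builders (lora_llama3_2_1b, lora_llama3_2_3b) accept the same keyword, so recipes that previously relied on the implicitly tied projection can now opt out explicitly.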
