
Commit 2d32730

don't forget about gemma, nicer docstrings, fix a dora bug
1 parent 8440cee commit 2d32730

File tree

3 files changed: +50, -21 lines


torchtune/models/gemma/_component_builders.py

Lines changed: 0 additions & 7 deletions
@@ -289,7 +289,6 @@ def lora_gemma_self_attention(
             alpha=lora_alpha,
             dropout=lora_dropout,
             quantize_base=quantize_base,
-            use_dora=use_dora,
         )
         if "q_proj" in lora_modules
         else (
@@ -306,7 +305,6 @@ def lora_gemma_self_attention(
             alpha=lora_alpha,
             dropout=lora_dropout,
             quantize_base=quantize_base,
-            use_dora=use_dora,
         )
         if "k_proj" in lora_modules
         else (
@@ -323,7 +321,6 @@ def lora_gemma_self_attention(
             alpha=lora_alpha,
             dropout=lora_dropout,
             quantize_base=quantize_base,
-            use_dora=use_dora,
         )
         if "v_proj" in lora_modules
         else (
@@ -340,7 +337,6 @@ def lora_gemma_self_attention(
             alpha=lora_alpha,
             dropout=lora_dropout,
             quantize_base=quantize_base,
-            use_dora=use_dora,
         )
         if "output_proj" in lora_modules
         else (
@@ -385,7 +381,6 @@ def lora_gemma_mlp(
         alpha=lora_alpha,
         dropout=lora_dropout,
         quantize_base=quantize_base,
-        use_dora=use_dora,
     )
     down_proj = adapter_cls(
         in_dim=hidden_dim,
@@ -394,7 +389,6 @@ def lora_gemma_mlp(
         alpha=lora_alpha,
         dropout=lora_dropout,
         quantize_base=quantize_base,
-        use_dora=use_dora,
     )
     up_proj = adapter_cls(
         in_dim=dim,
@@ -403,7 +397,6 @@ def lora_gemma_mlp(
         alpha=lora_alpha,
         dropout=lora_dropout,
         quantize_base=quantize_base,
-        use_dora=use_dora,
     )
     activation = nn.GELU(approximate="tanh")
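
This is the "fix a dora bug" part of the commit: the Gemma builders already select `adapter_cls` according to whether DoRA is enabled, so the DoRA-vs-LoRA decision lives in which class gets constructed, and forwarding `use_dora` again as a constructor kwarg is unnecessary and fails if the adapter class does not accept it. Below is a minimal sketch of that selection pattern, assuming the adapters are torchtune's `LoRALinear`/`DoRALinear` with simplified keyword arguments; the helper function is illustrative, not part of this diff.

# Illustrative sketch (not the torchtune source): the DoRA/LoRA choice is encoded
# in which adapter class is constructed, so the adapter constructors themselves
# take no `use_dora` kwarg -- which is why the kwarg is deleted above.
from torchtune.modules.peft import DoRALinear, LoRALinear


def build_adapter(in_dim: int, out_dim: int, use_dora: bool = False):
    # Pick the adapter class once; past this point `use_dora` is no longer needed.
    adapter_cls = DoRALinear if use_dora else LoRALinear
    return adapter_cls(
        in_dim=in_dim,
        out_dim=out_dim,
        rank=8,
        alpha=16.0,
        dropout=0.0,
        # use_dora=use_dora,  # the removed kwarg: neither adapter class accepts it
    )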

torchtune/models/gemma/transformer.py

Lines changed: 24 additions & 8 deletions
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Optional
+from typing import List, Optional
 
 import torch
 import torch.nn as nn
@@ -98,6 +98,28 @@ def setup_caches(self, batch_size: int, dtype: torch.dtype) -> None:
             torch.ones(self.max_seq_len, self.max_seq_len, dtype=torch.bool)
         )
 
+    @torch.compiler.disable
+    def chunked_output(self, last_hidden_state: torch.Tensor) -> List[torch.Tensor]:
+        """
+        Apply output projection in chunks. This should be applied in conjunction with
+        :class:`~torchtune.modules.loss.CEWithChunkedOutputLoss` as upcasting to fp32 is done there.
+
+        To use this method, you should first call
+        :func:`~torchtune.models.gemma.GemmaTransformerDecoder.set_num_output_chunks`.
+
+        Args:
+            last_hidden_state (torch.Tensor): last hidden state of the decoder, having shape
+                [b, seq_len, embed_dim].
+
+        Returns:
+            List[torch.Tensor]: List of num_chunks output tensors, each with shape
+                [b, seq_len/num_chunks, out_dim], where out_dim is usually the vocab size.
+        """
+        return [
+            F.linear(chunk, self.tok_embeddings.weight)
+            for chunk in last_hidden_state.chunk(self.num_output_chunks, dim=1)
+        ]
+
     def forward(
         self,
         tokens: torch.Tensor,
@@ -168,13 +190,7 @@ def forward(
         h = self.norm(h)
 
         if self.num_output_chunks > 0:
-            # shape: [b, seq_len/num_chunks, out_dim] - out_dim is usually the vocab size
-            # Used with CEWithChunkedOutputLoss. Need to set num_output_chunks in the recipe,
-            # before calling forward. Upcasting it done inside of the loss function.
-            output = [
-                F.linear(chunk, self.tok_embeddings.weight)
-                for chunk in h.chunk(self.num_output_chunks, dim=1)
-            ]
+            output = self.chunked_output(h)
         else:
             # shape: [b, seq_len, out_dim]
             output = F.linear(h, self.tok_embeddings.weight).float()
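
The new `chunked_output` method is the same tied-embedding projection the forward pass previously inlined, factored out into its own method (and kept out of `torch.compile` via the decorator). The following self-contained sketch, with made-up shapes, shows what the chunking does and that concatenating the chunked logits recovers the unchunked projection.

# Self-contained illustration of the chunked output projection (shapes are illustrative).
import torch
import torch.nn.functional as F

b, seq_len, embed_dim, vocab = 2, 8, 16, 32
num_output_chunks = 4

tok_embeddings = torch.nn.Embedding(vocab, embed_dim)   # tied input/output embedding
h = torch.randn(b, seq_len, embed_dim)                  # last hidden state of the decoder

# What chunked_output does: project seq_len/num_chunks tokens at a time.
chunked = [
    F.linear(chunk, tok_embeddings.weight)
    for chunk in h.chunk(num_output_chunks, dim=1)
]

# Unchunked projection for comparison; stitching the chunks back gives the same logits.
full = F.linear(h, tok_embeddings.weight)
assert torch.allclose(torch.cat(chunked, dim=1), full, atol=1e-5)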

torchtune/modules/transformer.py

Lines changed: 26 additions & 6 deletions
@@ -383,9 +383,19 @@ def reset_caches(self):
     @torch.compiler.disable
     def chunked_output(self, last_hidden_state: torch.Tensor) -> List[torch.Tensor]:
         """
-        shape: [b, seq_len/num_chunks, out_dim] - out_dim is usually the vocab size
-        Used with CEWithChunkedOutputLoss. Need to set num_output_chunks in the recipe,
-        before calling forward. Upcasting it done inside of the loss function.
+        Apply output projection in chunks. This should be applied in conjunction with
+        :class:`~torchtune.modules.loss.CEWithChunkedOutputLoss` as upcasting to fp32 is done there.
+
+        To use this method, you should first call
+        :func:`~torchtune.modules.TransformerDecoder.set_num_output_chunks`.
+
+        Args:
+            last_hidden_state (torch.Tensor): last hidden state of the decoder, having shape
+                [b, seq_len, embed_dim].
+
+        Returns:
+            List[torch.Tensor]: List of num_chunks output tensors, each with shape
+                [b, seq_len/num_chunks, out_dim], where out_dim is usually the vocab size.
         """
         return [
             self.output(chunk)
@@ -604,9 +614,19 @@ def reset_caches(self):
     @torch.compiler.disable
     def chunked_output(self, last_hidden_state: torch.Tensor) -> List[torch.Tensor]:
         """
-        shape: [b, seq_len/num_chunks, out_dim] - out_dim is usually the vocab size
-        Used with CEWithChunkedOutputLoss. Need to set num_output_chunks in the recipe,
-        before calling forward. Upcasting it done inside of the loss function.
+        Apply output projection in chunks. This should be applied in conjunction with
+        :class:`~torchtune.modules.loss.CEWithChunkedOutputLoss` as upcasting to fp32 is done there.
+
+        To use this method, you should first call
+        :func:`~torchtune.modules.TiedEmbeddingTransformerDecoder.set_num_output_chunks`.
+
+        Args:
+            last_hidden_state (torch.Tensor): last hidden state of the decoder, having shape
+                [b, seq_len, embed_dim].
+
+        Returns:
+            List[torch.Tensor]: List of num_chunks output tensors, each with shape
+                [b, seq_len/num_chunks, out_dim], where out_dim is usually the vocab size.
         """
         return [
             F.linear(chunk, self.tok_embeddings.weight)
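
Both docstrings now point at the intended pairing: call `set_num_output_chunks` on the decoder, then feed the resulting list of logit chunks to `CEWithChunkedOutputLoss`, which does the fp32 upcast internally. The sketch below builds the chunk list by hand instead of running a model; the loss interface (a `num_output_chunks` attribute and a `forward(logits_list, labels)` call) is assumed from its usage here, not shown in this diff.

# Hedged sketch of the usage the docstrings describe; loss interface assumed, not shown here.
import torch
from torchtune.modules.loss import CEWithChunkedOutputLoss

loss_fn = CEWithChunkedOutputLoss()       # upcasts each chunk to fp32 internally
num_chunks = loss_fn.num_output_chunks    # a recipe would pass this to model.set_num_output_chunks

# Stand-in for the decoder's chunked output: a list of [b, seq_len/num_chunks, vocab] tensors.
b, seq_len, vocab = 2, 8, 128
logits = [torch.randn(b, seq_len // num_chunks, vocab) for _ in range(num_chunks)]
labels = torch.randint(0, vocab, (b, seq_len))

loss = loss_fn(logits, labels)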
