
Commit 12cbc4b

felipemello1 and Felipe Mello authored
[doc][modules] Update to modules documentation (#1079)
Co-authored-by: Felipe Mello <[email protected]>
1 parent 2fe9a70 · commit 12cbc4b

File tree: 9 files changed (+59, -40 lines)

docs/source/api_ref_modules.rst

Lines changed: 13 additions & 1 deletion
@@ -29,6 +29,7 @@ Tokenizers

     tokenizers.SentencePieceTokenizer
     tokenizers.TikTokenTokenizer
+    tokenizers.Tokenizer

 PEFT Components
 ---------------
@@ -41,7 +42,9 @@ PEFT Components
     peft.AdapterModule
     peft.get_adapter_params
     peft.set_trainable_params
-    peft.validate_state_dict_for_lora
+    peft.validate_missing_and_unexpected_for_lora
+    peft.validate_state_dict_for_lora
+    peft.disable_adapter

 Module Utilities
 ------------------
@@ -52,3 +55,12 @@ These are utilities that are common to and can be used by all modules.
     :nosignatures:

     common_utils.reparametrize_as_dtype_state_dict_post_hook
+
+Loss
+------------------
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    loss.DPOLoss
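
The newly documented loss.DPOLoss implements Direct Preference Optimization. For orientation only, here is a minimal sketch of the standard DPO objective in plain PyTorch; the function name and argument layout are illustrative, not torchtune's exact API:

    import torch
    import torch.nn.functional as F

    def dpo_loss_sketch(
        policy_chosen_logps: torch.Tensor,
        policy_rejected_logps: torch.Tensor,
        reference_chosen_logps: torch.Tensor,
        reference_rejected_logps: torch.Tensor,
        beta: float = 0.1,
    ) -> torch.Tensor:
        # Log-ratios of policy vs. reference for the chosen and rejected responses.
        chosen_logratios = policy_chosen_logps - reference_chosen_logps
        rejected_logratios = policy_rejected_logps - reference_rejected_logps
        # DPO pushes the chosen log-ratio above the rejected one, scaled by beta.
        logits = beta * (chosen_logratios - rejected_logratios)
        return -F.logsigmoid(logits).mean()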

torchtune/modules/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -22,6 +22,5 @@
     "RMSNorm",
     "TransformerDecoder",
     "TransformerDecoderLayer",
-    "TransformerClassifier",
     "reparametrize_as_dtype_state_dict_post_hook",
 ]

torchtune/modules/kv_cache.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ def reset(self) -> None:
     def update(
         self, input_pos: Tensor, k_val: Tensor, v_val: Tensor
     ) -> Tuple[Tensor, Tensor]:
-        """Update KV cache and return the updated cache.
+        """Update KV cache with the new k_val, v_val and return the updated cache.

         Args:
             input_pos (Tensor): Current position tensor with shape [S]
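
A KV cache of this kind writes the newly computed key/value tensors into preallocated buffers at the positions given by input_pos and returns the full cache. A minimal standalone sketch; the buffer names and shapes are assumptions, not torchtune's implementation:

    import torch
    from torch import Tensor
    from typing import Tuple

    class TinyKVCache:
        def __init__(self, batch_size: int, num_heads: int, max_seq_len: int, head_dim: int):
            # Preallocated buffers of shape [batch_size, num_heads, max_seq_len, head_dim].
            self.k_cache = torch.zeros(batch_size, num_heads, max_seq_len, head_dim)
            self.v_cache = torch.zeros(batch_size, num_heads, max_seq_len, head_dim)

        def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor) -> Tuple[Tensor, Tensor]:
            # Write the new keys/values at the S positions in input_pos, then return the
            # full caches so attention can attend over every cached position.
            self.k_cache[:, :, input_pos] = k_val
            self.v_cache[:, :, input_pos] = v_val
            return self.k_cache, self.v_cache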

torchtune/modules/lr_schedulers.py

Lines changed: 5 additions & 1 deletion
@@ -38,12 +38,16 @@ def get_cosine_schedule_with_warmup(
         torch.optim.lr_scheduler.LambdaLR with the appropriate schedule.
     """

-    def lr_lambda(current_step):
+    def lr_lambda(current_step: int) -> float:
+        # linear warmup phase
         if current_step < num_warmup_steps:
             return current_step / max(1, num_warmup_steps)
+
+        # cosine
         progress = (current_step - num_warmup_steps) / max(
             1, num_training_steps - num_warmup_steps
         )
+
         cosine_lr_multiple = 0.5 * (
             1.0 + math.cos(math.pi * num_cycles * 2.0 * progress)
         )
torchtune/modules/peft/peft_utils.py

Lines changed: 31 additions & 11 deletions
@@ -241,23 +241,43 @@ def get_merged_lora_ckpt(

 @contextlib.contextmanager
 def disable_adapter(model: nn.Module) -> Generator[None, None, None]:
-    for _, v in model.named_modules():
+    """
+    Temporarily disable the adapters in a neural network model. This can be used,
+    for example, in DPO for treating the lora adapters as the policy model
+    and disabling it to treat the base model as the reference model.
+
+    This context manager goes through all modules in the provided neural network model,
+    and if a module has an 'adapter_params' attribute that is callable and a 'disabled' attribute,
+    it sets 'disabled' to True. Then, the control is given back to caller. Once that finalizes,
+    it sets 'disabled' back to False for all modules that were temporarily disabled.
+
+    Args:
+        model (nn.Module): The neural network model whose adapters are to be temporarily disabled.
+    Yields:
+        None: This function yields control back to the caller, with the adapters disabled.
+    Example:
+        >>> with disable_adapter(model):
+        ...     # Perform operations with adapters disabled
+        ...     pass
+
+    """
+    for _, module in model.named_modules():
         if (
-            hasattr(v, "adapter_params")
-            and callable(v.adapter_params)
-            and hasattr(v, "disabled")
+            hasattr(module, "adapter_params")
+            and callable(module.adapter_params)
+            and hasattr(module, "disabled")
         ):
-            v.disabled = True
+            module.disabled = True
     try:
         yield
     finally:
-        for _, v in model.named_modules():
+        for _, module in model.named_modules():
             if (
-                hasattr(v, "adapter_params")
-                and callable(v.adapter_params)
-                and hasattr(v, "disabled")
+                hasattr(module, "adapter_params")
+                and callable(module.adapter_params)
+                and hasattr(module, "disabled")
             ):
-                v.disabled = False
+                module.disabled = False


 def validate_missing_and_unexpected_for_lora(

@@ -272,7 +292,7 @@ def validate_missing_and_unexpected_for_lora(
     """
     A more memory-efficient way to validate that LoRA state dict loading was done properly.

-    Similar to validate_state_dict_for_lora, this function uses a model's LoRA config to
+    Similar to :func:`validate_state_dict_for_lora`, this function uses a model's LoRA config to
     check that LoRA and/or base model weights are loaded into the full model correctly.
     Unlike that function, this method relies only on the values of missing and unexpected
     as returned by the load_state_dict API with strict=False. This allows us to do the
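
The DPO use case described in the new docstring looks roughly like this in practice. A sketch, assuming model is a LoRA-wrapped torchtune decoder that returns logits and tokens is a batch of token ids; only the disable_adapter context manager comes from the code above:

    import torch
    from torchtune.modules.peft import disable_adapter

    def policy_and_reference_logits(model: torch.nn.Module, tokens: torch.Tensor):
        # Adapters active: the LoRA-augmented model acts as the DPO policy.
        policy_logits = model(tokens)

        # Adapters temporarily disabled: the same weights act as the frozen reference model.
        with disable_adapter(model), torch.no_grad():
            reference_logits = model(tokens)

        return policy_logits, reference_logits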

torchtune/modules/tokenizers/_sentencepiece.py

Lines changed: 2 additions & 2 deletions
@@ -60,8 +60,8 @@ def encode(

         Args:
             text (str): The input text to be encoded, unbatched.
-            add_bos (bool): Whether to prepend BOS to the input, defaults to True.
-            add_eos (bool): Whether to append EOS to the input, defaults to True.
+            add_bos (bool): Whether to prepend BOS special token (Beginning of Sentence) to the input, defaults to True.
+            add_eos (bool): Whether to append EOS special token (End of Sentence) to the input, defaults to True.
             trim_leading_whitespace (bool): Whether to trim leading whitespace from
                 underlying sentencepiece tokenization. Sentencepiece normally prepends
                 whitespace to any tokenized text, which can cause differences where
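
In practice the two flags control the special tokens wrapped around the encoded ids. A sketch, assuming a SentencePieceTokenizer built from a placeholder model path and a tokenizer that exposes bos_id/eos_id attributes:

    from torchtune.modules.tokenizers import SentencePieceTokenizer

    tokenizer = SentencePieceTokenizer("/path/to/tokenizer.model")  # placeholder path

    with_specials = tokenizer.encode("hello world")                        # BOS and EOS added by default
    plain = tokenizer.encode("hello world", add_bos=False, add_eos=False)  # raw sentencepiece ids only

    assert with_specials[0] == tokenizer.bos_id
    assert with_specials[-1] == tokenizer.eos_id
    assert len(with_specials) == len(plain) + 2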

torchtune/modules/tokenizers/_tiktoken.py

Lines changed: 4 additions & 4 deletions
@@ -290,11 +290,11 @@ def decode(
         """
         if truncate_at_eos:
             try:
-                k = token_ids.index(self.eos_id)
+                idx_eos = token_ids.index(self.eos_id)
             except ValueError:
-                k = None
-            if k:
-                token_ids = token_ids[:k]
+                idx_eos = None
+            if idx_eos:
+                token_ids = token_ids[:idx_eos]
         token_ids = [token_id for token_id in token_ids if token_id != self.bos_id]
         return self.tt_model.decode(token_ids)
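
The rename is behavior-preserving; the truncate_at_eos branch works like this standalone rendition, where eos_id and the token ids are arbitrary example values:

    eos_id = 2
    token_ids = [5, 17, 42, 2, 9, 9]

    try:
        idx_eos = token_ids.index(eos_id)
    except ValueError:
        idx_eos = None
    if idx_eos:
        token_ids = token_ids[:idx_eos]

    print(token_ids)  # [5, 17, 42] -- the first EOS and everything after it is dropped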

torchtune/modules/tokenizers/_utils.py

Lines changed: 1 addition & 12 deletions
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-from typing import Iterator, List, Protocol, Set, Union
+from typing import Iterator, List, Protocol, Set

 from torchtune.data._types import Message

@@ -37,17 +37,6 @@ def tokenize_messages(self, token_ids: List[Message], **kwargs):
         pass


-def truncate(
-    tokens: List[int],
-    max_seq_len: int,
-    eos_id: Union[int, bool],
-):
-    tokens_truncated = tokens[:max_seq_len]
-    if tokens_truncated[-1] != eos_id:
-        tokens_truncated[-1] = eos_id
-    return tokens_truncated
-
-
 def _split_long_repetitions(s: str, max_consecutive_slice_len: int) -> Iterator[str]:
     """
     Split the string `s` so that each substring contains no more than `max_consecutive_slice_len`
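
For reference, the deleted truncate helper simply clipped the sequence to max_seq_len and forced the final kept token to be the EOS id. A standalone copy with a worked example, using illustrative values:

    from typing import List

    def truncate(tokens: List[int], max_seq_len: int, eos_id: int) -> List[int]:
        # Clip to max_seq_len, then make sure the last kept token is the EOS id.
        tokens_truncated = tokens[:max_seq_len]
        if tokens_truncated[-1] != eos_id:
            tokens_truncated[-1] = eos_id
        return tokens_truncated

    print(truncate([11, 12, 13, 14, 15], max_seq_len=3, eos_id=2))  # [11, 12, 2]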

torchtune/modules/transformer.py

Lines changed: 2 additions & 7 deletions
@@ -62,11 +62,6 @@ def forward(
             Tensor: output tensor with same shape as input
                 [batch_size x seq_length x embed_dim]

-        Notation used for tensor shapes:
-            - b: batch size
-            - s: sequence length
-            - d: embed dim
-
         TODO:
             - Make position of norm configurable
         """

@@ -75,13 +70,13 @@ def forward(
         # Norm applied before self-attention
         attn_out = self.attn(self.sa_norm(x), mask=mask, input_pos=input_pos)

-        # Residual connection; shape: [b, s, d]
+        # Residual connection; shape: [batch_size, seq_length, embed_dim]
         h = attn_out + x

         # Norm applied before the feedforward layer
         mlp_out = self.mlp(self.mlp_norm(h))

-        # Residual connection; shape: [b, s, d]
+        # Residual connection; shape: [batch_size, seq_length, embed_dim]
         out = h + mlp_out
         return out
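
The comments describe a standard pre-norm decoder block: normalize, attend, add the residual, then normalize, feed forward, add the residual again. A compact standalone sketch; LayerNorm stands in for torchtune's RMSNorm and the attention/MLP modules are placeholders, not TransformerDecoderLayer itself:

    import torch
    from torch import nn, Tensor

    class PreNormBlockSketch(nn.Module):
        def __init__(self, attn: nn.Module, mlp: nn.Module, embed_dim: int):
            super().__init__()
            self.attn, self.mlp = attn, mlp
            self.sa_norm = nn.LayerNorm(embed_dim)
            self.mlp_norm = nn.LayerNorm(embed_dim)

        def forward(self, x: Tensor) -> Tensor:
            # Norm before self-attention, then residual connection;
            # shape: [batch_size, seq_length, embed_dim]
            h = self.attn(self.sa_norm(x)) + x
            # Norm before the feedforward layer, then residual connection;
            # shape: [batch_size, seq_length, embed_dim]
            return self.mlp(self.mlp_norm(h)) + h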
