
Commit c9b9c4e
Only add clip_grad_norm to ones that support it
1 parent: 377c60d
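
For context: clip_grad_norm caps the global L2 norm of the gradients before each optimizer step, and the null default leaves clipping disabled, so existing behavior is unchanged. A minimal sketch of how a recipe can honor the key (hypothetical code, not torchtune's actual recipe; train_step and the batch keys are illustrative):

import torch

def train_step(model, batch, loss_fn, optimizer, clip_grad_norm=None):
    # `clip_grad_norm=None` mirrors the YAML default `clip_grad_norm: null`.
    optimizer.zero_grad()
    loss = loss_fn(model(batch["tokens"]), batch["labels"])
    loss.backward()
    if clip_grad_norm is not None:
        # Cap the global gradient norm only when the config enables it.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=float(clip_grad_norm))
    optimizer.step()
    return loss.detach()

Setting the key to a number (e.g. clip_grad_norm: 1.0) turns clipping on without touching the recipe code.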

File tree: 115 files changed, +115 −0 lines (some diffs are hidden by default; only a subset appears below)


recipes/configs/code_llama2/7B_full_low_memory.yaml

Lines changed: 1 addition & 0 deletions
@@ -64,6 +64,7 @@ optimizer:
 optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
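
A note on the default: YAML null loads as Python None, so the new key is inert until a user overrides it. A quick check, assuming the config is loaded with OmegaConf (the library torchtune's YAML configs target):

from omegaconf import OmegaConf

cfg = OmegaConf.create("clip_grad_norm: null")
assert cfg.clip_grad_norm is None  # null arrives as None: clipping off

# A dotlist override, the same key=value style used on the command line,
# enables clipping without editing the file.
cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist(["clip_grad_norm=1.0"]))
assert cfg.clip_grad_norm == 1.0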

recipes/configs/code_llama2/7B_lora_single_device.yaml

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/code_llama2/7B_qlora_single_device.yaml

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_full.yaml

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
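
This config can accept clipping because it steps the optimizer after backward (optimizer_in_bwd: False). When the optimizer step is fused into the backward pass instead, each parameter's gradient is applied and freed before a global norm could ever be computed, so the two options conflict. A hypothetical guard a recipe might use (validate_clipping is illustrative, not torchtune's API):

def validate_clipping(clip_grad_norm, optimizer_in_bwd):
    # Global-norm clipping needs every grad alive at the same time;
    # stepping the optimizer inside backward frees grads one by one.
    if clip_grad_norm is not None and optimizer_in_bwd:
        raise ValueError("clip_grad_norm cannot be used with optimizer_in_bwd=True")

Note that the first diff above adds the key (as null) even to a config with optimizer_in_bwd: True; under a guard like this that is harmless, since null never triggers clipping.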

recipes/configs/gemma/2B_lora.yaml

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_lora_single_device.yaml

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_qlora_single_device.yaml

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_full.yaml

Lines changed: 1 addition & 0 deletions
@@ -59,6 +59,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma/7B_lora.yaml

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_lora_single_device.yaml

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
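
Several of these configs pair the new key with gradient_accumulation_steps greater than 1. In that case clipping belongs once per optimizer step, after the last micro-batch's backward, rather than after every backward. A sketch under those assumptions (batches, accum_steps, and the other names are illustrative, not torchtune's recipe code):

import torch

for step, batch in enumerate(batches):
    loss = loss_fn(model(batch["tokens"]), batch["labels"]) / accum_steps
    loss.backward()  # gradients accumulate across micro-batches
    if (step + 1) % accum_steps == 0:
        if clip_grad_norm is not None:
            # Clip the accumulated gradients exactly once per optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
        optimizer.step()
        optimizer.zero_grad()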

0 commit comments
