
Commit e030626

QLoRA with bias + Llama 3.2 Vision QLoRA configs (#1726)
1 parent bc486d4 commit e030626

File tree

14 files changed, +429 -175 lines changed

14 files changed

+429
-175
lines changed

recipes/configs/llama3_2_vision/11B_lora.yaml

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ enable_activation_offloading: False
 dtype: bf16
 
 # Logging
-output_dir: /tmp/full-llama3.2-vision-finetune
+output_dir: /tmp/lora-llama3.2-vision-finetune
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs

recipes/configs/llama3_2_vision/11B_lora_single_device.yaml

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ enable_activation_offloading: False
 dtype: bf16
 
 # Logging
-output_dir: /tmp/full-llama3.2-vision-finetune
+output_dir: /tmp/lora-llama3.2-vision-finetune
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
recipes/configs/llama3_2_vision/11B_qlora.yaml

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
# Config for multi-device QLoRA finetuning in lora_finetune_distributed.py
# using a Llama3.2 11B Vision Instruct model
#
# This config assumes that you've run the following command before launching:
#   tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct
#
# To launch on 2 devices, run the following command from root:
#   tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_2_vision/11B_qlora
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training:
#   tune run --nproc_per_node 2 lora_finetune_distributed --config llama3_2_vision/11B_qlora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device QLoRA finetuning please use 11B_qlora_single_device.yaml

# Model arguments
model:
  _component_: torchtune.models.llama3_2_vision.qlora_llama3_2_vision_11b
  decoder_trainable: "frozen"
  encoder_trainable: "lora"
  fusion_trainable: "lora"
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16
  lora_dropout: 0.0
  image_size: 560 # Make sure this matches the image_size in tokenizer

# Transform
tokenizer:
  _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
  path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model
  image_size: 560
  max_seq_len: 8192

# Checkpointer
checkpointer:
  _component_: torchtune.training.FullModelMetaCheckpointer
  checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/original/
  checkpoint_files: [consolidated.pth]
  recipe_checkpoint: null
  output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
  model_type: LLAMA3_VISION
resume_from_checkpoint: False

# Dataset
dataset:
  _component_: torchtune.datasets.multimodal.the_cauldron_dataset
  subset: ocrvqa
seed: null
shuffle: True
collate_fn: torchtune.data.padded_collate_tiled_images_and_mask

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 4
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  weight_decay: 0.01
  lr: 2e-5
lr_scheduler:
  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
  num_warmup_steps: 100
loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
clip_grad_norm: 1.0
compile: False # set it to True for better memory and performance

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_offloading: False
dtype: bf16

# Logging
output_dir: /tmp/qlora-llama3.2-vision-finetune
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
log_every_n_steps: 1
log_peak_memory_stats: False
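For orientation, the model section above maps one-to-one onto the QLoRA builder referenced by _component_. A minimal Python sketch of the equivalent direct call, assuming the builder accepts exactly the keyword arguments the config passes through:

from torchtune.models.llama3_2_vision import qlora_llama3_2_vision_11b

# LoRA adapters on the vision encoder and fusion layers, decoder fully frozen,
# with the frozen base weights quantized to NF4 (the "Q" in QLoRA).
model = qlora_llama3_2_vision_11b(
    decoder_trainable="frozen",
    encoder_trainable="lora",
    fusion_trainable="lora",
    lora_attn_modules=["q_proj", "v_proj"],
    apply_lora_to_mlp=False,
    apply_lora_to_output=False,
    lora_rank=8,
    lora_alpha=16,
    lora_dropout=0.0,
    image_size=560,  # must match tokenizer.image_size
)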
recipes/configs/llama3_2_vision/11B_qlora_single_device.yaml

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
# Config for single device QLoRA finetuning in lora_finetune_single_device.py
# using a Llama3.2 11B Vision Instruct model
#
# This config assumes that you've run the following command before launching:
#   tune download meta-llama/Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Llama-3.2-11B-Vision-Instruct
#
# To launch on a single device, run the following command from root:
#   tune run lora_finetune_single_device --config llama3_2_vision/11B_qlora_single_device
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training:
#   tune run lora_finetune_single_device --config llama3_2_vision/11B_qlora_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on single device.

# Model arguments
model:
  _component_: torchtune.models.llama3_2_vision.qlora_llama3_2_vision_11b
  decoder_trainable: "frozen"
  encoder_trainable: "lora"
  fusion_trainable: "lora"
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16
  lora_dropout: 0.0
  image_size: 560 # Make sure this matches the image_size in tokenizer

# Transform
tokenizer:
  _component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
  path: /tmp/Llama-3.2-11B-Vision-Instruct/original/tokenizer.model
  image_size: 560
  max_seq_len: 8192

# Checkpointer
checkpointer:
  _component_: torchtune.training.FullModelMetaCheckpointer
  checkpoint_dir: /tmp/Llama-3.2-11B-Vision-Instruct/original/
  checkpoint_files: [consolidated.pth]
  recipe_checkpoint: null
  output_dir: /tmp/Llama-3.2-11B-Vision-Instruct/
  model_type: LLAMA3_VISION
resume_from_checkpoint: False

# Dataset
dataset:
  _component_: torchtune.datasets.multimodal.the_cauldron_dataset
  subset: ocrvqa
seed: null
shuffle: True
collate_fn: torchtune.data.padded_collate_tiled_images_and_mask

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 16
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  weight_decay: 0.01
  lr: 2e-5
optimizer_in_bwd: False
lr_scheduler:
  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
  num_warmup_steps: 100
loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
clip_grad_norm: 1.0
compile: False # set it to True for better memory and performance

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_offloading: False
dtype: bf16

# Logging
output_dir: /tmp/qlora-llama3.2-vision-finetune
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
log_every_n_steps: 1
log_peak_memory_stats: False

# Profiler (disabled)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # trace options passed to `torch.profiler.profile`
  profile_memory: True
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 1
  warmup_steps: 2
  active_steps: 1
  num_cycles: 1
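The profiler block is disabled by default; when enabled, its schedule keys map onto torch.profiler exactly as the comment above notes (wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat). A rough sketch of the equivalent direct torch.profiler setup, using the values from this config; this illustrates the mapping and is not the recipe's actual setup code:

import torch
from torch.profiler import ProfilerActivity, profile, schedule

# schedule: wait_steps=1, warmup_steps=2, active_steps=1, num_cycles=1
prof_schedule = schedule(wait=1, warmup=2, active=1, repeat=1)

profiler = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # cpu: True, cuda: True
    schedule=prof_schedule,
    profile_memory=True,
    with_stack=False,
    record_shapes=True,
    with_flops=False,
)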

tests/torchtune/modules/low_precision/test_nf4_linear.py

Lines changed: 3 additions & 6 deletions
@@ -40,10 +40,6 @@ class TestNF4Linear:
     Class for testing our NF4Linear implementation.
     """
 
-    def test_bias_unsupported(self):
-        with pytest.raises(RuntimeError, match="does not currently support biases"):
-            _ = FrozenNF4Linear(1, 1, bias=True)
-
     @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
     def test_parameters(self, dtype):
         nf4_linear = FrozenNF4Linear(512, 512, device="cpu", dtype=dtype)
@@ -59,9 +55,10 @@ def test_state_dict(self, dtype):
         assert isinstance(state_dict["weight"], NF4Tensor)
 
     @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
-    def test_output_dtype(self, dtype):
+    @pytest.mark.parametrize("bias", [True, False])
+    def test_output_dtype(self, dtype, bias):
         # Test to ensure W4 A16 produces A16 / W4A32 produces A32
-        nf4_linear = FrozenNF4Linear(512, 512, device="cpu", dtype=dtype)
+        nf4_linear = FrozenNF4Linear(512, 512, device="cpu", dtype=dtype, bias=bias)
         inp = torch.randn(2, 512, dtype=dtype, requires_grad=True)
         out = nf4_linear(inp)
         assert out.dtype == dtype
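The deleted test_bias_unsupported and the new bias parametrization reflect the main library change in this commit: FrozenNF4Linear now accepts a bias. A small usage sketch, with the import path assumed from the test file's location under tests/torchtune/modules/low_precision/:

import torch
from torchtune.modules.low_precision import FrozenNF4Linear  # import path assumed from the test layout

# The weight is stored as a frozen NF4Tensor; the optional bias stays in the requested dtype.
layer = FrozenNF4Linear(512, 512, device="cpu", dtype=torch.bfloat16, bias=True)

x = torch.randn(2, 512, dtype=torch.bfloat16)
out = layer(x)
assert out.dtype == torch.bfloat16  # W4 weights, A16 activations in, A16 activations out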

tests/torchtune/modules/peft/test_dora.py

Lines changed: 52 additions & 55 deletions
@@ -49,80 +49,77 @@ def inputs(self, in_dim) -> torch.Tensor:
         return inputs
 
     @pytest.fixture
-    def dora_linear(self, in_dim, out_dim) -> DoRALinear:
-        dora_linear = DoRALinear(
-            in_dim=in_dim,
-            out_dim=out_dim,
-            rank=RANK,
-            alpha=ALPHA,
-            use_bias=False,
-        )
+    def dora_linear(self, in_dim, out_dim):
+        def create_dora_linear(use_bias, dtype, in_dim=in_dim, out_dim=out_dim):
+            with training.set_default_dtype(dtype):
+                dora_linear = DoRALinear(
+                    in_dim=in_dim,
+                    out_dim=out_dim,
+                    rank=RANK,
+                    alpha=ALPHA,
+                    use_bias=use_bias,
+                )
 
-        fixed_init_model(dora_linear)
-        return dora_linear
+            fixed_init_model(dora_linear)
+            return dora_linear
+
+        return create_dora_linear
 
     @pytest.fixture
-    def qdora_linear(self, in_dim, out_dim) -> DoRALinear:
-        with training.set_default_dtype(torch.bfloat16):
-            qdora_linear = DoRALinear(
-                in_dim=512,
-                out_dim=512,
-                rank=RANK,
-                alpha=ALPHA,
-                use_bias=False,
-                quantize_base=True,
-            )
-            fixed_init_model(qdora_linear, dtype=torch.bfloat16)
+    def qdora_linear(self):
+        def create_qdora_linear(
+            use_bias=False, dtype=torch.bfloat16, in_dim=512, out_dim=512
+        ):
+            with training.set_default_dtype(dtype):
+                qdora_linear = DoRALinear(
+                    in_dim=in_dim,
+                    out_dim=out_dim,
+                    rank=RANK,
+                    alpha=ALPHA,
+                    use_bias=use_bias,
+                    quantize_base=True,
+                )
+            fixed_init_model(qdora_linear)
             return qdora_linear
 
+        return create_qdora_linear
+
     def test_forward(self, inputs, dora_linear, out_dim) -> None:
+        dora_linear = dora_linear(use_bias=False, dtype=torch.float32)
         expected = torch.tensor(EXPECTED_VAL)
         actual = dora_linear(inputs)
         assert actual.shape == (BSZ, SEQ_LEN, out_dim)
         torch.testing.assert_close(actual.mean(), expected, atol=1e-4, rtol=1e-6)
 
-    def test_dora_weight_nf4_when_quantized(self, qdora_linear):
+    @pytest.mark.parametrize("use_bias", [True, False])
+    def test_dora_weight_nf4_when_quantized(self, use_bias, qdora_linear):
+        qdora_linear = qdora_linear(use_bias=use_bias, dtype=torch.bfloat16)
         assert isinstance(qdora_linear.weight, NF4Tensor)
-
-    def test_bias_raises(self):
-        with pytest.raises(
-            NotImplementedError, match="DoRALinear does not support using bias"
-        ):
-            DoRALinear(
-                in_dim=512,
-                out_dim=512,
-                rank=RANK,
-                alpha=ALPHA,
-                use_bias=True,
-                quantize_base=False,
-            )
-
-    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32])
-    def test_qdora_parity(self, dtype):
+        if use_bias:
+            assert not isinstance(qdora_linear.bias, NF4Tensor)
+            assert qdora_linear.bias.dtype == torch.bfloat16
+
+    # Note: with bfloat16 F.linear(x, weight, bias) != F.linear(x, weight) + bias.
+    # This means we would get different results (irrespective of QDoRA).
+    # So we leave that test case out
+    @pytest.mark.parametrize(
+        "use_bias, dtype",
+        [(False, torch.bfloat16), (True, torch.float32), (False, torch.float32)],
+    )
+    def test_qdora_parity(self, use_bias, dtype, dora_linear, qdora_linear):
         with training.set_default_dtype(dtype):
-            torch.manual_seed(0)
-            qdora_linear = DoRALinear(
-                in_dim=512,
-                out_dim=512,
-                rank=RANK,
-                alpha=ALPHA,
-                use_bias=False,
-                quantize_base=True,
+            qdora_linear = qdora_linear(
+                use_bias=use_bias, dtype=dtype, in_dim=512, out_dim=512
             )
-            torch.manual_seed(0)
-            dora_linear = DoRALinear(
-                in_dim=512,
-                out_dim=512,
-                rank=RANK,
-                alpha=ALPHA,
-                use_bias=False,
-                quantize_base=False,
+            dora_linear = dora_linear(
+                use_bias=use_bias, dtype=dtype, in_dim=512, out_dim=512
             )
 
         # set weight of dora_linear to unquantized weight of qdora_linear and check
         # parity.
         dora_linear.weight.data = qdora_linear.weight.to(dtype)
-
+        if use_bias:
+            dora_linear.bias.data = qdora_linear.bias.detach().clone()
         qdora_linear.initialize_dora_magnitude()
         dora_linear.initialize_dora_magnitude()
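The note in the new parametrization explains why the (use_bias=True, bfloat16) combination is left out: under bf16, adding the bias inside F.linear rounds differently than adding it as a separate op, so DoRA and QDoRA outputs would diverge for reasons unrelated to quantization. A standalone illustration of that rounding effect, independent of DoRA:

import torch
import torch.nn.functional as F

torch.manual_seed(0)
x = torch.randn(2, 512, dtype=torch.bfloat16)
w = torch.randn(512, 512, dtype=torch.bfloat16)
b = torch.randn(512, dtype=torch.bfloat16)

fused = F.linear(x, w, b)     # bias accumulated inside the fused kernel
unfused = F.linear(x, w) + b  # bias added afterwards as a separate bf16 op

# The two generally differ by a small rounding error in bf16; in float32 the
# discrepancy is typically negligible, which is why float32 keeps the bias case.
print((fused - unfused).abs().max())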
