
Commit 5c7246e

Add support for Qwen2-0.5B and Qwen2-1.5B. (#1247)
1 parent 9fd5d01 commit 5c7246e

17 files changed: +968 -34 lines
recipes/configs/qwen2/0.5B_full.yaml (new file)
Lines changed: 75 additions & 0 deletions
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Qwen2 0.5B model
#
# This config assumes that you've run the following command before launching
# this run:
#  tune download Qwen/Qwen2-0.5B-Instruct --output-dir /tmp/Qwen2-0.5B-Instruct --ignore-patterns ""
#
# To launch on 4 devices, run the following command from root:
#  tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen2/0.5B_full
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
#  tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen2/0.5B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# Single device full finetuning requires more memory optimizations. It's
# best to use 0.5B_full_single_device.yaml for those cases

# Tokenizer
tokenizer:
  _component_: torchtune.models.qwen2.qwen2_tokenizer
  path: /tmp/Qwen2-0.5B-Instruct/vocab.json
  merges_file: /tmp/Qwen2-0.5B-Instruct/merges.txt

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

# Model Arguments
model:
  _component_: torchtune.models.qwen2.qwen2_0_5b

checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen2-0.5B-Instruct
  checkpoint_files: [
    model.safetensors
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Qwen2-0.5B-Instruct-finetune
  model_type: QWEN2
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  lr: 5e-6
loss:
  _component_: torch.nn.CrossEntropyLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 16


# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
memory_efficient_fsdp_wrap: False

# Reduced precision
dtype: bf16

# Logging
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
output_dir: /tmp/Qwen2-0.5B-Instruct-finetune
log_every_n_steps: 1
log_peak_memory_stats: False
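
A quick way to see how the `${output_dir}` interpolation in `metric_logger.log_dir` resolves is to load this config with OmegaConf, the library torchtune configs build on. A minimal sketch, assuming the file lands at the path below in a torchtune checkout:

from omegaconf import OmegaConf

# Load the YAML above; the path is an assumption about the checkout layout.
cfg = OmegaConf.load("recipes/configs/qwen2/0.5B_full.yaml")

print(cfg.model["_component_"])  # torchtune.models.qwen2.qwen2_0_5b
# ${output_dir} resolves against the top-level output_dir key:
print(OmegaConf.to_container(cfg.metric_logger, resolve=True)["log_dir"])
# -> /tmp/Qwen2-0.5B-Instruct-finetune
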
recipes/configs/qwen2/0.5B_full_single_device.yaml (new file)
Lines changed: 77 additions & 0 deletions
# Config for single device full finetuning in full_finetune_single_device.py
# using a Qwen2 0.5B model
#
# This config assumes that you've run the following command before launching
# this run:
#  tune download Qwen/Qwen2-0.5B-Instruct --output-dir /tmp/Qwen2-0.5B-Instruct --ignore-patterns ""
#
# The default config uses an optimizer from bitsandbytes. If you do not have it installed,
# you can install it with
#  pip install bitsandbytes
#
# To launch on a single device, run the following command from root:
#  tune run full_finetune_single_device --config qwen2/0.5B_full_single_device
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
#  tune run full_finetune_single_device --config qwen2/0.5B_full_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on a single device.

# Tokenizer
tokenizer:
  _component_: torchtune.models.qwen2.qwen2_tokenizer
  path: /tmp/Qwen2-0.5B-Instruct/vocab.json
  merges_file: /tmp/Qwen2-0.5B-Instruct/merges.txt

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True

# Model Arguments
model:
  _component_: torchtune.models.qwen2.qwen2_0_5b

checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen2-0.5B-Instruct
  checkpoint_files: [
    model.safetensors
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Qwen2-0.5B-Instruct-finetune
  model_type: QWEN2
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 1
optimizer:
  _component_: bitsandbytes.optim.PagedAdamW
  lr: 5e-6
optimizer_in_bwd: True
loss:
  _component_: torch.nn.CrossEntropyLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 16
compile: False

# Training environment
device: cuda

# Memory management
enable_activation_checkpointing: True

# Reduced precision
dtype: bf16

# Logging
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
output_dir: /tmp/Qwen2-0.5B-Instruct-finetune
log_every_n_steps: 1
log_peak_memory_stats: False
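
The single-device variant swaps in a paged optimizer from bitsandbytes and enables `optimizer_in_bwd` to cut optimizer-state memory. A minimal sketch of what the optimizer block amounts to in plain PyTorch, assuming bitsandbytes and a CUDA device are available (a toy linear layer stands in for the real Qwen2 model):

import torch
import bitsandbytes as bnb

# Toy stand-in; the recipe itself builds torchtune.models.qwen2.qwen2_0_5b.
model = torch.nn.Linear(1024, 1024).cuda()

# Same component and learning rate as the config above.
optimizer = bnb.optim.PagedAdamW(model.parameters(), lr=5e-6)

# If bitsandbytes is not installed, torch.optim.AdamW is a drop-in substitute:
# optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)

If you prefer not to edit the file, the same swap can usually be made at launch time with a command-line override of `optimizer._component_`.
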
recipes/configs/qwen2/0.5B_lora.yaml (new file)
Lines changed: 108 additions & 0 deletions
# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
# using a Qwen2 0.5B model
#
# This config assumes that you've run the following command before launching
# this run:
#  tune download Qwen/Qwen2-0.5B-Instruct --output-dir /tmp/Qwen2-0.5B-Instruct --ignore-patterns ""
#
# To launch on 2 devices, run the following command from root:
#  tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2/0.5B_lora
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
#  tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2/0.5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device LoRA finetuning please use 0.5B_lora_single_device.yaml
# or 0.5B_qlora_single_device.yaml


# Model Arguments
model:
  _component_: torchtune.models.qwen2.lora_qwen2_0_5b
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16

tokenizer:
  _component_: torchtune.models.qwen2.qwen2_tokenizer
  path: /tmp/Qwen2-0.5B-Instruct/vocab.json
  merges_file: /tmp/Qwen2-0.5B-Instruct/merges.txt

checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen2-0.5B-Instruct
  checkpoint_files: [
    model.safetensors
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune
  model_type: QWEN2
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True
batch_size: 2

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torch.nn.CrossEntropyLoss

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 32

# Logging
output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: False

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: False

# Showcase the usage of the PyTorch profiler
# Set enabled to False as it's only needed for debugging training
profiler:
  _component_: torchtune.utils.setup_torch_profiler

  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # Trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 5
  active_steps: 2
  num_cycles: 1
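
The profiler block above maps one-to-one onto the standard `torch.profiler` API, per the comment on the `torch.profiler.schedule` options. A minimal plain-PyTorch sketch of the equivalent setup, shown only to make the key mapping concrete (torchtune's `setup_torch_profiler` does this wiring from the YAML keys):

from torch.profiler import ProfilerActivity, profile, schedule

# Equivalent of the cpu/cuda activities, trace options, and schedule keys above.
prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],  # cpu: True, cuda: True
    schedule=schedule(wait=5, warmup=5, active=2, repeat=1),    # wait_steps, warmup_steps, active_steps, num_cycles
    profile_memory=False,
    with_stack=False,
    record_shapes=True,
    with_flops=False,
)

with prof:
    for step in range(20):
        # one training iteration would go here
        prof.step()
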
recipes/configs/qwen2/0.5B_lora_single_device.yaml (new file)
Lines changed: 106 additions & 0 deletions
# Config for single device LoRA finetuning in lora_finetune_single_device.py
# using a Qwen2 0.5B model
#
# This config assumes that you've run the following command before launching
# this run:
#  tune download Qwen/Qwen2-0.5B-Instruct --output-dir /tmp/Qwen2-0.5B-Instruct --ignore-patterns ""
#
# To launch on a single device, run the following command from root:
#  tune run lora_finetune_single_device --config qwen2/0.5B_lora_single_device
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
#  tune run lora_finetune_single_device --config qwen2/0.5B_lora_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on a single device.


# Model Arguments
model:
  _component_: torchtune.models.qwen2.lora_qwen2_0_5b
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16

tokenizer:
  _component_: torchtune.models.qwen2.qwen2_tokenizer
  path: /tmp/Qwen2-0.5B-Instruct/vocab.json
  merges_file: /tmp/Qwen2-0.5B-Instruct/merges.txt

checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen2-0.5B-Instruct
  checkpoint_files: [
    model.safetensors
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune
  model_type: QWEN2
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
seed: null
shuffle: True
batch_size: 2

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torch.nn.CrossEntropyLoss

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 64
compile: False

# Logging
output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: False

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: True

# Showcase the usage of the PyTorch profiler
# Set enabled to False as it's only needed for debugging training
profiler:
  _component_: torchtune.utils.setup_torch_profiler
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # Trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 5
  active_steps: 2
  num_cycles: 1
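
The model block above is just keyword arguments for the LoRA builder it names, so the same model can be constructed directly in Python. A minimal sketch, assuming a torchtune install that includes this commit:

from torchtune.models.qwen2 import lora_qwen2_0_5b

# Same LoRA hyperparameters as the config: rank-8 adapters on the attention
# q/v projections only, with no LoRA applied to the MLP or output layers.
model = lora_qwen2_0_5b(
    lora_attn_modules=["q_proj", "v_proj"],
    apply_lora_to_mlp=False,
    apply_lora_to_output=False,
    lora_rank=8,
    lora_alpha=16,
)

print(sum(p.numel() for p in model.parameters()))  # total parameter count, base + adapters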
