Commit 825aa92

Profiler v2 (meta-pytorch#1089)

jeromeku authored and RdoubleA committed
Co-authored-by: RdoubleA <[email protected]>
1 parent: 20c25da

22 files changed: 1,116 additions and 77 deletions

docs/source/api_ref_utilities.rst
Lines changed: 1 addition & 1 deletion

@@ -87,7 +87,7 @@ of your finetuning job.
 
     get_memory_stats
     log_memory_stats
-    profiler
+    setup_torch_profiler
 
 .. _metric_logging_label:
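The docs index change simply tracks the rename from torchtune.utils.profiler to torchtune.utils.setup_torch_profiler. In the recipe configs below, the profiler section is a component config, so a recipe would presumably build it through torchtune's config instantiation. A hedged sketch, assuming config.instantiate resolves `_component_` and forwards the remaining YAML keys as keyword arguments; what setup_torch_profiler returns is not shown in this diff:

    from omegaconf import OmegaConf
    from torchtune import config

    # Load one of the recipe configs changed in this commit; cfg.profiler is
    # the YAML section shown in the hunks below.
    cfg = OmegaConf.load("recipes/configs/llama2/7B_lora_single_device.yaml")

    # Assumption: instantiate() calls setup_torch_profiler with the section's
    # remaining keys (enabled, cpu, cuda, wait_steps, ...) as kwargs.
    profiler = config.instantiate(cfg.profiler)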

recipes/configs/code_llama2/7B_lora_single_device.yaml
Lines changed: 21 additions & 2 deletions

@@ -83,6 +83,25 @@ log_every_n_steps: 1
 log_peak_memory_stats: False
 
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: ${output_dir}/torchtune_perf_tracing.json
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
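The new keys map nearly one-to-one onto the stock PyTorch profiler. A minimal sketch of the profiler this config presumably describes, written against the public torch.profiler API rather than torchtune's actual setup_torch_profiler implementation; the output_dir value below is a stand-in for the config's ${output_dir}/profiling_outputs:

    from torch.profiler import (
        ProfilerActivity,
        profile,
        schedule,
        tensorboard_trace_handler,
    )

    # cpu: True / cuda: True -> ProfilerActivity types to trace
    activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]

    # Stand-in for the config's ${output_dir}/profiling_outputs
    output_dir = "profiling_outputs"

    profiler = profile(
        activities=activities,
        # Schedule mapping per the config comments:
        # wait_steps -> wait, warmup_steps -> warmup,
        # active_steps -> active, num_cycles -> repeat
        schedule=schedule(wait=5, warmup=5, active=2, repeat=1),
        # Trace options passed straight through from the config
        profile_memory=False,
        with_stack=False,
        record_shapes=True,
        with_flops=False,
        # Write each completed trace into output_dir
        on_trace_ready=tensorboard_trace_handler(output_dir),
    )

The same block, with the same defaults, is added to each of the remaining configs below.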

recipes/configs/code_llama2/7B_qlora_single_device.yaml
Lines changed: 21 additions & 2 deletions

@@ -86,6 +86,25 @@ log_peak_memory_stats: False
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: ${output_dir}/torchtune_perf_tracing.json
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1

recipes/configs/gemma/2B_lora_single_device.yaml
Lines changed: 21 additions & 2 deletions

@@ -84,6 +84,25 @@ log_peak_memory_stats: False
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: /tmp/alpaca-gemma-finetune/torchtune_perf_tracing.json
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1

recipes/configs/gemma/2B_qlora_single_device.yaml
Lines changed: 21 additions & 2 deletions

@@ -84,6 +84,25 @@ log_peak_memory_stats: False
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: /tmp/alpaca-gemma-finetune/torchtune_perf_tracing.json
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1

recipes/configs/gemma/7B_lora_single_device.yaml
Lines changed: 21 additions & 2 deletions

@@ -86,6 +86,25 @@ log_peak_memory_stats: False
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: /tmp/alpaca-gemma-finetune/torchtune_perf_tracing.json
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1

recipes/configs/gemma/7B_qlora_single_device.yaml
Lines changed: 21 additions & 2 deletions

@@ -86,6 +86,25 @@ log_peak_memory_stats: False
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: /tmp/alpaca-gemma-finetune/torchtune_perf_tracing.json
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1

recipes/configs/llama2/13B_qlora_single_device.yaml
Lines changed: 21 additions & 2 deletions

@@ -83,6 +83,25 @@ enable_activation_checkpointing: True
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: ${output_dir}/torchtune_perf_tracing.json
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1

recipes/configs/llama2/7B_lora.yaml
Lines changed: 27 additions & 0 deletions

@@ -80,3 +80,30 @@ log_peak_memory_stats: False
 device: cuda
 dtype: bf16
 enable_activation_checkpointing: False
+
+# Show case the usage of pytorch profiler
+# Set enabled to False as it's only needed for debugging training
+profiler:
+  _component_: torchtune.utils.setup_torch_profiler
+
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
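With the default values, one schedule cycle is wait (steps 0-4), then warmup (steps 5-9), then active tracing (steps 10-11); num_cycles: 1 means the cycle is not repeated. The schedule only advances if the profiler is stepped once per training iteration, roughly as in the sketch below, which reuses the `profiler` object from the earlier torch.profiler example; `train_step` is a hypothetical stand-in for the recipe's real iteration:

    def train_step():
        pass  # hypothetical stand-in for forward/backward/optimizer work

    # Stepping the profiler once per iteration drives the wait/warmup/active
    # schedule; with wait=5, warmup=5, active=2, repeat=1, only steps 10-11
    # are actually traced.
    with profiler:
        for step in range(12):
            train_step()
            profiler.step()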

recipes/configs/llama2/7B_lora_single_device.yaml
Lines changed: 21 additions & 2 deletions

@@ -83,6 +83,25 @@ enable_activation_checkpointing: True
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
 profiler:
-  _component_: torchtune.utils.profiler
+  _component_: torchtune.utils.setup_torch_profiler
   enabled: False
-  output_dir: ${output_dir}/torchtune_perf_tracing.json
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 5
+  active_steps: 2
+  num_cycles: 1
