4 changes: 2 additions & 2 deletions recipes/configs/llama3_2/8B_to_1B_KD_lora_distributed.yaml
@@ -23,8 +23,8 @@ model:
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
apply_lora_to_output: False
-lora_rank: 64
-lora_alpha: 128
+lora_rank: 64 # higher increases accuracy and memory
+lora_alpha: 128 # usually alpha=2*rank
lora_dropout: 0.0

teacher_model:
4 changes: 2 additions & 2 deletions recipes/configs/llama3_2/8B_to_1B_KD_lora_single_device.yaml
@@ -23,8 +23,8 @@ model:
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
apply_lora_to_output: False
-lora_rank: 64
-lora_alpha: 128
+lora_rank: 64 # higher increases accuracy and memory
+lora_alpha: 128 # usually alpha=2*rank
lora_dropout: 0.0

teacher_model:
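The same rank/alpha comments were added to both the distributed and single-device KD configs. As an illustration of the convention they describe (the values below are hypothetical and not part of this PR), halving the rank while keeping alpha = 2 * rank trades some accuracy for a smaller, cheaper adapter:

```yaml
# Hypothetical lower-memory variant of the student adapter settings
# (only the adapter fields are shown; everything else in the config is unchanged)
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
apply_lora_to_output: False
lora_rank: 32    # lower rank -> fewer trainable parameters and less memory, possibly lower accuracy
lora_alpha: 64   # keeps the usual alpha = 2 * rank ratio
lora_dropout: 0.0
```

The same values can also be passed to `tune run` as command-line overrides (e.g. `model.lora_rank=32 model.lora_alpha=64`) instead of editing the YAML.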
11 changes: 6 additions & 5 deletions recipes/configs/llama3_3/70B_full.yaml
@@ -16,6 +16,8 @@
# This config is only tested on an 8xA100 machine.
#

+output_dir: /tmp/torchtune/llama3_3_70B/full # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
@@ -69,7 +71,7 @@ checkpointer:
model-00030-of-00030.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/Llama-3.3-70B-Instruct/
+output_dir: ${output_dir}
model_type: LLAMA3
resume_from_checkpoint: False

@@ -87,7 +89,7 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
+gradient_accumulation_steps: 1 # Use to increase effective batch size


# Training env
@@ -98,7 +100,7 @@ enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory
custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
fsdp_cpu_offload: True
-compile: False # pytorch compile, set to true for better perf/memory
+compile: False # torch.compile the model + loss, True increases speed + decreases memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Reduced precision
@@ -107,8 +109,7 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
-output_dir: /tmp/full-llama3_3-finetune
+log_dir: ${output_dir}/logs
Contributor: I think this is not consistent with the other configs? Maybe update all of the rest of the configs?

Contributor (Author): The others were updated yesterday :)

log_every_n_steps: 1
log_peak_memory_stats: True

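The pattern in this config (and in the LoRA/QLoRA configs below) is a single top-level `output_dir` that the checkpointer and metric logger interpolate, so every artifact of a run lands under one directory. A minimal sketch of how the new values resolve, assuming the default `/tmp/torchtune/llama3_3_70B/full`:

```yaml
output_dir: /tmp/torchtune/llama3_3_70B/full   # single knob for all run artifacts

checkpointer:
  output_dir: ${output_dir}                    # checkpoints -> /tmp/torchtune/llama3_3_70B/full

metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}/logs                  # logs -> /tmp/torchtune/llama3_3_70B/full/logs
```

`${...}` is OmegaConf-style interpolation, so overriding `output_dir` once (for example on the command line) moves checkpoints and logs together.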
11 changes: 6 additions & 5 deletions recipes/configs/llama3_3/70B_lora.yaml
@@ -8,6 +8,8 @@
# This config needs 8 GPUs to run
# tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_3/70B_lora

+output_dir: /tmp/torchtune/llama3_3_70B/lora # /tmp may be deleted by your system. Change it to your preference.

# Model Arguments
model:
_component_: torchtune.models.llama3_3.lora_llama3_3_70b
@@ -59,7 +61,7 @@ checkpointer:
model-00030-of-00030.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/Llama-3.3-70B-Instruct/
+output_dir: ${output_dir}
model_type: LLAMA3
resume_from_checkpoint: False
save_adapter_weights_only: True # Set to false to save the whole model + adapter merged
@@ -88,14 +90,13 @@ loss:
# Training
epochs: 1
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Logging
-output_dir: /tmp/lora-llama3_3-finetune-output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

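The reworded `gradient_accumulation_steps` comment refers to the effective batch size, i.e. the product of the per-device batch size, the accumulation steps, and the number of data-parallel devices. A worked example under assumed values (the `batch_size` below is illustrative, not taken from this config):

```yaml
# effective batch size = batch_size * gradient_accumulation_steps * num_devices
batch_size: 2
gradient_accumulation_steps: 8   # with the 8-GPU launch above: 2 * 8 * 8 = 128 sequences per optimizer step
```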
11 changes: 6 additions & 5 deletions recipes/configs/llama3_3/70B_qlora.yaml
@@ -8,6 +8,8 @@
# This config needs 8 GPUs to run
# tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_3/70B_lora

+output_dir: /tmp/torchtune/llama3_3_70B/qlora # /tmp may be deleted by your system. Change it to your preference.

# Model Arguments
model:
_component_: torchtune.models.llama3_3.qlora_llama3_3_70b
@@ -59,7 +61,7 @@ checkpointer:
model-00030-of-00030.safetensors,
]
recipe_checkpoint: null
-output_dir: /tmp/Llama-3.3-70B-Instruct/
+output_dir: ${output_dir}
model_type: LLAMA3
resume_from_checkpoint: False
save_adapter_weights_only: True # Set to false to save the whole model + adapter merged
@@ -88,14 +90,13 @@ loss:
# Training
epochs: 1
max_steps_per_epoch: null
-gradient_accumulation_steps: 1 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+gradient_accumulation_steps: 1 # Use to increase effective batch size
+compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Logging
-output_dir: /tmp/lora-llama3_3-finetune-output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
-log_dir: ${output_dir}
+log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

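The `compile` comment is updated the same way in all three llama3_3 configs. As a usage sketch (based on general `torch.compile` behavior rather than anything stated in this PR), opting in looks like this, with the first iterations slower while compilation warms up:

```yaml
compile: True   # torch.compile the model + loss; per the comment, increases speed and decreases memory
                # assumption: expect slower initial steps while graphs are compiled (general torch.compile behavior)
```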
8 changes: 4 additions & 4 deletions recipes/configs/qwen2/1.5_to_0.5B_KD_lora_distributed.yaml
@@ -20,10 +20,10 @@ output_dir: /tmp/torchtune/qwen2_1_5_to_0_5B/KD_lora_distributed # /tmp may be d
# Model Arguments
model:
_component_: torchtune.models.qwen2.lora_qwen2_0_5b
-lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
-apply_lora_to_mlp: False
-lora_rank: 32
-lora_alpha: 64
+lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+apply_lora_to_mlp: True
+lora_rank: 32 # higher increases accuracy and memory
+lora_alpha: 64 # usually alpha=2*rank

teacher_model:
_component_: torchtune.models.qwen2.qwen2_1_5b
8 changes: 4 additions & 4 deletions recipes/configs/qwen2/1.5_to_0.5B_KD_lora_single_device.yaml
@@ -20,10 +20,10 @@ output_dir: /tmp/torchtune/qwen2_1_5_to_0_5B/KD_lora_single_device # /tmp may be
# Model Arguments
model:
_component_: torchtune.models.qwen2.lora_qwen2_0_5b
-lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
-apply_lora_to_mlp: False
-lora_rank: 32
-lora_alpha: 64
+lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+apply_lora_to_mlp: True
+lora_rank: 32 # higher increases accuracy and memory
+lora_alpha: 64 # usually alpha=2*rank

teacher_model:
_component_: torchtune.models.qwen2.qwen2_1_5b
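Beyond the rank/alpha comments, the qwen2 KD configs change which modules the student adapter covers: `output_proj` replaces `k_proj` in the attention list, and LoRA is now also applied to the MLP. A consolidated sketch of the student adapter after this change, with the component path taken from the config:

```yaml
model:
  _component_: torchtune.models.qwen2.lora_qwen2_0_5b
  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']  # output_proj adapted instead of k_proj
  apply_lora_to_mlp: True    # LoRA adapters are now added to the MLP projections as well
  lora_rank: 32              # higher increases accuracy and memory
  lora_alpha: 64             # usually alpha = 2 * rank
```

This brings the qwen2 student settings in line with the llama3_2 KD configs above, which already use the same module list and `apply_lora_to_mlp: True`.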