Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
67aaf52
feat(runtimes): Add lora recipe and config in torch.plugin.
Electronic-Waste Sep 16, 2025
7e8a2a4
chore: add --trainer-use-lora flag and related logics.
Electronic-Waste Sep 16, 2025
d052be8
feat(lora): Add support for QLoRA.
Electronic-Waste Sep 17, 2025
c2e1b57
fix(lora): fix extra rdzv backend in single device mode.
Electronic-Waste Sep 17, 2025
140ca1b
test(lora): Add UTs for lora/qlora in trainingruntime.
Electronic-Waste Sep 17, 2025
5b36509
fix(lora): remove extra quote symbol in lora attn module.
Electronic-Waste Sep 17, 2025
5dfdcfa
fix(lint): fix lint error.
Electronic-Waste Sep 17, 2025
354cee6
fix(runtime): check default GPU resources allocated in CTRs.
Electronic-Waste Oct 2, 2025
5e94189
chore(runtime): add getNumProcPerNode.
Electronic-Waste Oct 2, 2025
392361c
fix(runtime): remove specific vendor for GPU.
Electronic-Waste Oct 2, 2025
4975271
test(runtime): add UTs for lora validation.
Electronic-Waste Oct 2, 2025
448efa9
fix(runtimes): remove extra flags for torchtune.
Electronic-Waste Oct 2, 2025
8853b6d
Update pkg/runtime/framework/plugins/torch/torch.go
Electronic-Waste Oct 3, 2025
97ef1d6
chore(runtime): add extractGPUCountFromRuntime.
Electronic-Waste Oct 3, 2025
4cc601a
fix(runtimes): remove filtering args logics.
Electronic-Waste Oct 3, 2025
65f41a7
fix(runtimes): use fallbackNumProcPerNode if GPU is set in runtimes.
Electronic-Waste Oct 3, 2025
a7b7456
refactor(runtimes): refactor getNumProcPerNode logics.
Electronic-Waste Oct 3, 2025
0557d01
fix(lint): fix lint error.
Electronic-Waste Oct 3, 2025
4f1303e
fix(runtimes): call getNumProcPerNode only once.
Electronic-Waste Oct 3, 2025
b63ac47
test(runtime): fix UTs for torch.
Electronic-Waste Oct 3, 2025
70c9f1e
fix(runtimes): add limits to calculation.
Electronic-Waste Oct 3, 2025
48d9883
refactor(runtimes): reorg nppNode related logics.
Electronic-Waste Oct 5, 2025
7a08859
fix(runtimes): fix lints.
Electronic-Waste Oct 5, 2025
4dc016d
fix(runtimes): update condition that use nppNode in Limits.
Electronic-Waste Oct 5, 2025
5557f9b
refactor(runtimes): add torchtune.go
Electronic-Waste Oct 8, 2025
e46e639
fix(lint): fix lint error.
Electronic-Waste Oct 8, 2025
6f98243
refactor(torch): only reset nppNode when using CPU.
Electronic-Waste Oct 11, 2025
b52e671
chore(runtimes): remove getNumProcPerNodeFromCPU.
Electronic-Waste Oct 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/proposals/2401-llm-trainer-v2/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -452,8 +452,8 @@ class LoraConfig:
lora_attn_modules: Optional[List[str]] = None
lora_rank: Optional[int] = None
lora_alpha: Optional[int] = None
lora_dropout: optional[float] = None
quantize_base: optional[bool] = None
lora_dropout: Optional[float] = None
quantize_base: Optional[bool] = None
use_dora: Optional[bool] = None

```
Expand Down
33 changes: 30 additions & 3 deletions pkg/constants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,33 @@ const (
// TorchTuneFullFinetuneMultiNodesConfigSuffix is the config suffix for the multi node distributed full finetune.
TorchTuneFullFinetuneMultiNodesConfigSuffix string = "_full_multinode"

// TorchTuneLoRAFinetuneSingleDevice is the recipe for the single device LoRA finetune.
TorchTuneLoRAFinetuneSingleDevice string = "lora_finetune_single_device"

// TorchTuneLoRAFinetuneSingleDeviceConfigSuffix is the config suffix for the single device LoRA finetune.
TorchTuneLoRAFinetuneSingleDeviceConfigSuffix string = "_lora_single_device"

// TorchTuneQLoRAFinetuneSingleDeviceConfigSuffix is the config suffix for the single device QLoRA finetune.
TorchTuneQLoRAFinetuneSingleDeviceConfigSuffix string = "_qlora_single_device"

// TorchTuneLoRAFinetuneDistributed is the recipe for the distributed LoRA finetune.
TorchTuneLoRAFinetuneDistributed string = "lora_finetune_distributed"

// TorchTuneLoRAFinetuneDistributedConfigSuffix is the config suffix for the distributed LoRA finetune.
TorchTuneLoRAFinetuneDistributedConfigSuffix string = "_lora"

// TorchTuneQLoRAFinetuneDistributedConfigSuffix is the config suffix for the distributed QLoRA finetune.
TorchTuneQLoRAFinetuneDistributedConfigSuffix string = "_qlora"

// TorchTuneLoraAttnModules is the config item name for the LoRA attention modules.
TorchTuneLoraAttnModules string = "model.lora_attn_modules"

// TorchTuneQuantizeBase is the config item name for the quantization base.
TorchTuneQuantizeBase string = "model.quantize_base"

// TorchTuneUseDora is the config item name for using DoRA.
TorchTuneUseDora string = "model.use_dora"

// TorchTuneModelOutputDir is the config item name for the model output directory.
TorchTuneModelOutputDir string = "output_dir"

Expand All @@ -163,8 +190,8 @@ const (
// TORCHTUNE_MODEL_LLAMA3_2_1B is the model name for the Llama3.2 1B Instruct model.
TORCHTUNE_MODEL_LLAMA3_2_1B = "llama3_2/1B"

// TORCHTUNE_MODEL_LLAMA3_2_7B is the model name for the Llama3.2 7B Instruct model.
TORCHTUNE_MODEL_LLAMA3_2_7B = "llama3_2/7B"
// TORCHTUNE_MODEL_LLAMA3_2_3B is the model name for the Llama3.2 3B Instruct model.
TORCHTUNE_MODEL_LLAMA3_2_3B = "llama3_2/3B"

// TORCHTUNE_MODEL_LLAMA3_3_70B is the model name for the Llama3.3 70B Instruct model.
TORCHTUNE_MODEL_LLAMA3_3_70B = "llama3_3/70B"
Expand All @@ -184,7 +211,7 @@ var (
ResourceInUseFinalizer = fmt.Sprintf("%s/resource-in-use", trainer.GroupVersion.Group)

// TorchTuneSupportedPretrainedModels supported pretrained models for TorchTune Trainer.
TorchTuneSupportedPretrainedModels = sets.New(TORCHTUNE_MODEL_LLAMA3_2_1B, TORCHTUNE_MODEL_LLAMA3_2_7B, TORCHTUNE_MODEL_LLAMA3_3_70B, TORCHTUNE_MODEL_QWEN2_5_1_5B)
TorchTuneSupportedPretrainedModels = sets.New(TORCHTUNE_MODEL_LLAMA3_2_1B, TORCHTUNE_MODEL_LLAMA3_2_3B, TORCHTUNE_MODEL_LLAMA3_3_70B, TORCHTUNE_MODEL_QWEN2_5_1_5B)

// TorchTuneEntrypoint is the entrypoint for the torchtune.
TorchTuneEntrypoint = []string{"tune", "run"}
Expand Down
Loading
Loading