diff --git a/torchtitan/models/llama3/__init__.py b/torchtitan/models/llama3/__init__.py
index 7cfc03b4a..c63bb07c2 100644
--- a/torchtitan/models/llama3/__init__.py
+++ b/torchtitan/models/llama3/__init__.py
@@ -39,6 +39,24 @@
         use_flex_attn=True,
         attn_mask_type="block_causal",
     ),
+    "1B": TransformerModelArgs(
+        dim=2048,
+        n_layers=16,
+        n_heads=32,
+        n_kv_heads=8,
+        ffn_dim_multiplier=1.5,
+        multiple_of=1024,
+        rope_theta=500000,
+    ),
+    "3B": TransformerModelArgs(
+        dim=3072,
+        n_layers=28,
+        n_heads=24,
+        n_kv_heads=8,
+        ffn_dim_multiplier=1.0,
+        multiple_of=1024,
+        rope_theta=500000,
+    ),
     "8B": TransformerModelArgs(
         dim=4096,
         n_layers=32,
diff --git a/torchtitan/models/llama3/train_configs/llama3_1b.toml b/torchtitan/models/llama3/train_configs/llama3_1b.toml
new file mode 100644
index 000000000..116c57762
--- /dev/null
+++ b/torchtitan/models/llama3/train_configs/llama3_1b.toml
@@ -0,0 +1,62 @@
+# torchtitan Config.toml
+# NOTE: this toml config is a preset for 64 A100 GPUs.
+
+[job]
+dump_folder = "./outputs"
+description = "Llama 3 1B training"
+
+[profiling]
+enable_profiling = true
+save_traces_folder = "profile_trace"
+profile_freq = 100
+
+[metrics]
+log_freq = 10
+enable_tensorboard = true
+save_tb_folder = "tb"
+
+[model]
+name = "llama3"
+flavor = "1B"
+tokenizer_path = "./assets/tokenizer/original/tokenizer.model"
+# converters = ["float8"]
+
+[optimizer]
+name = "AdamW"
+lr = 3e-4
+eps = 1e-8
+
+[lr_scheduler]
+warmup_steps = 200 # lr scheduler warm up
+
+[training]
+local_batch_size = 1
+seq_len = 8192
+max_norm = 1.0 # grad norm clipping
+steps = 1000
+compile = false
+dataset = "c4"
+
+[parallelism]
+data_parallel_replicate_degree = 1
+data_parallel_shard_degree = -1
+tensor_parallel_degree = 1
+pipeline_parallel_degree = 1
+context_parallel_degree = 1
+
+[checkpoint]
+enable_checkpoint = false
+folder = "checkpoint"
+interval = 500
+last_save_model_weights_only = false
+export_dtype = "float32"
+async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
+
+[activation_checkpoint]
+mode = "selective" # ["none", "selective", "full"]
+selective_ac_option = "op" # "int" = ac every positive int layer or 'op', ac based on ops policy
+
+[float8]
+enable_fsdp_float8_all_gather = false
+precompute_float8_dynamic_scale_for_fsdp = false
+filter_fqns = ["output"]
diff --git a/torchtitan/models/llama3/train_configs/llama3_3b.toml b/torchtitan/models/llama3/train_configs/llama3_3b.toml
new file mode 100644
index 000000000..3aff16471
--- /dev/null
+++ b/torchtitan/models/llama3/train_configs/llama3_3b.toml
@@ -0,0 +1,62 @@
+# torchtitan Config.toml
+# NOTE: this toml config is a preset for 64 A100 GPUs.
+
+[job]
+dump_folder = "./outputs"
+description = "Llama 3 3B training"
+
+[profiling]
+enable_profiling = true
+save_traces_folder = "profile_trace"
+profile_freq = 100
+
+[metrics]
+log_freq = 10
+enable_tensorboard = true
+save_tb_folder = "tb"
+
+[model]
+name = "llama3"
+flavor = "3B"
+tokenizer_path = "./assets/tokenizer/original/tokenizer.model"
+# converters = ["float8"]
+
+[optimizer]
+name = "AdamW"
+lr = 3e-4
+eps = 1e-8
+
+[lr_scheduler]
+warmup_steps = 200 # lr scheduler warm up
+
+[training]
+local_batch_size = 1
+seq_len = 8192
+max_norm = 1.0 # grad norm clipping
+steps = 1000
+compile = false
+dataset = "c4"
+
+[parallelism]
+data_parallel_replicate_degree = 1
+data_parallel_shard_degree = -1
+tensor_parallel_degree = 1
+pipeline_parallel_degree = 1
+context_parallel_degree = 1
+
+[checkpoint]
+enable_checkpoint = false
+folder = "checkpoint"
+interval = 500
+last_save_model_weights_only = false
+export_dtype = "float32"
+async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
+
+[activation_checkpoint]
+mode = "selective" # ["none", "selective", "full"]
+selective_ac_option = "op" # "int" = ac every positive int layer or 'op', ac based on ops policy
+
+[float8]
+enable_fsdp_float8_all_gather = false
+precompute_float8_dynamic_scale_for_fsdp = false
+filter_fqns = ["output"]
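
For context on the new flavor arguments: `ffn_dim_multiplier` and `multiple_of` determine the FFN hidden size. Below is a minimal sketch of that sizing rule, assuming torchtitan's FeedForward follows the reference Llama computation (4·dim, scaled by 2/3 and the multiplier, then rounded up to a multiple of `multiple_of`); the helper name is illustrative and not part of the patch.

```python
# Sketch of the Llama-style FFN sizing rule (assumption: torchtitan's
# FeedForward uses the same computation as the reference Llama code).
def ffn_hidden_dim(dim: int, ffn_dim_multiplier: float, multiple_of: int) -> int:
    hidden = 4 * dim
    hidden = int(2 * hidden / 3)
    hidden = int(ffn_dim_multiplier * hidden)
    # Round up to the nearest multiple of `multiple_of`.
    return multiple_of * ((hidden + multiple_of - 1) // multiple_of)

print(ffn_hidden_dim(2048, 1.5, 1024))  # 8192, matching the Llama 3.2 1B FFN size
print(ffn_hidden_dim(3072, 1.0, 1024))  # 8192, matching the Llama 3.2 3B FFN size
```

Either new config can then be launched through the usual torchtitan entry point, e.g. `CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_1b.toml" ./run_train.sh`.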