#!/bin/bash
dir=$(pwd)
###############################################################################
### Main configs
## The original GPT-3 models use a 2K sequence length/context window; this
## long-context example uses 256K.
seq_len=262144 # needs to be divisible by sp size * sp size * num chunks = 4 * 4 * 32 = 512

## The "GPT-3 XXX" below are configs from the GPT-3 paper
## https://arxiv.org/abs/2005.14165; choose based on
## your desired model size or build your own configs

## init_std is the standard deviation for weight initialization. Usually a
## larger model needs a lower std. We use the heuristic sqrt(1/(3*hidden_size))
## from the MT-NLG 530B work (https://arxiv.org/pdf/2201.11990.pdf)

## We changed min_lr to a lower value (1.0e-6), which we found provides better
## zero-shot eval results.

## GPT-3 Small 125M
# model_size=0.125
# num_layers=12
# hidden_size=768
# num_attn_heads=12
# global_batch_size=256
# lr=6.0e-4
# min_lr=1.0e-6
# init_std=0.02

## GPT-3 Medium 350M
# model_size=0.35
# num_layers=24
# hidden_size=1024
# num_attn_heads=16
# global_batch_size=256
# lr=3.0e-4
# min_lr=1.0e-6
# init_std=0.018

## GPT-3 Large 760M
# model_size=0.76
# num_layers=24
# hidden_size=1536
# num_attn_heads=16
# global_batch_size=256
# lr=2.5e-4
# min_lr=1.0e-6
# init_std=0.015

## GPT-3 XL 1.3B
# model_size=1.3
# num_layers=24
# hidden_size=2048
# num_attn_heads=16
# global_batch_size=32
# lr=2.0e-4
# min_lr=1.0e-6
# init_std=0.013

## GPT-3 2.7B
# model_size=2.7
# num_layers=32
# hidden_size=2560
# num_attn_heads=32
# global_batch_size=512
# lr=1.6e-4
# min_lr=1.0e-6
# init_std=0.011

## GPT-3 6.7B
model_size=6.7
num_layers=32
hidden_size=4096
num_attn_heads=32
global_batch_size=1024
lr=1.2e-4
min_lr=1.0e-6
init_std=0.009
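## Rough sanity check on the 6.7B choice (a reader aid, not from the original
## script): using the common 12 * num_layers * hidden_size^2 estimate,
## 12 * 32 * 4096^2 ~= 6.44B parameters, plus roughly 0.2B for the ~50K-token
## GPT-2 vocab embedding, which lands near the nominal 6.7B. The init_std
## heuristic mentioned above also checks out: sqrt(1/(3*4096)) ~= 0.009.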

## GPT-3 13B
# model_size=13
# num_layers=40
# hidden_size=5120
# num_attn_heads=40
# global_batch_size=1024
# lr=1.0e-4
# min_lr=1.0e-6
# init_std=0.008

## GPT-3 30B
# model_size=30
# num_layers=64
# hidden_size=6144
# num_attn_heads=64
# global_batch_size=2
# lr=1.0e-4
# min_lr=1.0e-6
# init_std=0.008

## GPT-3 175B
# model_size=175
# num_layers=96
# hidden_size=12288
# num_attn_heads=96
# global_batch_size=1536
# lr=0.6e-4
# min_lr=1.0e-6
# init_std=0.005
###############################################################################
### Training duration configs
## The main termination condition; the original GPT-3 paper trains for 300B tokens.
train_tokens_in_billion=300
train_tokens=$((${train_tokens_in_billion} * 1000000000))

## train_samples is another termination condition and also affects the number
## of data samples to be indexed. Since we want to reach the train_tokens above,
## and data efficiency techniques may change the number of tokens in some
## samples, we set this config large enough to make sure we have enough
## processed data and don't terminate on train_samples.
train_samples=$(( ${train_tokens} * 2 / ${seq_len} ))
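## With the defaults above this evaluates to 300e9 * 2 / 262144 ~= 2.29M
## samples, i.e. roughly twice the ~1.14M samples actually needed for 300B
## tokens at this sequence length (arithmetic shown only as a reader aid).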

## Another termination condition based on wall-clock time, in minutes. Set it
## large enough to avoid undesired early termination.
exit_duration=30000000
###############################################################################
### lr configs
## lr warmup and decay duration.
## Original GPT-3 paper uses 375M warmup tokens and 260B cosine decay tokens.
## Here we increase the warmup tokens to 3B: when batch size warmup is not
## used there are more tokens per step, so more warmup tokens are needed to
## keep enough warmup steps, which is important for training stability.
lr_warmup_tokens_in_million=3000
lr_warmup_tokens=$((${lr_warmup_tokens_in_million} * 1000000))
## Here we changed the LR decay tokens to align with total train tokens, since
## related works (e.g., https://arxiv.org/abs/2203.15556) find that setting the
## learning rate schedule to match the number of training tokens results in the
## best final model quality.
lr_decay_tokens_in_billion=${train_tokens_in_billion}
lr_decay_tokens=$((${lr_decay_tokens_in_billion} * 1000000000))
lr_decay_style="cosine"
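## For reference (not part of the original script): after warmup, the standard
## Megatron-style cosine schedule selected here decays roughly as
##   lr(t) = min_lr + 0.5 * (lr - min_lr) * (1 + cos(pi * t / lr_decay_tokens))
## where t is the number of tokens seen after warmup, bottoming out at min_lr
## once lr_decay_tokens (= train_tokens here) have been consumed.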
###############################################################################
### Parallelism configs
## Model parallelism, 1 is no MP
## Currently we only support MP=1 with SP>1
mp_size=1

## Sequence parallelism, 1 is no SP
sp_size=4
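## Optional sanity check (our addition, left commented out): verify that
## seq_len satisfies the divisibility rule noted next to seq_len at the top.
## The factor of 32 chunks is taken from that comment and is an assumption;
## adjust it if your chunking differs.
# required_divisor=$(( sp_size * sp_size * 32 ))
# if (( seq_len % required_divisor != 0 )); then
#     echo "seq_len=${seq_len} is not divisible by ${required_divisor}" >&2
#     exit 1
# fi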

## Pipeline parallelism. To disable PP, set pp_size to 1 and no_pp to true.
## Note that currently both curriculum learning and random-LTD are NOT
## compatible with pipeline parallelism.
pp_size=1
no_pp="true"

## ZeRO-based data parallelism, stage=0 will disable ZeRO
zero_stage=3

## Total number of GPUs. ds_ssh is from DeepSpeed library.
num_gpus=$(($(ds_ssh nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)-2))
num_gpus_pernode=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
num_node=$(( ${num_gpus} / ${num_gpus_pernode} ))

## Data parallel size.
dp_size=$(( ${num_gpus} / ${pp_size} / ${mp_size} / ${sp_size} ))

## Micro batch size per GPU
## Make sure that batch_size <= global_batch_size*pp_size*mp_size*sp_size/num_gpus
## (i.e., batch_size <= global_batch_size/dp_size). Reduce it manually if GPU OOM
# batch_size=$(( ${global_batch_size} / ${dp_size} ))
batch_size=2
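## Illustrative arithmetic (the 16-GPU figure is only an example; num_gpus is
## detected above): on 16 GPUs with pp=1, mp=1, sp=4, dp_size = 16/1/1/4 = 4,
## so micro batch_size=2 implies global_batch_size / (batch_size * dp_size)
## = 1024 / (2 * 4) = 128 gradient accumulation steps per global batch.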

###############################################################################
### Misc configs
log_interval=10
eval_iters=10
eval_interval=100
## num_save controls how frequently to save checkpoints. num_save=20 means that
## a checkpoint will be saved every 5% of training. For longer training you
## would want a larger num_save to save more frequently, and vice versa.
num_save=100
estimated_train_iter=$((${train_tokens} / ${seq_len} / ${global_batch_size}))
# save_interval=$((${estimated_train_iter} / ${num_save}))
save_interval=100
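## With the defaults above, estimated_train_iter = 300e9 / 262144 / 1024
## ~= 1117 iterations, so the fixed save_interval=100 yields roughly 11
## checkpoints over the run (the commented formula would instead give a save
## interval of ~11 iterations). Arithmetic shown only as a reader aid.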

## Activation checkpointing saves GPU memory, but reduces training speed
activation_checkpoint="true"
# activation_checkpoint="false"

## Whether or not to log optimizer states (norms, max abs values) to tensorboard.
## This is not required for training and might save GPU memory when turned off.
log_optimizer_state="false"
###############################################################################
### Output and data configs
current_time=$(date "+%Y.%m.%d_%H.%M.%S")
host="${HOSTNAME}"
seed=1234
num_workers=0

data_path="BookCorpusDataset_text_document"
if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
fi
if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
    wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
fi

vocab_path="gpt2-vocab.json"
if [ ! -f "$vocab_path" ]; then
    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
fi
merge_path="gpt2-merges.txt"
if [ ! -f "$merge_path" ]; then
    wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
fi

prescale_grad="true"
jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
jobname="${jobname}_gbs${global_batch_size}_mbs${batch_size}_g${num_gpus}"
if [[ $zero_stage -gt 0 ]]; then
    jobname="${jobname}_z${zero_stage}"
    prescale_grad="false"
fi
if [[ $sp_size -gt 1 ]]; then
    jobname="${jobname}_sp${sp_size}"
fi
if [[ $mp_size -gt 1 ]]; then
    jobname="${jobname}_mp${mp_size}"
fi
if [ "${no_pp}" = "false" ]; then
    jobname="${jobname}_pp${pp_size}"
fi
jobname="${jobname}_seed${seed}_rebase"

username=$(whoami)
output_home="output"
log_path="${output_home}/log/"
checkpoint_path="${output_home}/checkpoint/${jobname}"
tensorboard_dir="${output_home}/tensorboard/"
tensorboard_path="${tensorboard_dir}${jobname}_${host}_${current_time}"
mkdir -p ${log_path}
mkdir -p ${checkpoint_path}
mkdir -p ${tensorboard_path}
###############################################################################
data_options=" \
    --vocab-file ${vocab_path} \
    --merge-file ${merge_path} \
    --data-path ${data_path} \
    --data-impl mmap"

## If curriculum learning (CL) is used, make sure to set "--split" to the same
## value you used during offline data analysis & indexing.
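## FPDT note (added for clarity, an interpretation rather than official
## documentation): with seq_len=262144 and --ds-sequence-parallel-fpdt-chunk-size
## 65536 below, each sequence is processed as 262144 / 65536 = 4 chunks, and the
## offloading flag (as its name suggests) moves inactive chunk activations to
## host memory. Consult the FPDT documentation for the exact semantics.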
megatron_options=" \
    --override-opt_param-scheduler \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --tensor-model-parallel-size 1 \
    --ds-sequence-parallel-fpdt \
    --ds-sequence-parallel-fpdt-chunk-size 65536 \
    --ds-sequence-parallel-fpdt-offloading \
    --ds-sequence-parallel-size ${sp_size} \
    --init-method-std ${init_std} \
    --lr-decay-tokens ${lr_decay_tokens} \
    --lr-warmup-tokens ${lr_warmup_tokens} \
    --micro-batch-size ${batch_size} \
    --exit-duration-in-mins ${exit_duration} \
    --global-batch-size ${global_batch_size} \
    --num-layers ${num_layers} \
    --hidden-size ${hidden_size} \
    --num-attention-heads ${num_attn_heads} \
    --seq-length ${seq_len} \
    --max-position-embeddings ${seq_len} \
    --train-tokens ${train_tokens} \
    --train-samples ${train_samples} \
    --lr ${lr} \
    --min-lr ${min_lr} \
    --lr-decay-style ${lr_decay_style} \
    --split 949,50,1 \
    --log-interval ${log_interval} \
    --eval-interval ${eval_interval} \
    --eval-iters ${eval_iters} \
    --save-interval ${save_interval} \
    --weight-decay 0.1 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --clip-grad 1.0 \
    --hysteresis 2 \
    --num-workers ${num_workers} \
    --fp16 \
    --seed ${seed} \
    --load ${checkpoint_path} \
    --save ${checkpoint_path} \
    --no-async-tensor-model-parallel-allreduce \
    --use-flash-attn-v2 \
    --tensorboard-queue-size 1 \
    --use-rotary-position-embeddings \
    --rotary-percent 0.25 \
    --rotary-position-embeddings-theta 100000000 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    --tensorboard-dir ${tensorboard_path}"

if [ "${activation_checkpoint}" = "true" ]; then
megatron_options="${megatron_options} \
    --checkpoint-activations"
fi

if [ "${log_optimizer_state}" = "true" ]; then
megatron_options="${megatron_options} \
    --log-optimizer-states-to-tensorboard"
fi

config_json="ds_config_gbs${global_batch_size}_mbs${batch_size}_log${log_interval}_zero${zero_stage}.json"
template_json="ds_config_gpt_TEMPLATE.json"
sed "s/GBSIZE/${global_batch_size}/" ${template_json} \
    | sed "s/MBSIZE/${batch_size}/" \
    | sed "s/LOG_INTERVAL/${log_interval}/" \
    | sed "s/ZERO_STAGE/${zero_stage}/" \
    | sed "s/PRESCALE_GRAD/${prescale_grad}/" \
    > ${config_json}
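## Note (our reading of the sed chain above; the template itself is not shown
## here): ds_config_gpt_TEMPLATE.json is expected to contain the literal
## placeholders GBSIZE, MBSIZE, LOG_INTERVAL, ZERO_STAGE and PRESCALE_GRAD,
## which are substituted with the values computed in this script to produce
## ${config_json}.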

deepspeed_options=" \
    --deepspeed \
    --deepspeed_config ${config_json} \
    --zero-stage ${zero_stage} \
    --pipeline-model-parallel-size ${pp_size}"

if [[ "${no_pp}" = "true" ]]; then
deepspeed_options="${deepspeed_options} \
    --no-pipeline-parallel"
fi

if [ "${activation_checkpoint}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
    --deepspeed-activation-checkpointing \
    --checkpoint-in-cpu"
fi

## When saving checkpoints to storage with a cache, there can be consistency
## issues with the pointer to the latest checkpoint. Here we find the correct
## pointer and broadcast it to all nodes.
iteration_file="$checkpoint_path/latest_checkpointed_iteration.txt"
iteration_file_2="$checkpoint_path/latest"
iteration=0
for (( node = 0; node <= num_node-1; node++ ))
do
    if ssh -q worker-"$node" "test -f \"$iteration_file\""; then
        local_iteration=$(ssh -q worker-"$node" cat "$iteration_file")
        iteration=$(( ${local_iteration} > ${iteration} ? ${local_iteration} : ${iteration} ))
    fi
done
if [[ $iteration -gt 0 ]]; then
    iteration_2="global_step${iteration}"
    ds_ssh "echo $iteration > $iteration_file"
    ds_ssh "echo $iteration_2 > $iteration_file_2"
fi
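
## Launch note (our addition): the relative path below assumes this script is
## run from a directory two levels below the repo root that contains
## pretrain_gpt.py (dir is captured at the top of the script); adjust the path
## if you relocate the script.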

deepspeed ${dir}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} 2>&1 | tee ${log_path}/${jobname}_${host}_${current_time}.log