feat: public containerized training demo #92

Open · wants to merge 20 commits into main

43 changes: 43 additions & 0 deletions .github/workflows/megatron-demo.yml
@@ -0,0 +1,43 @@
name: Megatron Training Demo

on:
  workflow_dispatch:
    inputs:
      base-image:
        description: 'Base image to use'
        required: false
  push:
    branches:
      - "dmarx/megatron-demo"
      - "main"
    paths:
      - "megatron-demo/**"
      - ".github/workflows/megatron-demo.yml"
      - ".github/workflows/build.yml"

jobs:
  set-tag:
    name: Set Image Tag
    runs-on: ubuntu-latest
    outputs:
      image_tag: ${{ steps.set.outputs.tag }}
    steps:
      - id: set
        run: |
          SANITIZED_REF_NAME="${GITHUB_REF_NAME//\//-}"
          TAG="${SANITIZED_REF_NAME}-${GITHUB_SHA::7}"
          echo "tag=$TAG" >> "$GITHUB_OUTPUT"

  build:
    name: Build Container
    needs: set-tag
    uses: ./.github/workflows/build.yml
    secrets: inherit
    with:
      image-name: megatron-demo
      folder: megatron-demo
      build-args: |
        BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5fabc1e'}}
        IMAGE_TAG=${{ needs.set-tag.outputs.image_tag }}
      #tag-suffix: ${{ needs.set-tag.outputs.image_tag }}
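For manual runs, the `workflow_dispatch` trigger can be exercised from the GitHub CLI; a sketch, assuming `gh` is authenticated against this repository (the ref and image value below are illustrative):

gh workflow run megatron-demo.yml \
  --ref dmarx/megatron-demo \
  -f base-image=ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5fabc1e

Omitting `-f base-image=...` falls back to the pinned default above.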
2 changes: 1 addition & 1 deletion .github/workflows/megatron.yml
@@ -23,4 +23,4 @@ jobs:
      folder: megatron
      build-args: |
        BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:bfe03aa-nccl-cuda12.4.1-ubuntu22.04-nccl2.21.5-1-torch2.4.0-vision0.19.0-audio2.4.0'}}
        COMMIT=${{ inputs.commit || 'main'}}
        COMMIT=${{ inputs.commit || 'main'}}
31 changes: 31 additions & 0 deletions megatron-demo/Dockerfile
@@ -0,0 +1,31 @@
ARG BASE_IMAGE=ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5fabc1e
FROM ${BASE_IMAGE}

WORKDIR /usr/src/app/megatron-lm
COPY sbatch_demo.sh .
COPY srun_demo.sh .

### update sbatch to point to this container
ARG IMAGE_TAG
ENV IMAGE_TAG=${IMAGE_TAG}
RUN sed -i "s|megatron-demo:TAG|megatron-demo:${IMAGE_TAG}|g" /usr/src/app/megatron-lm/sbatch_demo.sh

### Install Git LFS
RUN apt-get update && \
    apt-get install -y git-lfs && \
    git lfs install

### Get tokenizer (with LFS)
RUN mkdir tokenizers && \
    cd tokenizers && \
    git clone https://huggingface.co/NovelAI/nerdstash-tokenizer-v2 && \
    cd nerdstash-tokenizer-v2 && \
    git lfs pull && \
    rm -rf .git

WORKDIR /usr/src/app/megatron-lm

### Get data
RUN mkdir -p /usr/src/app/megatron-lm/coreweave-datasets/smol/tokenized/nerdstash_v2-uint16/
RUN wget -qO- 'https://blobstore.object.ord1.coreweave.com/mldev/datasets/demo-dataset.tbz' \
    | tar -C /usr/src/app/megatron-lm/coreweave-datasets/smol/tokenized/nerdstash_v2-uint16/ -xjf -
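To iterate on the image without going through CI, a local build sketch (the `local-test` tag is illustrative; run from the repository root):

docker build megatron-demo/ \
  --build-arg BASE_IMAGE=ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5fabc1e \
  --build-arg IMAGE_TAG=local-test \
  -t megatron-demo:local-test

Note that the `sed` step above bakes `IMAGE_TAG` into `sbatch_demo.sh`, so the tag passed at build time needs to match the tag the image is ultimately pushed under.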
33 changes: 33 additions & 0 deletions megatron-demo/sbatch_demo.sh
@@ -0,0 +1,33 @@
#!/bin/bash

#SBATCH --partition h100
#SBATCH --nodes 1
#SBATCH --ntasks-per-node 8
#SBATCH --gpus-per-node 8
#SBATCH --constraint gpu
#SBATCH --job-name test
#SBATCH --output test.%j
#SBATCH --export all
#SBATCH --exclusive

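### Network settings (assumed fabric layout): route NCCL/UCX traffic over
### the node's ibp InfiniBand HCAs, one per GPU.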
export NCCL_SOCKET_IFNAME=eth0
export SHARP_COLL_ENABLE_PCI_RELAXED_ORDERING=1
export NCCL_COLLNET_ENABLE=0
export NCCL_IB_HCA=ibp
export UCX_NET_DEVICES=ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1

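### Rendezvous: derive a job-unique master port from the last four digits of
### the Slurm job ID, and use the first host in the node list as the master.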
export MASTER_PORT="$(expr 10000 + "$(echo -n "${SLURM_JOB_ID:?}" | tail -c 4)")"
export MASTER_ADDR="$(scontrol show hostnames "${SLURM_JOB_NODELIST:?}" | head -n 1)"

### Uncomment when running on CW H100s
#CPU_BIND='map_ldom:0,0,0,0,1,1,1,1'

CONTAINER_IMAGE="ghcr.io#coreweave/ml-containers/megatron-demo:TAG"

srun --container-image "${CONTAINER_IMAGE}" \
    --container-mounts /mnt/data:/mnt/data,/mnt/home:/mnt/home \
    --export=ALL \
    --mpi=pmix \
    --kill-on-bad-exit=1 \
    ${CPU_BIND:+"--cpu-bind=$CPU_BIND"} \
    bash -c ". /usr/src/app/megatron-lm/srun_demo.sh"
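Submission is the usual Slurm flow; a sketch, assuming a pyxis/enroot-enabled cluster and a writable /mnt/data (the OUTDIR override is optional and is consumed by srun_demo.sh):

sbatch sbatch_demo.sh
# or redirect outputs/checkpoints elsewhere:
OUTDIR=/mnt/data/my-demo-run sbatch sbatch_demo.sh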
104 changes: 104 additions & 0 deletions megatron-demo/srun_demo.sh
@@ -0,0 +1,104 @@
#!/bin/bash

export CUDA_DEVICE_MAX_CONNECTIONS=1

export WORLD_SIZE="${SLURM_NTASKS:?}"
export RANK="${SLURM_PROCID:?}"
export LOCAL_RANK="${SLURM_LOCALID:?}"
export CUDA_DEVICE_ORDER='PCI_BUS_ID'

OUTDIR="${OUTDIR:-/mnt/data/test}"
echo "Outputs will be saved to: $OUTDIR"
echo "Set the OUTDIR environment variable to override this location."

CKPTDIR_LOAD="${CKPTDIR_LOAD:-${OUTDIR}/checkpoints}"
CKPTDIR_SAVE="${CKPTDIR_SAVE:-${OUTDIR}/checkpoints}"

mkdir -p "${CKPTDIR_SAVE}"
touch "${CKPTDIR_SAVE}/progress.txt"


cd /usr/src/app/megatron-lm

WARNING_FILTERS=(
    '-Wignore::DeprecationWarning'
    '-Wignore::FutureWarning'
    '-Wignore::UserWarning:megatron.core.tensor_parallel.layers'  # "async_grad_allreduce is deprecated"
    '-Wignore::UserWarning:megatron.core.optimizer.distrib_optimizer'  # "pre_hook" method deprecations
)

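### Launch pretraining: a Llama-style decoder-only config (RMSNorm, SwiGLU,
### RoPE, GQA) with TP=4, PP=1, sequence parallelism, and the distributed
### optimizer.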
python3 "${WARNING_FILTERS[@]:?}" \
    "/usr/src/app/megatron-lm/pretrain_gpt.py" \
    --train-iters 1000000 \
    --lr 4e-05 \
    --lr-decay-iters 998000 \
    --lr-decay-style cosine \
    --min-lr 4e-06 \
    --lr-warmup-iters 2000 \
    --clip-grad 1.0 \
    --bf16 \
    --use-flash-attn \
    --rotary-seq-len-interpolation-factor 32 \
    --no-fp8-wgrad \
    --use-distributed-optimizer \
    --distributed-backend nccl \
    --data-cache-path cache \
    --split 949,50,1 \
    --seed 42 \
    --use-checkpoint-args \
    --no-masked-softmax-fusion \
    --attention-softmax-in-fp32 \
    --transformer-impl transformer_engine \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --rotary-base 500000 \
    --rotary-percent 1.0 \
    --use-rope-scaling \
    --micro-batch-size 1 \
    --tensor-model-parallel-size 4 \
    --pipeline-model-parallel-size 1 \
    --context-parallel-size 1 \
    --sequence-parallel \
    --overlap-grad-reduce \
    --overlap-param-gather \
    --log-interval 1 \
    --tensorboard-log-interval 1 \
    --save-interval 3500 \
    --eval-interval 100 \
    --eval-iters 10 \
    --logging-level 20 \
    --log-params-norm \
    --log-num-zeros-in-grad \
    --log-throughput \
    --log-progress \
    --timing-log-level 0 \
    --timing-log-option all \
    --log-timers-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    --log-memory-to-tensorboard \
    --log-world-size-to-tensorboard \
    --wandb-project test \
    --wandb-save-dir "${OUTDIR}/logs" \
    --tensorboard-dir "${OUTDIR}/tensorboard" \
    --ffn-hidden-size 11008 \
    --num-attention-heads 32 \
    --num-layers 32 \
    --hidden-size 4096 \
    --seq-length 8192 \
    --max-position-embeddings 8192 \
    --untie-embeddings-and-output-weights \
    --normalization RMSNorm \
    --swiglu \
    --position-embedding-type rope \
    --disable-bias-linear \
    --group-query-attention \
    --num-query-groups 8 \
    --tokenizer-type GPTSentencePieceTokenizer \
    --tokenizer-model /usr/src/app/megatron-lm/tokenizers/nerdstash-tokenizer-v2/tokenizer.model \
    --data-path \
        /usr/src/app/megatron-lm/coreweave-datasets/smol/tokenized/nerdstash_v2-uint16/chunk.0 \
        /usr/src/app/megatron-lm/coreweave-datasets/smol/tokenized/nerdstash_v2-uint16/chunk.0 \
    --wandb-exp-name "${SLURM_JOB_ID:?}/test" \
    --load "${CKPTDIR_LOAD}" \
    --save "${CKPTDIR_SAVE}" \
    --dataloader-type cyclic \
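The run is relocatable via the environment variables read at the top of this script; a resume sketch (directory names are illustrative):

# Load checkpoints from an earlier run, save new ones under a fresh OUTDIR:
OUTDIR=/mnt/data/demo-run-2 \
CKPTDIR_LOAD=/mnt/data/demo-run-1/checkpoints \
sbatch sbatch_demo.sh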