From b0964c9f1e2c0b066ab84b2952c60ed01d55d48d Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 13:34:06 -0700 Subject: [PATCH 01/20] feat(demo): skeleton Dockerfile --- megatron-demo/Dockerfile | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 megatron-demo/Dockerfile diff --git a/megatron-demo/Dockerfile b/megatron-demo/Dockerfile new file mode 100644 index 00000000..d82bb625 --- /dev/null +++ b/megatron-demo/Dockerfile @@ -0,0 +1,4 @@ +ARG BASE_IMAGE=ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5fabc1e +FROM FROM ${BASE_IMAGE} + +WORKDIR /usr/src/app/megatron-lm From 392a0c1c3b2ad7dea2bd2bf9193b892c095cfa66 Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 13:36:43 -0700 Subject: [PATCH 02/20] feat: sbatch_script.sh --- megatron-demo/sbatch_script.sh | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 megatron-demo/sbatch_script.sh diff --git a/megatron-demo/sbatch_script.sh b/megatron-demo/sbatch_script.sh new file mode 100644 index 00000000..d96d0911 --- /dev/null +++ b/megatron-demo/sbatch_script.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +#SBATCH --partition h100 +#SBATCH --nodes 1 +#SBATCH --ntasks-per-node 8 +#SBATCH --gpus-per-node 8 +#SBATCH --constraint gpu +#SBATCH --job-name test +#SBATCH --output test.%j +#SBATCH --export all +#SBATCH --exclusive + +export NCCL_SOCKET_IFNAME=eth0 +export SHARP_COLL_ENABLE_PCI_RELAXED_ORDERING=1 +export NCCL_COLLNET_ENABLE=0 +export NCCL_IB_HCA=ibp +export UCX_NET_DEVICES=ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 + +export MASTER_PORT="$(expr 10000 + "$(echo -n "${SLURM_JOB_ID:?}" | tail -c 4)")" +export MASTER_ADDR="$(scontrol show hostnames "${SLURM_JOB_NODELIST:?}" | head -n 1)" + + +CPU_BIND='map_ldom:0,0,0,0,1,1,1,1' + +CONTAINER_IMAGE="ghcr.io#coreweave/ml-containers/megatron-demo:TAG" + +srun --container-image "${CONTAINER_IMAGE}" \ + --container-mounts /mnt/data:/mnt/data,/mnt/home:/mnt/home \ + --export=ALL \ + --mpi=pmix \ + --kill-on-bad-exit=1 \ + ${CPU_BIND:+"--cpu-bind=$CPU_BIND"} \ + bash -c ". /usr/src/app/megatron-lm/srun_demo.sh" From bb8f82cea67b3383b930824407999a6a363e65c7 Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 13:49:57 -0700 Subject: [PATCH 03/20] Create srun_demo.sh --- megatron-demo/srun_demo.sh | 104 +++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 megatron-demo/srun_demo.sh diff --git a/megatron-demo/srun_demo.sh b/megatron-demo/srun_demo.sh new file mode 100644 index 00000000..7110ac2f --- /dev/null +++ b/megatron-demo/srun_demo.sh @@ -0,0 +1,104 @@ +#!/bin/bash + +export CUDA_DEVICE_MAX_CONNECTIONS=1 + +export WORLD_SIZE="${SLURM_NTASKS:?}" +export RANK="${SLURM_PROCID:?}" +export LOCAL_RANK="${SLURM_LOCALID:?}" +export CUDA_DEVICE_ORDER='PCI_BUS_ID' + +OUTDIR="${OUTDIR:-/mnt/data/test}" +echo "Outputs will be saved to: $OUTDIR" +echo "Set the OUTDIR environment variable to override this location." + +CKPTDIR_LOAD="${CKPTDIR_LOAD:-${OUTDIR}/checkpoints}" +CKPTDIR_SAVE="${CKPTDIR_SAVE:-${OUTDIR}/checkpoints}" + +mkdir -p "${CKPTDIR_SAVE}" +touch "${CKPTDIR_SAVE}/progress.txt" + + +cd /usr/src/app/megatron-lm + +WARNING_FILTERS=( +'-Wignore::DeprecationWarning' +'-Wignore::FutureWarning' +'-Wignore::UserWarning:megatron.core.tensor_parallel.layers' # "async_grad_allreduce is deprecated" +'-Wignore::UserWarning:megatron.core.optimizer.distrib_optimizer' # "pre_hook" method deprecations +) + +python3 "${WARNING_FILTERS[@]:?}" \ + "/usr/src/app/megatron-lm/pretrain_gpt.py" \ + --train-iters 1000000 \ + --lr 4e-05 \ + --lr-decay-iters 998000 \ + --lr-decay-style cosine \ + --min-lr 4e-06 \ + --lr-warmup-iters 2000 \ + --clip-grad 1.0 \ + --bf16 \ + --use-flash-attn \ + --rotary-seq-len-interpolation-factor 32 \ + --no-fp8-wgrad \ + --use-distributed-optimizer \ + --distributed-backend nccl \ + --data-cache-path cache \ + --split 949,50,1 \ + --seed 42 \ + --use-checkpoint-args \ + --no-masked-softmax-fusion \ + --attention-softmax-in-fp32 \ + --transformer-impl transformer_engine \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --rotary-base 500000 \ + --rotary-percent 1.0 \ + --use-rope-scaling \ + --micro-batch-size 1 \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --context-parallel-size 1 \ + --sequence-parallel \ + --overlap-grad-reduce \ + --overlap-param-gather \ + --log-interval 1 \ + --tensorboard-log-interval 1 \ + --save-interval 3500 \ + --eval-interval 100 \ + --eval-iters 10 \ + --logging-level 20 \ + --log-params-norm \ + --log-num-zeros-in-grad \ + --log-throughput \ + --log-progress \ + --timing-log-level 0 \ + --timing-log-option all \ + --log-timers-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + --log-memory-to-tensorboard \ + --log-world-size-to-tensorboard \ + --wandb-project test \ + --wandb-save-dir "${OUTDIR}/logs" \ + --tensorboard-dir "${OUTDIR}/tensorboard" \ + --ffn-hidden-size 11008 \ + --num-attention-heads 32 \ + --num-layers 32 \ + --hidden-size 4096 \ + --seq-length 8192 \ + --max-position-embeddings 8192 \ + --untie-embeddings-and-output-weights \ + --normalization RMSNorm \ + --swiglu \ + --position-embedding-type rope \ + --disable-bias-linear \ + --group-query-attention \ + --num-query-groups 8 \ + --tokenizer-type GPTSentencePieceTokenizer \ + --tokenizer-model /usr/src/app/megatron-lm/tokenizers/nerdstash-tokenizer-v2/tokenizer.model \ + --data-path \ + /usr/src/app/megatron-lm/coreweave-datasets/smol/tokenized/nerdstash_v2-uint16/chunk.0 \ + /usr/src/app/megatron-lm/coreweave-datasets/smol/tokenized/nerdstash_v2-uint16/chunk.0 \ + --wandb-exp-name "${SLURM_JOB_ID:?}/test" \ + --load "${CKPTDIR_LOAD}" \ + --save "${CKPTDIR_SAVE}" \ + --dataloader-type cyclic \ From 4443815d7ffc6b62699a7e9e4cb520674d74532c Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 13:52:18 -0700 Subject: [PATCH 04/20] feat(demo): copy slurm scripts --- megatron-demo/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron-demo/Dockerfile b/megatron-demo/Dockerfile index d82bb625..e93cd63b 100644 --- a/megatron-demo/Dockerfile +++ b/megatron-demo/Dockerfile @@ -2,3 +2,6 @@ ARG BASE_IMAGE=ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5 FROM FROM ${BASE_IMAGE} WORKDIR /usr/src/app/megatron-lm +COPY sbatch_script.sh . +COPY srun_demo.sh . + From f02308a96dc445ba7669eeaeba8d9682a6550ca8 Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 13:57:06 -0700 Subject: [PATCH 05/20] Rename sbatch_script.sh to sbatch_demo.sh --- megatron-demo/{sbatch_script.sh => sbatch_demo.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename megatron-demo/{sbatch_script.sh => sbatch_demo.sh} (100%) diff --git a/megatron-demo/sbatch_script.sh b/megatron-demo/sbatch_demo.sh similarity index 100% rename from megatron-demo/sbatch_script.sh rename to megatron-demo/sbatch_demo.sh From cf425b1b60f723b4c7b19e7e08886fa46bed152c Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 13:59:55 -0700 Subject: [PATCH 06/20] Update Dockerfile --- megatron-demo/Dockerfile | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/megatron-demo/Dockerfile b/megatron-demo/Dockerfile index e93cd63b..3f12fa43 100644 --- a/megatron-demo/Dockerfile +++ b/megatron-demo/Dockerfile @@ -1,7 +1,24 @@ ARG BASE_IMAGE=ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5fabc1e -FROM FROM ${BASE_IMAGE} +FROM ${BASE_IMAGE} WORKDIR /usr/src/app/megatron-lm -COPY sbatch_script.sh . +COPY sbatch_demo.sh . COPY srun_demo.sh . +### update sbatch to point to this container +ARG IMAGE_TAG +ENV IMAGE_TAG=${IMAGE_TAG} +RUN sed -i "s|megatron-demo:TAG|megatron-demo:${IMAGE_TAG}|g" /usr/src/app/megatron-lm/sbatch_demo.sh + +### Get tokenizer +RUN mkdir tokenizers && \ + cd tokenizers && \ + git clone https://huggingface.co/NovelAI/nerdstash-tokenizer-v2 && \ + rm -rf .git + +WORKDIR /usr/src/app/megatron-lm + +### Get data +RUN mkdir -p /usr/src/app/megatron-lm/coreweave-datasets/smol/tokenized/nerdstash_v2-uint16/ +RUN wget -qO- 'https://blobstore.object.ord1.coreweave.com/mldev/datasets/demo-dataset.tbz' \ +| tar -C /usr/src/app/megatron-lm/coreweave-datasets/smol/tokenized/nerdstash_v2-uint16/ -xjf - From 310cd10b1e6a88a5af18b6f7a75452a3d06aef69 Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 14:06:12 -0700 Subject: [PATCH 07/20] Create megatron-demo.yaml --- .github/workflows/megatron-demo.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/megatron-demo.yaml diff --git a/.github/workflows/megatron-demo.yaml b/.github/workflows/megatron-demo.yaml new file mode 100644 index 00000000..1f1a97dc --- /dev/null +++ b/.github/workflows/megatron-demo.yaml @@ -0,0 +1,23 @@ +on: + workflow_dispatch: + inputs: + base-image: + description: 'Base image to use' + required: true + push: + paths: + - "megatron-demo/**" + - ".github/workflows/megatron-demo.yml" + - ".github/workflows/build.yml" + + +jobs: + build: + uses: ./.github/workflows/build.yml + secrets: inherit + with: + image-name: megatron-demo + folder: megatron-demo + build-args: | + BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5fabc1e'}} + IMAGE_TAG=${{ github.sha }} From 9f25ceed331d0d2614418c7d4c7d34da4ee44aa9 Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 14:08:45 -0700 Subject: [PATCH 08/20] Update megatron-demo.yaml --- .github/workflows/megatron-demo.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/megatron-demo.yaml b/.github/workflows/megatron-demo.yaml index 1f1a97dc..f23a46d1 100644 --- a/.github/workflows/megatron-demo.yaml +++ b/.github/workflows/megatron-demo.yaml @@ -5,6 +5,9 @@ on: description: 'Base image to use' required: true push: + branches: + - "dmarx/megatron-demo" + - "main" paths: - "megatron-demo/**" - ".github/workflows/megatron-demo.yml" From d234fc052e41e6f4eef2e763d7a3d1f4c49e48a8 Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 14:11:24 -0700 Subject: [PATCH 09/20] Update megatron-demo.yaml --- .github/workflows/megatron-demo.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/megatron-demo.yaml b/.github/workflows/megatron-demo.yaml index f23a46d1..809e7111 100644 --- a/.github/workflows/megatron-demo.yaml +++ b/.github/workflows/megatron-demo.yaml @@ -3,7 +3,7 @@ on: inputs: base-image: description: 'Base image to use' - required: true + required: false push: branches: - "dmarx/megatron-demo" From 2929ca8770c9aa0c5a1852758d7f7391be0892cc Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 14:19:29 -0700 Subject: [PATCH 10/20] Update megatron-demo.yaml --- .github/workflows/megatron-demo.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/megatron-demo.yaml b/.github/workflows/megatron-demo.yaml index 809e7111..47152fbb 100644 --- a/.github/workflows/megatron-demo.yaml +++ b/.github/workflows/megatron-demo.yaml @@ -1,3 +1,5 @@ +name: Megatron Training Demo + on: workflow_dispatch: inputs: From acf93a1c101c7c9fd53bbc413e67bfc97371867e Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 14:28:20 -0700 Subject: [PATCH 11/20] Rename megatron.yml to megatron-back.yml --- .github/workflows/{megatron.yml => megatron-back.yml} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename .github/workflows/{megatron.yml => megatron-back.yml} (93%) diff --git a/.github/workflows/megatron.yml b/.github/workflows/megatron-back.yml similarity index 93% rename from .github/workflows/megatron.yml rename to .github/workflows/megatron-back.yml index 294ca54e..95c6f9bf 100644 --- a/.github/workflows/megatron.yml +++ b/.github/workflows/megatron-back.yml @@ -23,4 +23,4 @@ jobs: folder: megatron build-args: | BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/torch-extras:bfe03aa-nccl-cuda12.4.1-ubuntu22.04-nccl2.21.5-1-torch2.4.0-vision0.19.0-audio2.4.0'}} - COMMIT=${{ inputs.commit || 'main'}} \ No newline at end of file + COMMIT=${{ inputs.commit || 'main'}} From 266937e94564c49a4586c26dabb93042e7be7696 Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 14:29:22 -0700 Subject: [PATCH 12/20] Update and rename megatron-demo.yaml to megatron.yaml --- .github/workflows/{megatron-demo.yaml => megatron.yaml} | 2 ++ 1 file changed, 2 insertions(+) rename .github/workflows/{megatron-demo.yaml => megatron.yaml} (84%) diff --git a/.github/workflows/megatron-demo.yaml b/.github/workflows/megatron.yaml similarity index 84% rename from .github/workflows/megatron-demo.yaml rename to .github/workflows/megatron.yaml index 47152fbb..07534081 100644 --- a/.github/workflows/megatron-demo.yaml +++ b/.github/workflows/megatron.yaml @@ -1,4 +1,6 @@ name: Megatron Training Demo +# TODO: rename this workflow to `megatron-demo.yaml` +# and rename `megatron-back.yaml` to `megatron.yaml` before merging. on: workflow_dispatch: From b116cdf19c55af4e79e03fbb62f36997866c1599 Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 14:36:05 -0700 Subject: [PATCH 13/20] Rename megatron-back.yml to megatron.yml --- .github/workflows/{megatron-back.yml => megatron.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{megatron-back.yml => megatron.yml} (100%) diff --git a/.github/workflows/megatron-back.yml b/.github/workflows/megatron.yml similarity index 100% rename from .github/workflows/megatron-back.yml rename to .github/workflows/megatron.yml From d9c51496c869c88811a78d9081a323fa8c1ac73c Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 14:36:27 -0700 Subject: [PATCH 14/20] Update and rename megatron.yaml to megatron-demo.yml --- .github/workflows/{megatron.yaml => megatron-demo.yml} | 2 -- 1 file changed, 2 deletions(-) rename .github/workflows/{megatron.yaml => megatron-demo.yml} (84%) diff --git a/.github/workflows/megatron.yaml b/.github/workflows/megatron-demo.yml similarity index 84% rename from .github/workflows/megatron.yaml rename to .github/workflows/megatron-demo.yml index 07534081..47152fbb 100644 --- a/.github/workflows/megatron.yaml +++ b/.github/workflows/megatron-demo.yml @@ -1,6 +1,4 @@ name: Megatron Training Demo -# TODO: rename this workflow to `megatron-demo.yaml` -# and rename `megatron-back.yaml` to `megatron.yaml` before merging. on: workflow_dispatch: From 4abd244d5563c4d0c834254eca1854f470288b70 Mon Sep 17 00:00:00 2001 From: David Marx Date: Fri, 9 May 2025 14:42:22 -0700 Subject: [PATCH 15/20] Update sbatch_demo.sh --- megatron-demo/sbatch_demo.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron-demo/sbatch_demo.sh b/megatron-demo/sbatch_demo.sh index d96d0911..e6a976cd 100644 --- a/megatron-demo/sbatch_demo.sh +++ b/megatron-demo/sbatch_demo.sh @@ -19,8 +19,8 @@ export UCX_NET_DEVICES=ibp0:1,ibp1:1,ibp2:1,ibp3:1,ibp4:1,ibp5:1,ibp6:1,ibp7:1 export MASTER_PORT="$(expr 10000 + "$(echo -n "${SLURM_JOB_ID:?}" | tail -c 4)")" export MASTER_ADDR="$(scontrol show hostnames "${SLURM_JOB_NODELIST:?}" | head -n 1)" - -CPU_BIND='map_ldom:0,0,0,0,1,1,1,1' +### uncomment when running on CW H100's +#CPU_BIND='map_ldom:0,0,0,0,1,1,1,1' CONTAINER_IMAGE="ghcr.io#coreweave/ml-containers/megatron-demo:TAG" From daa0a02fae63ca375593f129d201aa80e8b4c82b Mon Sep 17 00:00:00 2001 From: David Marx Date: Sat, 10 May 2025 11:34:07 -0700 Subject: [PATCH 16/20] fix: use git-lfs to get tokenizer model --- megatron-demo/Dockerfile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/megatron-demo/Dockerfile b/megatron-demo/Dockerfile index 3f12fa43..9b76e271 100644 --- a/megatron-demo/Dockerfile +++ b/megatron-demo/Dockerfile @@ -10,10 +10,17 @@ ARG IMAGE_TAG ENV IMAGE_TAG=${IMAGE_TAG} RUN sed -i "s|megatron-demo:TAG|megatron-demo:${IMAGE_TAG}|g" /usr/src/app/megatron-lm/sbatch_demo.sh -### Get tokenizer +### Install Git LFS +RUN apt-get update && \ + apt-get install -y git-lfs && \ + git lfs install + +### Get tokenizer (with LFS) RUN mkdir tokenizers && \ cd tokenizers && \ git clone https://huggingface.co/NovelAI/nerdstash-tokenizer-v2 && \ + cd nerdstash-tokenizer-v2 && \ + git lfs pull && \ rm -rf .git WORKDIR /usr/src/app/megatron-lm From af0cf6e9c35a98e0cfa0cebfe10a680c39b10896 Mon Sep 17 00:00:00 2001 From: David Marx Date: Sat, 10 May 2025 11:55:39 -0700 Subject: [PATCH 17/20] fix(demo): force tag consistency --- .github/workflows/megatron-demo.yml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/.github/workflows/megatron-demo.yml b/.github/workflows/megatron-demo.yml index 47152fbb..21f1f189 100644 --- a/.github/workflows/megatron-demo.yml +++ b/.github/workflows/megatron-demo.yml @@ -15,9 +15,21 @@ on: - ".github/workflows/megatron-demo.yml" - ".github/workflows/build.yml" - jobs: + set-tag: + name: Set Image Tag + runs-on: ubuntu-latest + outputs: + image_tag: ${{ steps.set.outputs.tag }} + steps: + - id: set + run: | + TAG="${GITHUB_REF_NAME}-${GITHUB_SHA::7}" + echo "tag=$TAG" >> $GITHUB_OUTPUT + build: + name: Build Container + needs: set-tag uses: ./.github/workflows/build.yml secrets: inherit with: @@ -25,4 +37,5 @@ jobs: folder: megatron-demo build-args: | BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5fabc1e'}} - IMAGE_TAG=${{ github.sha }} + IMAGE_TAG=${{ needs.set-tag.outputs.image_tag }} + tag-suffix: ${{ needs.set-tag.outputs.image_tag }} From 8e53c4610705b01ab824609207896da29ca0c183 Mon Sep 17 00:00:00 2001 From: David Marx Date: Sat, 10 May 2025 12:06:24 -0700 Subject: [PATCH 18/20] fix: truncate tag prefix --- .github/workflows/megatron-demo.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/megatron-demo.yml b/.github/workflows/megatron-demo.yml index 21f1f189..2511139c 100644 --- a/.github/workflows/megatron-demo.yml +++ b/.github/workflows/megatron-demo.yml @@ -38,4 +38,5 @@ jobs: build-args: | BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5fabc1e'}} IMAGE_TAG=${{ needs.set-tag.outputs.image_tag }} + tag-prefix: "" tag-suffix: ${{ needs.set-tag.outputs.image_tag }} From 88f3ad2e49e8708243214830f961e3f719d15782 Mon Sep 17 00:00:00 2001 From: David Marx Date: Sat, 10 May 2025 12:42:40 -0700 Subject: [PATCH 19/20] fix(demo): I don't love it but it should work... --- .github/workflows/megatron-demo.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/megatron-demo.yml b/.github/workflows/megatron-demo.yml index 2511139c..c5b3f1f1 100644 --- a/.github/workflows/megatron-demo.yml +++ b/.github/workflows/megatron-demo.yml @@ -38,5 +38,4 @@ jobs: build-args: | BASE_IMAGE=${{ inputs.base-image || 'ghcr.io/coreweave/ml-containers/megatron:es-megatron-tensorizer-5fabc1e'}} IMAGE_TAG=${{ needs.set-tag.outputs.image_tag }} - tag-prefix: "" - tag-suffix: ${{ needs.set-tag.outputs.image_tag }} + #tag-suffix: ${{ needs.set-tag.outputs.image_tag }} From 7773da7409ae99d8c826dcd07f48869ca883705f Mon Sep 17 00:00:00 2001 From: David Marx Date: Sat, 10 May 2025 12:55:05 -0700 Subject: [PATCH 20/20] fix: sanitize repo name in tag --- .github/workflows/megatron-demo.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/megatron-demo.yml b/.github/workflows/megatron-demo.yml index c5b3f1f1..fa1b69cb 100644 --- a/.github/workflows/megatron-demo.yml +++ b/.github/workflows/megatron-demo.yml @@ -24,9 +24,11 @@ jobs: steps: - id: set run: | - TAG="${GITHUB_REF_NAME}-${GITHUB_SHA::7}" + SANITIZED_REF_NAME="${GITHUB_REF_NAME//\//-}" + TAG="${SANITIZED_REF_NAME}-${GITHUB_SHA::7}" echo "tag=$TAG" >> $GITHUB_OUTPUT + build: name: Build Container needs: set-tag