From ea787b08ff3c8167f5568a0634f385fa3212dab4 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 4 Feb 2025 23:17:11 +0000 Subject: [PATCH 01/21] Test build to check processing by different K8 queues. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-pipeline.yaml | 4 ++++ .buildkite/test-template.j2 | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a847a68a6ef7..a038fb592dac 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -93,6 +93,7 @@ steps: - label: Core Test # 10min mirror_hardwares: [amd] + amd_gpus: 4 # Just for the sake of queue testing fast_check: true source_file_dependencies: - vllm/core @@ -105,6 +106,7 @@ steps: working_dir: "/vllm-workspace/tests" fast_check: true mirror_hardwares: [amd] + amd_gpus: 2 # Just for the sake of queue testing source_file_dependencies: - vllm/ commands: @@ -257,6 +259,7 @@ steps: - label: LoRA Test %N # 15min each mirror_hardwares: [amd] + amd_gpus: 8 source_file_dependencies: - vllm/lora - tests/lora @@ -283,6 +286,7 @@ steps: - label: Kernels Test %N # 1h each mirror_hardwares: [amd] + amd_gpus: 8 source_file_dependencies: - csrc/ - vllm/attention diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index ce448836a827..0d7c4ca9c75d 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -27,7 +27,15 @@ steps: depends_on: - "amd-build" agents: - queue: amd_gpu +{% if step.amd_gpus and step.amd_gpus==8%} + queue: amd_gpu_8 +{% elif step.amd_gpus and step.amd_gpus==4%} + queue: amd_gpu_4 +{% elif step.amd_gpus and step.amd_gpus==2%} + queue: amd_gpu_4 +{% else%} + queue: amd_gpu_1 +{% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" env: From 01dfddaa3bc466fae336d19dc39c42605eeb97d5 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 00:12:56 +0000 Subject: [PATCH 02/21] Testing. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 009e929ebace..feda9b8dfaaf 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,5 +1,5 @@ # default base image -ARG REMOTE_VLLM="0" +ARG REMOTE_VLLM="1" ARG USE_CYTHON="0" ARG BUILD_RPD="1" ARG COMMON_WORKDIR=/app From 7f80bf893fbb7c7332dbb48f8da2da6119f31644 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 00:33:23 +0000 Subject: [PATCH 03/21] Copying over the tests directory to enable CI testing. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index feda9b8dfaaf..c28ffee09497 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,5 +1,5 @@ # default base image -ARG REMOTE_VLLM="1" +ARG REMOTE_VLLM="0" ARG USE_CYTHON="0" ARG BUILD_RPD="1" ARG COMMON_WORKDIR=/app @@ -108,6 +108,8 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples +COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests + ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 14aaf35a1871e0bea62d05ca7e7b2de199991c6a Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 05:06:38 +0000 Subject: [PATCH 04/21] Comparing with MI250 in the "mi250_8xGPU" queue. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 0d7c4ca9c75d..67bd8b5d15ae 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -28,13 +28,13 @@ steps: - "amd-build" agents: {% if step.amd_gpus and step.amd_gpus==8%} - queue: amd_gpu_8 + queue: mi250_8xGPU {% elif step.amd_gpus and step.amd_gpus==4%} - queue: amd_gpu_4 + queue: mi250_8xGPU {% elif step.amd_gpus and step.amd_gpus==2%} - queue: amd_gpu_4 + queue: mi250_8xGPU {% else%} - queue: amd_gpu_1 + queue: mi250_8xGPU {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" From a1064893a9eda82cf29f1181a04fe753dd47c58d Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 06:39:04 +0000 Subject: [PATCH 05/21] Building with "test" as a --target Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 10 +++++----- Dockerfile.rocm | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 67bd8b5d15ae..7106395910d3 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -7,7 +7,7 @@ steps: - label: ":docker: build image" depends_on: ~ commands: - - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --progress plain ." + - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain ." - "docker push {{ docker_image_amd }}" key: "amd-build" env: @@ -28,13 +28,13 @@ steps: - "amd-build" agents: {% if step.amd_gpus and step.amd_gpus==8%} - queue: mi250_8xGPU + queue: amd_gpu_8 {% elif step.amd_gpus and step.amd_gpus==4%} - queue: mi250_8xGPU + queue: amd_gpu_4 {% elif step.amd_gpus and step.amd_gpus==2%} - queue: mi250_8xGPU + queue: amd_gpu_4 {% else%} - queue: mi250_8xGPU + queue: amd_gpu_1 {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" diff --git a/Dockerfile.rocm b/Dockerfile.rocm index c28ffee09497..3965880bfd7c 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -108,8 +108,8 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests - +#COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests +#COPY --from=export_vllm /.buildkite ${COMMON_WORKDIR}/vllm/.buildkite ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 6acfc3aba4cbc7ad79ad9ed86315e39bc37ff065 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 08:04:00 +0000 Subject: [PATCH 06/21] Fixing working directory property. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-pipeline.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index a038fb592dac..9df17920788d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -92,6 +92,7 @@ steps: - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - label: Core Test # 10min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] amd_gpus: 4 # Just for the sake of queue testing fast_check: true @@ -178,6 +179,7 @@ steps: - pytest -v -s engine test_sequence.py test_config.py test_logger.py # OOM in the CI unless we run this separately - pytest -v -s tokenization + working_dir: "/vllm-workspace/tests" # optional - label: V1 Test #mirror_hardwares: [amd] @@ -219,6 +221,7 @@ steps: - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - label: Prefix Caching Test # 9min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -237,6 +240,7 @@ steps: - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - label: LogitsProcessor Test # 5min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] source_file_dependencies: - vllm/model_executor/layers @@ -258,6 +262,7 @@ steps: - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - label: LoRA Test %N # 15min each + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] amd_gpus: 8 source_file_dependencies: @@ -285,6 +290,7 @@ steps: - pytest -v -s compile/test_full_graph.py - label: Kernels Test %N # 1h each + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] amd_gpus: 8 source_file_dependencies: @@ -296,6 +302,7 @@ steps: parallelism: 4 - label: Tensorizer Test # 11min + working_dir: "/vllm-workspace/tests" mirror_hardwares: [amd] soft_fail: true source_file_dependencies: @@ -338,6 +345,7 @@ steps: - pytest -v -s encoder_decoder - label: OpenAI-Compatible Tool Use # 20 min + working_dir: "/vllm-workspace/tests" fast_check: false mirror_hardwares: [ amd ] source_file_dependencies: From 172e0e8bd375d43ccfc41aa1d83f2d21256e78cf Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 5 Feb 2025 18:17:17 +0000 Subject: [PATCH 07/21] Dummy alternation to confirm trouble with simultaneous test execution. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 3965880bfd7c..edb042c68f69 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -108,8 +108,7 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -#COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests -#COPY --from=export_vllm /.buildkite ${COMMON_WORKDIR}/vllm/.buildkite + ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 114e750973a4225d90126aca8582ffb31e34695f Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Thu, 6 Feb 2025 17:20:59 +0000 Subject: [PATCH 08/21] Dummy alternation to trigger a re-build and re-test. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index edb042c68f69..8c86c618103e 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -108,7 +108,7 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples - +# "Dummy alternation" ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From 0fc40501a399ea36b4c4d652048b6bd6973d62dc Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Thu, 27 Feb 2025 22:16:50 +0000 Subject: [PATCH 09/21] Updating rocm dockerhub repo. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 7106395910d3..bfced2737204 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -1,5 +1,5 @@ {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT" %} +{% set docker_image_amd = "rocm/vllm-ci-private:$BUILDKITE_COMMIT" %} {% set default_working_dir = "vllm/tests" %} {% set hf_home = "/root/.cache/huggingface" %} From b2e3e12925ac694b6ae032c3f3ae85255d764d24 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Mon, 3 Mar 2025 16:13:23 -0600 Subject: [PATCH 10/21] Update run-amd-test.sh Fixing docker repo for testing. Signed-off-by: Alexei V. Ivanov --- .buildkite/run-amd-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 5e79984c9f7b..0aef82e2a036 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -57,7 +57,7 @@ while true; do done echo "--- Pulling container" -image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" +image_name="rocm/vllm-ci-private:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull "${image_name}" From cc41fa6d3394ec80b4399e73b94a9b8e17a93d1f Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 4 Mar 2025 00:55:23 +0000 Subject: [PATCH 11/21] . Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index bfced2737204..5de5e56c7a6f 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -34,7 +34,7 @@ steps: {% elif step.amd_gpus and step.amd_gpus==2%} queue: amd_gpu_4 {% else%} - queue: amd_gpu_1 + queue: amd_gpu_4 {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" From 4022a8a392cf766c923b11a7c9fcf066d61908d0 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 4 Mar 2025 03:49:02 +0000 Subject: [PATCH 12/21] Importing Test improvements (Sage's PR #13970 to vllm-project). Signed-off-by: Alexei V. Ivanov --- .buildkite/run-amd-test.sh | 4 +++- .../core/block/e2e/test_correctness_sliding_window.py | 9 +++++++++ tests/prefix_caching/test_prefix_caching.py | 10 ++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 0aef82e2a036..1171441fe475 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -92,7 +92,9 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_moe.py \ --ignore=kernels/test_prefix_prefill.py \ --ignore=kernels/test_rand.py \ - --ignore=kernels/test_sampler.py" + --ignore=kernels/test_sampler.py \ + --ignore=kernels/test_cascade_flash_attn.py \ + --ignore=kernels/test_mamba_mixer2.py" fi #ignore certain Entrypoints tests diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 1a8873b00999..6819c8c00e9c 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -7,6 +7,7 @@ from tests.kernels.utils import override_backend_env_variable from vllm import LLM, SamplingParams +from vllm.platforms import current_platform from ....test_utils import xfail_if_rocm62 from .conftest import get_text_from_llm_generator @@ -43,6 +44,10 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, Additionally, we compare the results of the v1 and v2 managers. """ + if backend == "FLASHINFER" and current_platform.is_rocm(): + pytest.skip("Flashinfer does not support ROCm/HIP.") + if backend == "XFORMERS" and current_platform.is_rocm(): + pytest.skip("Xformers does not support ROCm/HIP.") override_backend_env_variable(monkeypatch, backend) sampling_params = SamplingParams( @@ -103,6 +108,10 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, The results with and without chunked prefill are not the same due to numerical instabilities. """ + if backend == "FLASHINFER" and current_platform.is_rocm(): + pytest.skip("Flashinfer does not support ROCm/HIP.") + if backend == "XFORMERS" and current_platform.is_rocm(): + pytest.skip("Xformers does not support ROCm/HIP.") override_backend_env_variable(monkeypatch, backend) sampling_params = SamplingParams( diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 90d424fe35d8..7a4641a0dbd7 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -12,6 +12,7 @@ from vllm import SamplingParams, TokensPrompt from vllm.core.scheduler import Scheduler from vllm.engine.llm_engine import LLMEngine +from vllm.platforms import current_platform from ..models.utils import check_outputs_equal @@ -53,6 +54,10 @@ def test_mixed_requests( and the others don't. The cached position determines where the sequence is at among the batch of prefills. """ + if backend == "FLASHINFER" and current_platform.is_rocm(): + pytest.skip("Flashinfer does not support ROCm/HIP.") + if backend == "XFORMERS" and current_platform.is_rocm(): + pytest.skip("Xformers does not support ROCm/HIP.") override_backend_env_variable(monkeypatch, backend) with hf_runner(model, dtype=dtype) as hf_model: @@ -103,6 +108,11 @@ def test_unstable_prompt_sequence( backend: str, monkeypatch, ) -> None: + + if backend == "FLASHINFER" and current_platform.is_rocm(): + pytest.skip("Flashinfer does not support ROCm/HIP.") + if backend == "XFORMERS" and current_platform.is_rocm(): + pytest.skip("Xformers does not support ROCm/HIP.") override_backend_env_variable(monkeypatch, backend) with vllm_runner( From 84ea7b99cbc4fb146cbb359086a34a7746d43f9b Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 4 Mar 2025 06:13:00 +0000 Subject: [PATCH 13/21] Restoring access to amd_gpu_1 queue Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 5de5e56c7a6f..bfced2737204 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -34,7 +34,7 @@ steps: {% elif step.amd_gpus and step.amd_gpus==2%} queue: amd_gpu_4 {% else%} - queue: amd_gpu_4 + queue: amd_gpu_1 {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" From fbb39f35a8d95fb335990cd89adba0e855006e9a Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Mon, 10 Mar 2025 15:29:15 +0000 Subject: [PATCH 14/21] Redirecting to the stable test-processing queues. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index bfced2737204..5dbd144291df 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -28,13 +28,13 @@ steps: - "amd-build" agents: {% if step.amd_gpus and step.amd_gpus==8%} - queue: amd_gpu_8 + queue: amd_gpu {% elif step.amd_gpus and step.amd_gpus==4%} - queue: amd_gpu_4 + queue: amd_gpu {% elif step.amd_gpus and step.amd_gpus==2%} - queue: amd_gpu_4 + queue: amd_gpu {% else%} - queue: amd_gpu_1 + queue: amd_gpu {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" From e210fb790219450815b3a343223883782da37294 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Mon, 10 Mar 2025 16:16:59 +0000 Subject: [PATCH 15/21] Fix building architectures. Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 5dbd144291df..3ecbd64a973b 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -7,7 +7,7 @@ steps: - label: ":docker: build image" depends_on: ~ commands: - - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --target test --progress plain ." + - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f Dockerfile.rocm --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --target test --progress plain ." - "docker push {{ docker_image_amd }}" key: "amd-build" env: From 5e31d5c4943b90e34251a8765673c91a96a0f531 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Mon, 10 Mar 2025 21:59:37 +0000 Subject: [PATCH 16/21] Removing junk. Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 8c86c618103e..009e929ebace 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -108,7 +108,6 @@ ARG COMMON_WORKDIR # Copy over the benchmark scripts as well COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -# "Dummy alternation" ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV TOKENIZERS_PARALLELISM=false From f7a922e81594f19c37554c10b920db39402bc6b1 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Wed, 12 Mar 2025 19:07:13 +0000 Subject: [PATCH 17/21] Routing some of the tests towards amd_gpu_1_osci Signed-off-by: Alexei V. Ivanov --- .buildkite/test-template.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index 3ecbd64a973b..da942c254f57 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -34,7 +34,7 @@ steps: {% elif step.amd_gpus and step.amd_gpus==2%} queue: amd_gpu {% else%} - queue: amd_gpu + queue: amd_gpu_1_osci {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" From 203476843aa7e6f423d117426fa23d00f06035dc Mon Sep 17 00:00:00 2001 From: dhonnappa-amd Date: Thu, 13 Mar 2025 15:20:41 -0500 Subject: [PATCH 18/21] remove sharing host network with contaienr --- .buildkite/run-amd-test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 706737e70d04..5ef3e8f85753 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -152,7 +152,6 @@ else echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ - --network host \ --shm-size=16gb \ --rm \ -e HIP_VISIBLE_DEVICES=0 \ From 7182c8019596e31951185c8f91574392e1d16d0a Mon Sep 17 00:00:00 2001 From: dhonnappa-amd Date: Thu, 13 Mar 2025 15:54:10 -0500 Subject: [PATCH 19/21] revert --- .buildkite/run-amd-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 5ef3e8f85753..6f023dc47272 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -152,6 +152,7 @@ else echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" docker run \ --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ + --network=host \ --shm-size=16gb \ --rm \ -e HIP_VISIBLE_DEVICES=0 \ From 467b8e2eef30830d0265797b42b1c8f212d15276 Mon Sep 17 00:00:00 2001 From: dhonnappa-amd Date: Thu, 20 Mar 2025 16:50:49 -0500 Subject: [PATCH 20/21] test oci cluster --- .buildkite/test-template.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 index da942c254f57..d7725fcb33d9 100644 --- a/.buildkite/test-template.j2 +++ b/.buildkite/test-template.j2 @@ -34,7 +34,7 @@ steps: {% elif step.amd_gpus and step.amd_gpus==2%} queue: amd_gpu {% else%} - queue: amd_gpu_1_osci + queue: amd_gpu_1_oci {% endif%} commands: - bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" From 77c04fb27a628a734b58dd5cee9e3863c0cfa8b1 Mon Sep 17 00:00:00 2001 From: dhonnappa-amd Date: Mon, 24 Mar 2025 12:24:43 -0500 Subject: [PATCH 21/21] Update base image to include curl wget --- Dockerfile.rocm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index d2c2d3b14678..7188f7a73419 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -12,7 +12,7 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} # Install some basic utilities RUN apt-get update -q -y && apt-get install -q -y \ - sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev + sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev apt-transport-https ca-certificates wget curl # Remove sccache RUN python3 -m pip install --upgrade pip && pip install setuptools_scm RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"