Commit ab20d2f

Merge branch 'main' into eagle-kernel-fusion
Signed-off-by: Leo Tian <leo.tian@centml.ai>
2 parents: 55bf4bc + 2628a69

411 files changed (+11212, -8858 lines)

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 8 additions & 1 deletion
@@ -53,4 +53,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
     -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
     --name "${container_name}" \
     ${image_name} \
-    /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
+    /bin/bash -c "
+        python3 /workspace/vllm/examples/offline_inference/neuron.py;
+        python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
+        for f in /workspace/vllm/tests/neuron/2_core/*.py; do
+            echo 'Running test file: '$f;
+            python3 -m pytest \$f -v --capture=tee-sys;
+        done
+        "

.buildkite/test-pipeline.yaml

Lines changed: 11 additions & 6 deletions
@@ -33,14 +33,13 @@ steps:
 
 - label: Documentation Build # 2min
   mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/test_docs/docs"
+  working_dir: "/vllm-workspace/test_docs"
   fast_check: true
   no_gpu: True
   commands:
-  - pip install -r ../../requirements/docs.txt
-  - SPHINXOPTS=\"-W\" make html
-  # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
+  - pip install -r ../requirements/docs.txt
+  # TODO: add `--strict` once warnings in docstrings are fixed
+  - mkdocs build
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   mirror_hardwares: [amdexperimental]
@@ -59,6 +58,7 @@ steps:
   - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
+  - pytest -v -s test_outputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
   - pytest -v -s worker # Worker
@@ -125,7 +125,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
@@ -138,6 +138,7 @@ steps:
   - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
+  - tests/distributed/test_events
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
@@ -156,6 +157,7 @@ steps:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
@@ -312,6 +314,7 @@ steps:
   - pytest -v -s compile/test_fusion.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
   - pytest -v -s compile/test_sequence_parallelism.py
+  - pytest -v -s compile/test_async_tp.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -386,10 +389,12 @@ steps:
   source_file_dependencies:
   - vllm/model_executor/model_loader
   - tests/tensorizer_loader
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
   commands:
   - apt-get update && apt-get install -y curl libsodium23
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s tensorizer_loader
+  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 
 - label: Benchmarks # 9min
   mirror_hardwares: [amdexperimental, amdproduction]

.github/CODEOWNERS

Lines changed: 4 additions & 0 deletions
@@ -42,3 +42,7 @@ CMakeLists.txt @tlrmchlsmth
 /tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
 /tests/lora @jeejeelee
+
+# Docs
+/docs @hmellor
+mkdocs.yaml @hmellor

.gitignore

Lines changed: 1 addition & 5 deletions
@@ -77,11 +77,6 @@ instance/
 # Scrapy stuff:
 .scrapy
 
-# Sphinx documentation
-docs/_build/
-docs/source/getting_started/examples/
-docs/source/api/vllm
-
 # PyBuilder
 .pybuilder/
 target/
@@ -151,6 +146,7 @@ venv.bak/
 
 # mkdocs documentation
 /site
+docs/getting_started/examples
 
 # mypy
 .mypy_cache/

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ repos:
   rev: v0.9.29
   hooks:
   - id: pymarkdown
+    exclude: '.*\.inc\.md'
     args: [fix]
 - repo: https://github.com/rhysd/actionlint
   rev: v1.7.7
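The new `exclude` keeps the pymarkdown hook away from `*.inc.md` include fragments. A quick standalone check of what the regex matches, assuming pre-commit's usual substring matching against file paths (the example paths are made up):

import re

# The pattern from the hook config above.
pattern = re.compile(r'.*\.inc\.md')

# Include fragments are skipped by the hook...
assert pattern.search("docs/features/quantization/supported_hardware.inc.md")
# ...while ordinary markdown files are still linted.
assert not pattern.search("docs/index.md")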

.readthedocs.yaml

Lines changed: 2 additions & 6 deletions
@@ -8,12 +8,8 @@ build:
   tools:
     python: "3.12"
 
-sphinx:
-  configuration: docs/source/conf.py
-  fail_on_warning: true
-
-# If using Sphinx, optionally build your docs in additional formats such as PDF
-formats: []
+mkdocs:
+  configuration: mkdocs.yaml
 
 # Optionally declare the Python requirements required to build your docs
 python:

CMakeLists.txt

Lines changed: 9 additions & 3 deletions
@@ -29,9 +29,6 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 #
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 
-# Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
-
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
@@ -79,6 +76,15 @@ endif()
 #
 find_package(Torch REQUIRED)
 
+# Supported NVIDIA architectures.
+# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
+if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
+else()
+  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+endif()
+
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
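Relocating the NVIDIA arch list below find_package(Torch) lets it depend on CMAKE_CUDA_COMPILER_VERSION, which is only defined at that point: nvcc 12.8+ gains the 10.0/10.1/12.0 targets (the Blackwell-generation compute capabilities), while older toolkits keep the shorter list. The same gate rendered in Python purely as illustration (the function name and version tuple are mine, not the commit's):

# Illustrative mirror of the CMake gate above; nvcc older than 12.8
# cannot emit code for the 10.x/12.0 compute capabilities.
BASE_ARCHS = ["7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0"]
BLACKWELL_ARCHS = ["10.0", "10.1", "12.0"]

def supported_cuda_archs(nvcc_version: tuple[int, int]) -> list[str]:
    if nvcc_version >= (12, 8):
        return BASE_ARCHS + BLACKWELL_ARCHS
    return BASE_ARCHS

assert "12.0" in supported_cuda_archs((12, 8))
assert "12.0" not in supported_cuda_archs((12, 4))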

benchmarks/backend_request_func.py

Lines changed: 8 additions & 1 deletion
@@ -194,6 +194,11 @@ async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("completions", "profile")), (
+        "OpenAI Completions API URL must end with 'completions' or 'profile'."
+    )
+
     async with aiohttp.ClientSession(
         trust_env=True, timeout=AIOHTTP_TIMEOUT
     ) as session:
@@ -204,6 +209,8 @@ async def async_request_deepspeed_mii(
             "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
             "top_p": 1.0,
         }
+        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
@@ -215,7 +222,7 @@ async def async_request_deepspeed_mii(
         st = time.perf_counter()
         try:
             async with session.post(
-                url=request_func_input.api_url, json=payload
+                url=api_url, json=payload, headers=headers
             ) as response:
                 if response.status == 200:
                     parsed_resp = await response.json()
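Besides asserting on the URL suffix, the deepspeed-mii path now authenticates with a bearer token read from OPENAI_API_KEY. A self-contained sketch of that header pattern; the endpoint and payload are placeholders, and note that os.environ.get() returns None when the variable is unset, so the header silently becomes the literal string "Bearer None" rather than raising:

import asyncio
import os

import aiohttp

async def post_with_bearer(api_url: str, payload: dict) -> dict:
    # Same construction as the diff: a missing key yields "Bearer None".
    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
    async with aiohttp.ClientSession(trust_env=True) as session:
        async with session.post(api_url, json=payload, headers=headers) as resp:
            resp.raise_for_status()
            return await resp.json()

# Example usage (hypothetical endpoint):
# asyncio.run(post_with_bearer("http://localhost:8000/v1/completions",
#                              {"prompt": "Hello", "max_tokens": 8}))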

benchmarks/benchmark_dataset.py

Lines changed: 2 additions & 1 deletion
@@ -35,6 +35,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 
 logger = logging.getLogger(__name__)
@@ -257,7 +258,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
     if isinstance(image, dict) and "bytes" in image:
         image = Image.open(BytesIO(image["bytes"]))
     if isinstance(image, Image.Image):
-        image = image.convert("RGB")
+        image = convert_image_mode(image, "RGB")
     with io.BytesIO() as image_data:
         image.save(image_data, format="JPEG")
         image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
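process_image now converts through vllm.multimodal.image.convert_image_mode rather than PIL's bare Image.convert. The diff doesn't state the motivation; my assumption is that the helper treats alpha-carrying modes more carefully, since a bare .convert("RGB") on RGBA input simply discards the alpha channel. A plain-PIL illustration of that pitfall (none of this is vLLM code):

from PIL import Image

# A fully transparent "red" image: alpha says invisible, RGB says red.
rgba = Image.new("RGBA", (4, 4), (255, 0, 0, 0))

# Bare convert drops alpha; transparent pixels keep their stale RGB.
naive = rgba.convert("RGB")
assert naive.getpixel((0, 0)) == (255, 0, 0)

# Compositing onto a background first respects the alpha channel.
background = Image.new("RGBA", rgba.size, (255, 255, 255, 255))
proper = Image.alpha_composite(background, rgba).convert("RGB")
assert proper.getpixel((0, 0)) == (255, 255, 255)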

benchmarks/kernels/benchmark_paged_attention.py

Lines changed: 5 additions & 1 deletion
@@ -84,7 +84,10 @@ def main(
     if version == "v2":
         if current_platform.is_rocm():
             global PARTITION_SIZE
-            PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM
+            if not args.custom_paged_attn and not current_platform.is_navi():
+                PARTITION_SIZE = 1024
+            else:
+                PARTITION_SIZE = PARTITION_SIZE_ROCM
         num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
         tmp_output = torch.empty(
             size=(num_seqs, num_query_heads, num_partitions, head_size),
@@ -159,6 +162,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
             scale,
             block_tables,
             seq_lens,
+            None,
             block_size,
             max_seq_len,
             alibi_slopes,
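The v2 branch keeps deriving the partition count with the ceiling-division idiom visible above; a tiny standalone check of that arithmetic (the numbers are illustrative):

def num_partitions(max_seq_len: int, partition_size: int) -> int:
    # Equivalent to math.ceil(max_seq_len / partition_size).
    return (max_seq_len + partition_size - 1) // partition_size

assert num_partitions(4096, 1024) == 4  # exact multiple
assert num_partitions(4097, 1024) == 5  # one extra token adds a partition
assert num_partitions(512, 1024) == 1   # short sequences still get one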
