Commit ab20d2f

Merge branch 'main' into eagle-kernel-fusion
Signed-off-by: Leo Tian <leo.tian@centml.ai>
2 parents: 55bf4bc + 2628a69

411 files changed (+11212, -8858 lines)

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 8 additions & 1 deletion
@@ -53,4 +53,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
     -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
     --name "${container_name}" \
     ${image_name} \
-    /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
+    /bin/bash -c "
+        python3 /workspace/vllm/examples/offline_inference/neuron.py;
+        python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
+        for f in /workspace/vllm/tests/neuron/2_core/*.py; do
+            echo 'Running test file: '$f;
+            python3 -m pytest \$f -v --capture=tee-sys;
+        done
+        "

.buildkite/test-pipeline.yaml

Lines changed: 11 additions & 6 deletions
@@ -33,14 +33,13 @@ steps:
 
 - label: Documentation Build # 2min
   mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/test_docs/docs"
+  working_dir: "/vllm-workspace/test_docs"
   fast_check: true
   no_gpu: True
   commands:
-  - pip install -r ../../requirements/docs.txt
-  - SPHINXOPTS=\"-W\" make html
-  # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
+  - pip install -r ../requirements/docs.txt
+  # TODO: add `--strict` once warnings in docstrings are fixed
+  - mkdocs build
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   mirror_hardwares: [amdexperimental]
@@ -59,6 +58,7 @@ steps:
   - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
+  - pytest -v -s test_outputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
   - pytest -v -s worker # Worker
@@ -125,7 +125,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
@@ -138,6 +138,7 @@ steps:
   - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
+  - tests/distributed/test_events
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
@@ -156,6 +157,7 @@ steps:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
@@ -312,6 +314,7 @@ steps:
   - pytest -v -s compile/test_fusion.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
   - pytest -v -s compile/test_sequence_parallelism.py
+  - pytest -v -s compile/test_async_tp.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -386,10 +389,12 @@ steps:
   source_file_dependencies:
   - vllm/model_executor/model_loader
   - tests/tensorizer_loader
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
   commands:
   - apt-get update && apt-get install -y curl libsodium23
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s tensorizer_loader
+  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 
 - label: Benchmarks # 9min
   mirror_hardwares: [amdexperimental, amdproduction]

.github/CODEOWNERS

Lines changed: 4 additions & 0 deletions
@@ -42,3 +42,7 @@ CMakeLists.txt @tlrmchlsmth
 /tests/v1/structured_output @mgoin @russellb
 /tests/weight_loading @mgoin @youkaichao
 /tests/lora @jeejeelee
+
+# Docs
+/docs @hmellor
+mkdocs.yaml @hmellor

.gitignore

Lines changed: 1 addition & 5 deletions
@@ -77,11 +77,6 @@ instance/
 # Scrapy stuff:
 .scrapy
 
-# Sphinx documentation
-docs/_build/
-docs/source/getting_started/examples/
-docs/source/api/vllm
-
 # PyBuilder
 .pybuilder/
 target/
@@ -151,6 +146,7 @@ venv.bak/
 
 # mkdocs documentation
 /site
+docs/getting_started/examples
 
 # mypy
 .mypy_cache/

.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ repos:
   rev: v0.9.29
   hooks:
   - id: pymarkdown
+    exclude: '.*\.inc\.md'
     args: [fix]
 - repo: https://github.com/rhysd/actionlint
   rev: v1.7.7
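The new `exclude` keeps the pymarkdown hook away from `*.inc.md` include fragments. A quick standalone check of what the regex matches, assuming pre-commit's usual substring matching against file paths (the example paths are made up):

import re

# The pattern from the hook config above.
pattern = re.compile(r'.*\.inc\.md')

# Include fragments are skipped by the hook...
assert pattern.search("docs/features/quantization/supported_hardware.inc.md")
# ...while ordinary markdown files are still linted.
assert not pattern.search("docs/index.md")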

.readthedocs.yaml

Lines changed: 2 additions & 6 deletions
@@ -8,12 +8,8 @@ build:
   tools:
     python: "3.12"
 
-sphinx:
-  configuration: docs/source/conf.py
-  fail_on_warning: true
-
-# If using Sphinx, optionally build your docs in additional formats such as PDF
-formats: []
+mkdocs:
+  configuration: mkdocs.yaml
 
 # Optionally declare the Python requirements required to build your docs
 python:

CMakeLists.txt

Lines changed: 9 additions & 3 deletions
@@ -29,9 +29,6 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 #
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 
-# Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
-
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
@@ -79,6 +76,15 @@ endif()
 #
 find_package(Torch REQUIRED)
 
+# Supported NVIDIA architectures.
+# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
+if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
+else()
+  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+endif()
+
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
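Relocating the NVIDIA arch list below find_package(Torch) lets it depend on CMAKE_CUDA_COMPILER_VERSION, which is only defined at that point: nvcc 12.8+ gains the 10.0/10.1/12.0 targets (the Blackwell-generation compute capabilities), while older toolkits keep the shorter list. The same gate rendered in Python purely as illustration (the function name and version tuple are mine, not the commit's):

# Illustrative mirror of the CMake gate above; nvcc older than 12.8
# cannot emit code for the 10.x/12.0 compute capabilities.
BASE_ARCHS = ["7.0", "7.2", "7.5", "8.0", "8.6", "8.7", "8.9", "9.0"]
BLACKWELL_ARCHS = ["10.0", "10.1", "12.0"]

def supported_cuda_archs(nvcc_version: tuple[int, int]) -> list[str]:
    if nvcc_version >= (12, 8):
        return BASE_ARCHS + BLACKWELL_ARCHS
    return BASE_ARCHS

assert "12.0" in supported_cuda_archs((12, 8))
assert "12.0" not in supported_cuda_archs((12, 4))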

benchmarks/backend_request_func.py

Lines changed: 8 additions & 1 deletion
@@ -194,6 +194,11 @@ async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("completions", "profile")), (
+        "OpenAI Completions API URL must end with 'completions' or 'profile'."
+    )
+
     async with aiohttp.ClientSession(
         trust_env=True, timeout=AIOHTTP_TIMEOUT
     ) as session:
@@ -204,6 +209,8 @@ async def async_request_deepspeed_mii(
             "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
             "top_p": 1.0,
         }
+        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
@@ -215,7 +222,7 @@ async def async_request_deepspeed_mii(
         st = time.perf_counter()
         try:
             async with session.post(
-                url=request_func_input.api_url, json=payload
+                url=api_url, json=payload, headers=headers
             ) as response:
                 if response.status == 200:
                     parsed_resp = await response.json()
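Besides asserting on the URL suffix, the deepspeed-mii path now authenticates with a bearer token read from OPENAI_API_KEY. A self-contained sketch of that header pattern; the endpoint and payload are placeholders, and note that os.environ.get() returns None when the variable is unset, so the header silently becomes the literal string "Bearer None" rather than raising:

import asyncio
import os

import aiohttp

async def post_with_bearer(api_url: str, payload: dict) -> dict:
    # Same construction as the diff: a missing key yields "Bearer None".
    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
    async with aiohttp.ClientSession(trust_env=True) as session:
        async with session.post(api_url, json=payload, headers=headers) as resp:
            resp.raise_for_status()
            return await resp.json()

# Example usage (hypothetical endpoint):
# asyncio.run(post_with_bearer("http://localhost:8000/v1/completions",
#                              {"prompt": "Hello", "max_tokens": 8}))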

benchmarks/benchmark_dataset.py

Lines changed: 2 additions & 1 deletion
@@ -35,6 +35,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 
 logger = logging.getLogger(__name__)
@@ -257,7 +258,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
     if isinstance(image, dict) and "bytes" in image:
         image = Image.open(BytesIO(image["bytes"]))
     if isinstance(image, Image.Image):
-        image = image.convert("RGB")
+        image = convert_image_mode(image, "RGB")
     with io.BytesIO() as image_data:
         image.save(image_data, format="JPEG")
         image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
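process_image now converts through vllm.multimodal.image.convert_image_mode rather than PIL's bare Image.convert. The diff doesn't state the motivation; my assumption is that the helper treats alpha-carrying modes more carefully, since a bare .convert("RGB") on RGBA input simply discards the alpha channel. A plain-PIL illustration of that pitfall (none of this is vLLM code):

from PIL import Image

# A fully transparent "red" image: alpha says invisible, RGB says red.
rgba = Image.new("RGBA", (4, 4), (255, 0, 0, 0))

# Bare convert drops alpha; transparent pixels keep their stale RGB.
naive = rgba.convert("RGB")
assert naive.getpixel((0, 0)) == (255, 0, 0)

# Compositing onto a background first respects the alpha channel.
background = Image.new("RGBA", rgba.size, (255, 255, 255, 255))
proper = Image.alpha_composite(background, rgba).convert("RGB")
assert proper.getpixel((0, 0)) == (255, 255, 255)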

benchmarks/kernels/benchmark_paged_attention.py

Lines changed: 5 additions & 1 deletion
@@ -84,7 +84,10 @@ def main(
     if version == "v2":
         if current_platform.is_rocm():
             global PARTITION_SIZE
-            PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM
+            if not args.custom_paged_attn and not current_platform.is_navi():
+                PARTITION_SIZE = 1024
+            else:
+                PARTITION_SIZE = PARTITION_SIZE_ROCM
         num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
         tmp_output = torch.empty(
             size=(num_seqs, num_query_heads, num_partitions, head_size),
@@ -159,6 +162,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
             scale,
             block_tables,
             seq_lens,
+            None,
             block_size,
             max_seq_len,
             alibi_slopes,
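The v2 branch keeps deriving the partition count with the ceiling-division idiom visible above; a tiny standalone check of that arithmetic (the numbers are illustrative):

def num_partitions(max_seq_len: int, partition_size: int) -> int:
    # Equivalent to math.ceil(max_seq_len / partition_size).
    return (max_seq_len + partition_size - 1) // partition_size

assert num_partitions(4096, 1024) == 4  # exact multiple
assert num_partitions(4097, 1024) == 5  # one extra token adds a partition
assert num_partitions(512, 1024) == 1   # short sequences still get one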
