Commit d478a40

Merge remote-tracking branch 'upstream/main' into feature/enable-log-outputs

Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai>

2 parents: 75fee5b + b639327

47 files changed: +1625, -192 lines

CMakeLists.txt

Lines changed: 0 additions & 10 deletions
@@ -171,16 +171,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
-#
-# Set nvcc fatbin compression.
-#
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
-    list(APPEND VLLM_GPU_FLAGS "-Xfatbin" "-compress-all" "-compress-mode=size")
-  endif()
-endif()
-
-
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.

docs/contributing/model/basic.md

Lines changed: 2 additions & 0 deletions
@@ -73,6 +73,8 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         ...
 ```
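For orientation, a minimal, self-contained sketch of a `forward` that matches the documented signature. `MyModel`, its toy embedding layer, and the local `IntermediateTensors` stub are illustrative placeholders, not code from this commit:

```python
# Illustrative sketch only -- the names below are hypothetical placeholders.
from typing import Optional

import torch
from torch import nn


class IntermediateTensors:
    """Stand-in for vLLM's IntermediateTensors (pipeline-parallel handoff)."""


class MyModel(nn.Module):

    def __init__(self, vocab_size: int = 128, hidden_size: int = 16):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # When the caller already has embeddings (e.g. multimodal inputs),
        # it passes inputs_embeds and the token lookup is skipped.
        if inputs_embeds is not None:
            hidden_states = inputs_embeds
        else:
            hidden_states = self.embed(input_ids)
        return hidden_states


model = MyModel()
out = model(torch.tensor([1, 2, 3]), positions=torch.arange(3))
print(out.shape)  # torch.Size([3, 16])
```

The optional `inputs_embeds` path is what lets a caller bypass the token-embedding lookup when it already holds embeddings, and `intermediate_tensors` carries hidden state handed over between pipeline stages.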

requirements/test.in

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
 mteb[bm25s]>=1.38.11, <2 # required for mteb test
-transformers==4.52.4
+transformers==4.53.2
 tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads.
 schemathesis>=3.39.15 # Required for openai schema test.

requirements/test.txt

Lines changed: 1 addition & 1 deletion
@@ -800,7 +800,7 @@ tqdm==4.66.6
     # transformers
 tqdm-multiprocess==0.0.11
     # via lm-eval
-transformers==4.52.4
+transformers==4.53.2
     # via
     #   -r requirements/test.in
     #   genai-perf

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from importlib.util import find_spec
+
+import pytest
+import torch
+
+import vllm.envs as envs
+from vllm.compilation.collective_fusion import AllReduceFusionPass
+from vllm.config import (CompilationConfig, CompilationLevel, DeviceConfig,
+                         ModelConfig, PassConfig, VllmConfig)
+from vllm.distributed import tensor_model_parallel_all_reduce
+from vllm.distributed.parallel_state import (init_distributed_environment,
+                                             initialize_model_parallel)
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.platforms import current_platform
+from vllm.utils import update_environment_variables
+
+from ..utils import multi_gpu_test
+from .backend import TestBackend
+
+
+class TestAllReduceRMSNormModel(torch.nn.Module):
+
+    def __init__(self, hidden_size=16, eps=1e-6):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.eps = eps
+        self.norm = RMSNorm(hidden_size, eps)
+
+    def forward(self, hidden_states, residual):
+        view = hidden_states.reshape(-1, self.hidden_size)
+        all_reduce = tensor_model_parallel_all_reduce(view)
+        norm = self.norm(all_reduce)
+        return norm
+
+    def ops_in_model_before(self):
+        return [torch.ops.vllm.all_reduce.default]
+
+    def ops_in_model_after(self):
+        return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
+
+
+class TestAllReduceFusedAddRMSNormModel(torch.nn.Module):
+
+    def __init__(self, hidden_size=16, eps=1e-6):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.eps = eps
+        self.norm = RMSNorm(hidden_size, eps)
+
+    def forward(self, hidden_states, residual):
+        view = hidden_states.reshape(-1, self.hidden_size)
+        all_reduce = tensor_model_parallel_all_reduce(view)
+        norm, _ = self.norm(all_reduce, residual)
+        return norm
+
+    def ops_in_model_before(self):
+        return [torch.ops.vllm.all_reduce.default]
+
+    def ops_in_model_after(self):
+        return [torch.ops.vllm.flashinfer_trtllm_fused_allreduce_norm.default]
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "test_model",
+    [TestAllReduceRMSNormModel, TestAllReduceFusedAddRMSNormModel])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize("seq_len", [8])
+@pytest.mark.parametrize("hidden_size", [4096])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
+                    reason="Only test on CUDA")
+@pytest.mark.skipif(not find_spec("flashinfer"),
+                    reason="flashinfer is not installed")
+@pytest.mark.skipif(not current_platform.is_device_capability(100),
+                    reason="Only test on SM100")
+def test_all_reduce_fusion_pass_replace(test_model: torch.nn.Module,
+                                        batch_size: int, seq_len: int,
+                                        hidden_size: int, dtype: torch.dtype):
+    num_processes = 2
+
+    def run_torch_spawn(fn, nprocs):
+        torch.multiprocessing.spawn(fn,
+                                    args=(num_processes, test_model,
+                                          batch_size, seq_len, hidden_size,
+                                          dtype),
+                                    nprocs=nprocs)
+
+    run_torch_spawn(all_reduce_fusion_pass_on_test_model, num_processes)
+
+
+def all_reduce_fusion_pass_on_test_model(local_rank: int, world_size: int,
+                                         test_model_cls: torch.nn.Module,
+                                         batch_size: int, seq_len: int,
+                                         hidden_size: int, dtype: torch.dtype):
+    current_platform.seed_everything(0)
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+    torch.set_default_dtype(dtype)
+
+    update_environment_variables({
+        'RANK': str(local_rank),
+        'LOCAL_RANK': str(local_rank),
+        'WORLD_SIZE': str(world_size),
+        'MASTER_ADDR': 'localhost',
+        'MASTER_PORT': '12345',
+    })
+
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(level=CompilationLevel.PIECEWISE,
+                                             custom_ops=["+rms_norm"],
+                                             compile_sizes=[2, 4, 8]))
+    vllm_config.compilation_config.pass_config = PassConfig(
+        enable_fi_allreduce_fusion=True)
+    vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
+
+    # this is a fake model name to construct the model config
+    # in the vllm_config, it's not really used.
+    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
+    vllm_config.model_config = ModelConfig(model=model_name,
+                                           task="auto",
+                                           tokenizer=model_name,
+                                           tokenizer_mode="auto",
+                                           trust_remote_code=True,
+                                           dtype=dtype,
+                                           seed=42)
+
+    all_reduce_fusion_pass = AllReduceFusionPass(
+        vllm_config, vllm_config.compilation_config.pass_config.
+        fi_allreduce_fusion_max_token_num)
+    backend = TestBackend(all_reduce_fusion_pass)
+
+    model = test_model_cls(hidden_size)
+
+    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
+                                requires_grad=False)
+    residual = torch.randn((batch_size * seq_len, hidden_size),
+                           requires_grad=False)
+
+    compiled_model = torch.compile(model, backend=backend)
+    compiled_model(hidden_states, residual)
+
+    backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
+    backend.check_after_ops(model.ops_in_model_after())
+    del all_reduce_fusion_pass
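The two toy models above express the pattern the pass targets: a tensor-parallel all-reduce feeding a (fused-add) RMSNorm. As a rough reference for the numerics the fused op has to reproduce, here is a single-process sketch assuming the standard RMSNorm formulation, y = x / sqrt(mean(x^2) + eps) * weight; it is illustrative only and not the pass's actual implementation:

```python
# Illustrative reference only; assumes standard RMSNorm semantics.
from typing import Optional

import torch


def reference_allreduce_rmsnorm(per_rank_inputs: list,
                                weight: torch.Tensor,
                                residual: Optional[torch.Tensor] = None,
                                eps: float = 1e-6) -> torch.Tensor:
    # The all-reduce across ranks is just an elementwise sum of the inputs.
    x = torch.stack(per_rank_inputs).sum(dim=0)
    if residual is not None:
        # Fused-add variant: add the residual before normalizing.
        x = x + residual
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight


hidden_size = 16
per_rank = [torch.randn(4, hidden_size) for _ in range(2)]  # simulate 2 ranks
weight = torch.ones(hidden_size)
print(reference_allreduce_rmsnorm(per_rank, weight).shape)  # torch.Size([4, 16])
```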

tests/distributed/test_pynccl.py

Lines changed: 70 additions & 0 deletions
@@ -4,6 +4,7 @@
 import multiprocessing
 import os
 
+import numpy as np
 import pytest
 import torch
 import torch.distributed
@@ -177,6 +178,38 @@ def test_pynccl_all_gather():
     distributed_run(all_gather_worker_fn, 2)
 
 
+@worker_fn_wrapper
+def all_gatherv_worker_fn():
+    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                     device=get_world_group().device)
+
+    rank = pynccl_comm.rank
+    world_size = pynccl_comm.world_size
+    device = f'cuda:{pynccl_comm.rank}'
+
+    assert world_size <= 8
+    sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
+    num_elems = sizes[rank]
+    tensor = torch.arange(num_elems, dtype=torch.float32,
+                          device=device) + rank * 100
+    result = torch.zeros(sum(sizes), dtype=torch.float32, device=device)
+
+    expected = torch.cat([
+        torch.arange(sizes[r], dtype=torch.float32) + r * 100
+        for r in range(world_size)
+    ]).to(device)
+
+    pynccl_comm.all_gatherv(result, tensor, sizes=sizes)
+    torch.cuda.synchronize()
+    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+def test_pynccl_all_gatherv():
+    distributed_run(all_gatherv_worker_fn, 2)
+
+
 @worker_fn_wrapper
 def reduce_scatter_worker_fn():
     pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
@@ -214,6 +247,43 @@ def test_pynccl_reduce_scatter():
     distributed_run(reduce_scatter_worker_fn, 2)
 
 
+@worker_fn_wrapper
+def reduce_scatterv_worker_fn():
+    pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group,
+                                     device=get_world_group().device)
+
+    rank = pynccl_comm.rank
+    world_size = pynccl_comm.world_size
+    device = f'cuda:{pynccl_comm.rank}'
+
+    assert world_size <= 8
+    sizes = [81, 20, 57, 52, 81, 5, 49, 49][:world_size]
+    num_elems = sum(sizes)
+    tensor = torch.arange(num_elems, dtype=torch.float32,
+                          device=device) + rank * 100
+    result = torch.zeros(sizes[rank], dtype=torch.float32, device=device)
+
+    # Calculate expected result for this rank's chunk
+    all_tensors = [
+        torch.arange(num_elems, dtype=torch.float32) + r * 100
+        for r in range(world_size)
+    ]
+    sizes_cumsum = np.cumsum(sizes)
+    start = 0 if rank == 0 else sizes_cumsum[rank - 1]
+    end = sizes_cumsum[rank]
+    expected = sum(tensor[start:end] for tensor in all_tensors).to(device)
+
+    pynccl_comm.reduce_scatterv(result, tensor, sizes=sizes)
+    torch.cuda.synchronize()
+    torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+def test_pynccl_reduce_scatterv():
+    distributed_run(reduce_scatterv_worker_fn, 2)
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 def test_pynccl_with_cudagraph():
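The two new tests cover variable-size collectives: all_gatherv concatenates per-rank chunks of differing lengths on every rank, while reduce_scatterv sums all ranks' full buffers and hands each rank only its own chunk. A single-process sketch of those semantics, using the same arange-plus-offset inputs as the tests (illustrative only; the real communication runs through PyNcclCommunicator on GPUs):

```python
import numpy as np
import torch

sizes = [81, 20, 57]                      # per-rank chunk sizes (world_size = 3)
world_size = len(sizes)

# Per-rank inputs mirroring the tests: arange(...) + rank * 100
gather_inputs = [torch.arange(sizes[r], dtype=torch.float32) + r * 100
                 for r in range(world_size)]
scatter_inputs = [torch.arange(sum(sizes), dtype=torch.float32) + r * 100
                  for r in range(world_size)]

# all_gatherv: every rank ends up with the concatenation of all chunks.
all_gatherv_result = torch.cat(gather_inputs)

# reduce_scatterv: sum the full buffers, split by `sizes`, and rank r keeps
# only the r-th chunk.
summed = torch.stack(scatter_inputs).sum(dim=0)
offsets = np.concatenate(([0], np.cumsum(sizes)))
reduce_scatterv_chunks = [summed[offsets[r]:offsets[r + 1]]
                          for r in range(world_size)]

print(all_gatherv_result.shape)                    # torch.Size([158])
print([c.shape[0] for c in reduce_scatterv_chunks])  # [81, 20, 57]
```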

tests/models/language/pooling/mteb_utils.py

Lines changed: 3 additions & 2 deletions
@@ -268,7 +268,8 @@ def mteb_test_rerank_models(hf_runner,
                             model_info: RerankModelInfo,
                             vllm_extra_kwargs=None,
                             hf_model_callback=None,
-                            vllm_mteb_encoder=VllmMtebEncoder):
+                            vllm_mteb_encoder=VllmMtebEncoder,
+                            atol=MTEB_RERANK_TOL):
     if not model_info.enable_test:
         # A model family has many models with the same architecture,
         # and we don't need to test each one.
@@ -301,4 +302,4 @@ def mteb_test_rerank_models(hf_runner,
     print("SentenceTransformers:", st_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
-    assert st_main_score == pytest.approx(vllm_main_score, abs=MTEB_RERANK_TOL)
+    assert st_main_score == pytest.approx(vllm_main_score, abs=atol)

tests/models/language/pooling/test_qwen3_reranker.py

Lines changed: 27 additions & 0 deletions
@@ -6,6 +6,7 @@
 import torch
 
 from tests.conftest import HfRunner
+from tests.utils import multi_gpu_test
 
 from .mteb_utils import RerankModelInfo, mteb_test_rerank_models
 
@@ -87,3 +88,29 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
 
     mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info,
                             vllm_extra_kwargs)
+
+
+@pytest.mark.parametrize("model_info", RERANK_MODELS)
+@multi_gpu_test(num_gpus=2)
+def test_rerank_models_mteb_tp(vllm_runner,
+                               model_info: RerankModelInfo) -> None:
+
+    assert model_info.architecture == "Qwen3ForSequenceClassification"
+
+    vllm_extra_kwargs: dict[str, Any] = {
+        "hf_overrides": {
+            "architectures": ["Qwen3ForSequenceClassification"],
+            "classifier_from_token": ["no", "yes"],
+            "is_original_qwen3_reranker": True,
+        },
+        "tensor_parallel_size": 2,
+    }
+
+    if model_info.name == "Qwen/Qwen3-Reranker-4B":
+        vllm_extra_kwargs["max_num_seqs"] = 1
+
+    mteb_test_rerank_models(Qwen3RerankerHfRunner,
+                            vllm_runner,
+                            model_info,
+                            vllm_extra_kwargs,
+                            atol=1.2e-2)

tests/models/multimodal/generation/test_common.py

Lines changed: 2 additions & 2 deletions
@@ -318,6 +318,7 @@
         num_logprobs=10,
         image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
         auto_cls=AutoModelForImageTextToText,
+        marks=[large_gpu_mark(min_gb=32)],
     ),
     "glm4_1v-video": VLMTestInfo(
         models=["THUDM/GLM-4.1V-9B-Thinking"],
@@ -331,8 +332,7 @@
             inputs=custom_inputs.video_with_metadata_glm4_1v(),
             limit_mm_per_prompt={"video": 1},
         )],
-        # This is needed to run on machine with 24GB VRAM
-        vllm_runner_kwargs={"gpu_memory_utilization": 0.95},
+        marks=[large_gpu_mark(min_gb=32)],
     ),
     "h2ovl": VLMTestInfo(
         models = [

tests/models/multimodal/processing/test_common.py

Lines changed: 1 addition & 0 deletions
@@ -159,6 +159,7 @@ def _test_processing_correctness(
 _ADD_SPECIAL_TOKENS_OVERRIDES = {
     "mllama": False,
     "ovis": False,
+    "paligemma": False,
     "ultravox": False,
     "whisper": False,
 }
