
Commit 0c6ce45

Merge remote-tracking branch 'upstream/main'

2 parents: bb1f213 + 9254052

347 files changed: +8771 -3956 lines


.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 8 additions & 0 deletions
@@ -82,6 +82,14 @@ if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"*
   commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
 fi
 
+if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
+  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
+fi
+
+if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
+  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
+fi
+
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi

.buildkite/scripts/upload-wheels.sh

Lines changed: 1 addition & 0 deletions
@@ -75,3 +75,4 @@ else
 fi
 
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"

.buildkite/test-pipeline.yaml

Lines changed: 4 additions & 4 deletions
@@ -220,7 +220,6 @@ steps:
   - pytest -v -s v1/spec_decode
   - pytest -v -s v1/kv_connector/unit
   - pytest -v -s v1/test_serial_utils.py
-  - pytest -v -s v1/test_stats.py
   - pytest -v -s v1/test_utils.py
   - pytest -v -s v1/test_oracle.py
   # TODO: accuracy does not match, whether setting
@@ -317,6 +316,7 @@ steps:
   commands:
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
+  - pytest -v -s compile/test_silu_mul_quant_fusion.py
   - pytest -v -s compile/test_sequence_parallelism.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
@@ -467,7 +467,7 @@ steps:
 ##### models test #####
 
 - label: Basic Models Test # 24min
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
@@ -539,7 +539,7 @@ steps:
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
 
 - label: Multi-Modal Models Test (Extended) 3
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   optional: true
   source_file_dependencies:
   - vllm/
@@ -549,7 +549,7 @@ steps:
   - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 
 - label: Quantized Models Test
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/model_executor/layers/quantization
   - tests/models/quantization

csrc/activation_kernels.cu

Lines changed: 3 additions & 0 deletions
@@ -95,6 +95,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
   int64_t num_tokens = input.numel() / input.size(-1);              \
   dim3 grid(num_tokens);                                            \
   dim3 block(std::min(d, 1024));                                    \
+  if (num_tokens == 0) {                                            \
+    return;                                                         \
+  }                                                                 \
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();     \
   VLLM_DISPATCH_FLOATING_TYPES(                                     \
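
Note: the added guard returns before the launch because CUDA rejects a kernel launch whose grid dimension is zero, so an empty activation input must be short-circuited. A minimal standalone sketch of the failure mode, assuming a trivial hypothetical noop_kernel (not part of this commit):

// why_empty_guard.cu -- illustrative only; noop_kernel is a hypothetical stand-in.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void noop_kernel(float* out) { if (out) out[0] = 0.f; }

int main() {
  float* out = nullptr;
  cudaMalloc(&out, sizeof(float));

  int64_t num_tokens = 0;             // an empty input batch
  dim3 grid(num_tokens);              // grid.x == 0
  dim3 block(256);

  noop_kernel<<<grid, block>>>(out);  // expected to fail: grid dims must be >= 1
  printf("unguarded launch: %s\n", cudaGetErrorString(cudaGetLastError()));

  if (num_tokens == 0) {
    // guarded path, mirroring the macro change above: skip the launch entirely
  } else {
    noop_kernel<<<grid, block>>>(out);
  }
  cudaFree(out);
  return 0;
}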

csrc/attention/attention_kernels.cuh

Lines changed: 2 additions & 2 deletions
@@ -172,7 +172,7 @@ __device__ void paged_attention_kernel(
 
   // Load the query to registers.
   // Each thread in a thread group has a different part of the query.
-  // For example, if the the thread group size is 4, then the first thread in
+  // For example, if the thread group size is 4, then the first thread in
   // the group has 0, 4, 8, ... th vectors of the query, and the second thread
   // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
   // q is split from a qkv tensor, it may not be contiguous.
@@ -259,7 +259,7 @@ __device__ void paged_attention_kernel(
 
   // Load a key to registers.
   // Each thread in a thread group has a different part of the key.
-  // For example, if the the thread group size is 4, then the first thread in
+  // For example, if the thread group size is 4, then the first thread in
   // the group has 0, 4, 8, ... th vectors of the key, and the second thread
   // has 1, 5, 9, ... th vectors of the key, and so on.
   for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {

csrc/dispatch_utils.h

Lines changed: 14 additions & 0 deletions
@@ -65,5 +65,19 @@
   AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)    \
   AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
 
+#define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \
+  AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__)      \
+  AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__)        \
+  AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)       \
+  AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__)     \
+  AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__)
+
 #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
   AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
+
+#define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                              \
+      TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__))
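
Note: the new macro expands to an AT_DISPATCH_SWITCH over both signed and unsigned integer dtypes, binding scalar_t inside the lambda. A hedged usage sketch, not from this commit; sum_ids is a hypothetical helper and a CPU tensor is assumed for simplicity:

#include <torch/extension.h>
#include "dispatch_utils.h"

int64_t sum_ids(const torch::Tensor& topk_ids) {
  int64_t total = 0;
  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
      topk_ids.scalar_type(), "sum_ids", [&] {
        // scalar_t is int8_t/.../int64_t or uint16_t/uint32_t/uint64_t here,
        // so the same lambda body serves topk_ids tensors of any integral dtype.
        const scalar_t* data = topk_ids.data_ptr<scalar_t>();
        for (int64_t i = 0; i < topk_ids.numel(); ++i) {
          total += static_cast<int64_t>(data[i]);
        }
      });
  return total;
}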

csrc/moe/marlin_moe_wna16/marlin_template.h

Lines changed: 7 additions & 7 deletions
@@ -473,15 +473,15 @@ __global__ void Marlin(
     if (mul_topk_weights) {
       #pragma unroll
       for (int i = 0; i < 4; i++) {
+        int idx = tid4 * 4 + i;
+        idx = idx < block_num_valid_tokens ? idx : 0;
         if constexpr (w_type == vllm::kFE2M1f) {
-          sh_block_topk_weights[tid4 * 4 + i] = __hmul2(
-              global_scale,
-              Dtype::num2num2(Dtype::float2num(
-                  topk_weights_ptr[sh_block_sorted_ids[tid4 * 4 + i]])));
+          sh_block_topk_weights[idx] = __hmul2(
+              global_scale, Dtype::num2num2(Dtype::float2num(
+                                topk_weights_ptr[sh_block_sorted_ids[idx]])));
         } else {
-          sh_block_topk_weights[tid4 * 4 + i] =
-              Dtype::num2num2(Dtype::float2num(
-                  topk_weights_ptr[sh_block_sorted_ids[tid4 * 4 + i]]));
+          sh_block_topk_weights[idx] = Dtype::num2num2(
+              Dtype::float2num(topk_weights_ptr[sh_block_sorted_ids[idx]]));
         }
       }
     }
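
Note: the idx clamp keeps lanes whose slot lies beyond block_num_valid_tokens from indexing past sh_block_sorted_ids; they redundantly recompute and store the entry for slot 0 instead of touching out-of-range memory. A hedged standalone sketch of the same clamp-then-load pattern (gather_clamped is illustrative, not the kernel's code):

__global__ void gather_clamped(const float* weights, const int* sorted_ids,
                               float* out, int num_valid) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int idx = i < num_valid ? i : 0;   // out-of-range lanes fall back to slot 0
  // Every lane performs an in-bounds load and store; the extra lanes just
  // rewrite slot 0 with the value it already holds, which is benign.
  out[idx] = weights[sorted_ids[idx]];
}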

csrc/moe/moe_align_sum_kernels.cu

Lines changed: 4 additions & 4 deletions
@@ -326,7 +326,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   }
 
   if (use_global_memory) {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
           // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
           // tensors
@@ -351,7 +351,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               cumsum_buffer.data_ptr<int32_t>());
         });
   } else if (use_i16) {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
           // set dynamic shared mem
           auto kernel =
@@ -366,7 +366,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
              topk_ids.numel());
        });
   } else {
-    VLLM_DISPATCH_INTEGRAL_TYPES(
+    VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
          auto kernel =
              vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
@@ -391,7 +391,7 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   TORCH_CHECK(num_experts == 256,
               "sgl_moe_align_block_size kernel only supports deepseek v3.");
 
-  VLLM_DISPATCH_INTEGRAL_TYPES(
+  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
       topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
         // calc needed amount of shared mem for `cumsum` tensors
         auto options_int =

csrc/moe/topk_softmax_kernels.cu

Lines changed: 45 additions & 18 deletions
@@ -108,9 +108,17 @@ __launch_bounds__(TPB) __global__
     }
 }
 
-template <int TPB>
-__launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax, const bool* finished, float* output,
-    int* indices, int* source_rows, const int num_experts, const int k, const int start_expert, const int end_expert)
+template <int TPB, typename IndType>
+__launch_bounds__(TPB) __global__ void moeTopK(
+    const float* inputs_after_softmax,
+    const bool* finished,
+    float* output,
+    IndType* indices,
+    int* source_rows,
+    const int num_experts,
+    const int k,
+    const int start_expert,
+    const int end_expert)
 {
 
     using cub_kvp = cub::KeyValuePair<int, float>;
@@ -182,9 +190,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax
   2) This implementation assumes k is small, but will work for any k.
 */
 
-template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG>
+template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG, typename IndType>
 __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
-    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, int* indices,
+    void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices,
         int* source_rows, const int k, const int start_expert, const int end_expert)
 {
     // We begin by enforcing compile time assertions and setting up compile time constants.
@@ -397,8 +405,8 @@ struct TopkConstants
 };
 } // namespace detail
 
-template <int EXPERTS, int WARPS_PER_TB>
-void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, int* indices,
+template <int EXPERTS, int WARPS_PER_TB, typename IndType>
+void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
     int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
 {
     static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
@@ -421,10 +429,11 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
         token_expert_indices, num_tokens, topk, 0, num_experts, \
         stream);
 
+template <typename IndType>
 void topkGatingSoftmaxKernelLauncher(
     const float* gating_output,
     float* topk_weights,
-    int* topk_indicies,
+    IndType* topk_indicies,
     int* token_expert_indices,
     float* softmax_workspace,
     const int num_tokens,
@@ -493,14 +502,32 @@ void topk_softmax(
     const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
     const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
     torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
-    vllm::moe::topkGatingSoftmaxKernelLauncher(
-        gating_output.data_ptr<float>(),
-        topk_weights.data_ptr<float>(),
-        topk_indices.data_ptr<int>(),
-        token_expert_indices.data_ptr<int>(),
-        softmax_workspace.data_ptr<float>(),
-        num_tokens,
-        num_experts,
-        topk,
-        stream);
+
+    if(topk_indices.scalar_type() == at::ScalarType::Int)
+    {
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<int>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
+    else
+    {
+        assert(topk_indices.scalar_type() == at::ScalarType::UInt32);
+        vllm::moe::topkGatingSoftmaxKernelLauncher(
+            gating_output.data_ptr<float>(),
+            topk_weights.data_ptr<float>(),
+            topk_indices.data_ptr<uint32_t>(),
+            token_expert_indices.data_ptr<int>(),
+            softmax_workspace.data_ptr<float>(),
+            num_tokens,
+            num_experts,
+            topk,
+            stream);
+    }
 }
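
Note: the kernels and launcher above become templates over the index type, and topk_softmax selects the instantiation matching topk_indices.scalar_type() (int32 or uint32). A hedged sketch of that pattern in isolation; write_index and launch_write_index are hypothetical names, not the file's code:

#include <cstdint>
#include <cuda_runtime.h>

template <typename IndType>
__global__ void write_index(IndType* indices, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) indices[i] = static_cast<IndType>(i);  // stand-in for the real top-k selection
}

template <typename IndType>
void launch_write_index(IndType* indices, int n, cudaStream_t stream) {
  const int block = 256;
  const int grid = (n + block - 1) / block;
  write_index<IndType><<<grid, block, 0, stream>>>(indices, n);
}

// Explicit instantiations mirroring the two index dtypes topk_softmax now accepts:
template void launch_write_index<int>(int*, int, cudaStream_t);
template void launch_write_index<uint32_t>(uint32_t*, int, cudaStream_t);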
