
Commit 2e3f6f3

revert custom kernels tiling on device action
Signed-off-by: leo-pony <nengjunma@outlook.com>
1 parent a1b302e

File tree

7 files changed: +27 -124 lines changed


.github/workflows/vllm_ascend_test.yaml

Lines changed: 4 additions & 4 deletions
@@ -137,7 +137,7 @@ jobs:
       max-parallel: 2
       matrix:
         os: [linux-arm64-npu-1]
-        vllm_version: [v0.9.2]
+        vllm_version: [main, v0.9.2]
     name: singlecard e2e test
     runs-on: ${{ matrix.os }}
     container:
@@ -211,13 +211,13 @@ jobs:
           VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
 
   e2e-4-cards:
-    needs: [lint]
-    if: ${{ needs.lint.result == 'success' }}
+    needs: [e2e]
+    if: ${{ needs.e2e.result == 'success' }}
     strategy:
       max-parallel: 1
       matrix:
         os: [linux-arm64-npu-4]
-        vllm_version: [v0.9.2]
+        vllm_version: [main, v0.9.2]
     name: multicard e2e test
     runs-on: ${{ matrix.os }}
     container:

README.md

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
 - Software:
   * Python >= 3.9, < 3.12
   * CANN >= 8.1.RC1
-  * PyTorch >= 2.5.1, torch-npu >= 2.7.1rc1
+  * PyTorch >= 2.7.1, torch-npu >= 2.7.1rc1
   * vLLM (the same version as vllm-ascend)
 
 ## Getting Started

csrc/kernels/get_masked_input_and_mask_kernel.cpp

Lines changed: 11 additions & 8 deletions
@@ -325,12 +325,13 @@ extern "C" __global__ __aicore__ void get_masked_input_and_mask_kernel(
     const int64_t num_org_vocab_padding,
     const int64_t added_vocab_start_index,
     const int64_t added_vocab_end_index,
-    const int64_t size)
+    const int64_t size,
+    const uint32_t loop_cnt,
+    const uint32_t aiv_num)
 {
     {
         GetMaskedInputAndMask<int32_t> op{};
-        uint32_t aiv_num = AscendC::GetBlockNum();
-        uint32_t loop_cnt = (size + aiv_num - 1) / aiv_num;
+
         for (int64_t i = AscendC::GetBlockIdx(); i < loop_cnt; i += aiv_num) {
             op.Init(input + i * size/loop_cnt,
                     masked_input + i * size/loop_cnt,
@@ -356,11 +357,11 @@ void get_masked_input_and_mask_impl(
     const int64_t num_org_vocab_padding,
     const int64_t added_vocab_start_index,
     const int64_t added_vocab_end_index,
-    const int64_t size)
+    const int64_t size,
+    const uint32_t loop_cnt,
+    const uint32_t aiv_num)
 {
-    // block_dim only used for parameter check, execute parallel number is handled in get_masked_input_and_mask_kernel
-    const uint32_t block_dim = 8;
-    get_masked_input_and_mask_kernel<<<block_dim, nullptr, stream>>>(
+    get_masked_input_and_mask_kernel<<<aiv_num, nullptr, stream>>>(
         static_cast<int32_t*>(input),
         static_cast<int32_t*>(masked_input),
         static_cast<bool*>(mask_out),
@@ -369,7 +370,9 @@ void get_masked_input_and_mask_impl(
         num_org_vocab_padding,
         added_vocab_start_index,
         added_vocab_end_index,
-        size);
+        size,
+        loop_cnt,
+        aiv_num);
 }
 
 } // namespace vllm_ascend
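
After this revert the kernel no longer derives its own tiling from AscendC::GetBlockNum(); the caller must compute loop_cnt and aiv_num on the host and launch on aiv_num blocks. The sketch below is a minimal illustration of that contract, not code from this commit: query_aiv_core_num() is a hypothetical stand-in for whatever platform query the real caller uses, and the ceiling division is taken verbatim from the two device-side lines this commit deletes.

// Hypothetical host-side tiling sketch -- not part of this commit.
#include <cstdint>

uint32_t query_aiv_core_num();  // assumed: returns the usable AIV core count

struct MaskTiling {
    uint32_t loop_cnt;  // iterations each core performs
    uint32_t aiv_num;   // blocks the kernel is launched on (<<<aiv_num, ...>>>)
};

inline MaskTiling make_mask_tiling(int64_t size) {
    MaskTiling t;
    t.aiv_num = query_aiv_core_num();
    // Identical to the deleted device-side computation:
    // loop_cnt = ceil(size / aiv_num)
    t.loop_cnt = static_cast<uint32_t>((size + t.aiv_num - 1) / t.aiv_num);
    return t;
}

Both values are then forwarded as the new trailing arguments of get_masked_input_and_mask_impl (the leading buffer and vocab-index arguments are unchanged and elided here).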

csrc/kernels/pos_encoding_kernels.cpp

Lines changed: 5 additions & 4 deletions
@@ -311,7 +311,7 @@ template <typename scalar_t, bool isNeox> class RotaryEmbedding {
     __gm__ int64_t* positions, __gm__ void* queryDst, __gm__ void* keyDst, __gm__ TYPE* query, __gm__ TYPE* key, \
     __gm__ TYPE* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, \
     const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, \
-    const int headSize, const int64_t numTokens, const int coreNum) \
+    const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) \
 { \
     AscendC::TPipe pipe; \
     RotaryEmbedding<TYPE, NEOX> op{}; \
@@ -341,12 +341,12 @@ namespace vllm_ascend {
         rope_custom_true_##TYPE<<<blockDim, nullptr, stream>>>( \
             positions, queryDst, keyDst, reinterpret_cast<TYPE *>(query), reinterpret_cast<TYPE *>(key), \
             reinterpret_cast<TYPE *>(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, \
-            numHeads, numKvHeads, headSize, numTokens, blockDim); \
+            numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim); \
     else \
         rope_custom_false_##TYPE<<<blockDim, nullptr, stream>>>( \
             positions, queryDst, keyDst, reinterpret_cast<TYPE *>(query), reinterpret_cast<TYPE *>(key), \
             reinterpret_cast<TYPE *>(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, \
-            numHeads, numKvHeads, headSize, numTokens, blockDim);
+            numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim);
 
 // maximum number for runtime to launch a ascendc kernel.
 // we use this to constrain the maximum number of block size
@@ -356,7 +356,8 @@ extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, in
     void *keyDst, void *query, void *key, void *cosSinCache, const int rotDim,
     const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride,
     const int64_t dstKeyStride, const int numHeads, const int numKvHeads,
-    const int headSize, const int64_t numTokens)
+    const int headSize, const int64_t numTokens, const uint32_t loopCnt,
+    uint32_t aivNum)
 {
 
     int blockDim = maxParallelSize > numTokens ? numTokens : maxParallelSize;
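
The rotary-embedding path gets the same treatment: loopCnt and aivNum become arguments to rotary_embedding_impl instead of values derived inside the kernel, while blockDim stays min(maxParallelSize, numTokens). The visible hunks do not show how the caller actually computes loopCnt; the helper below is an assumption by analogy with the masked-input kernel above, shown only to illustrate the shape of the new contract.

// Hypothetical caller-side helper for rotary_embedding_impl -- the
// ceil-division formula is assumed, not taken from this commit.
#include <cstdint>

inline uint32_t rope_loop_cnt(int64_t numTokens, uint32_t aivNum) {
    // assumed: each AIV core covers ceil(numTokens / aivNum) token chunks
    return static_cast<uint32_t>((numTokens + aivNum - 1) / aivNum);
}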

csrc/ops.h

Lines changed: 5 additions & 2 deletions
@@ -28,7 +28,8 @@ namespace vllm_ascend {
     void *keyDst, void *query, void *key, void *cosSinCache, const int rotDim,
     const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride,
     const int64_t dstKeyStride, const int numHeads, const int numKvHeads,
-    const int headSize, const int64_t numTokens);
+    const int headSize, const int64_t numTokens, const uint32_t loopCnt,
+    uint32_t aivNum);
 
   extern void get_masked_input_and_mask_impl(
     void* stream,
@@ -40,7 +41,9 @@ namespace vllm_ascend {
     const int64_t num_org_vocab_padding,
     const int64_t added_vocab_start_index,
     const int64_t added_vocab_end_index,
-    const int64_t size);
+    const int64_t size,
+    const uint32_t loop_cnt,
+    const uint32_t aiv_num);
 
   torch::Tensor weak_ref_tensor(torch::Tensor& tensor) {
     if (!tensor.is_privateuseone()) {

docs/source/installation.md

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ This document describes how to install vllm-ascend manually.
 |---------------|----------------------------------|-------------------------------------------|
 | CANN          | >= 8.1.RC1                       | Required for vllm-ascend and torch-npu    |
 | torch-npu     | >= 2.7.1rc1                      | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps |
-| torch         | >= 2.5.1                         | Required for torch-npu and vllm           |
+| torch         | >= 2.7.1                         | Required for torch-npu and vllm           |
 
 You have 2 way to install:
 - **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip.

tests/ut/patch/worker/patch_common/test_patch_utils.py

Lines changed: 0 additions & 104 deletions
This file was deleted.
