fix: ascend_scheduler adapt v0.9.0 #1018

Closed · wants to merge 1 commit
.github/workflows/vllm_ascend_test.yaml (1 addition, 1 deletion)

@@ -112,7 +112,7 @@ jobs:
   # pytest -sv tests/singlecard/test_scheduler.py
   # guided decoding doesn't work, fix it later
   # pytest -sv tests/singlecard/test_guided_decoding.py
-  pytest -sv tests/singlecard/ --ignore=tests/singlecard/test_offline_inference.py --ignore=tests/singlecard/test_scheduler.py --ignore=tests/singlecard/test_guided_decoding.py
+  pytest -sv tests/singlecard/ --ignore=tests/singlecard/test_offline_inference.py --ignore=tests/singlecard/test_guided_decoding.py
 else
   pytest -sv tests/multicard/test_ilama_lora_tp2.py
   VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py
tests/singlecard/test_scheduler.py (25 additions, 0 deletions)

@@ -30,6 +30,7 @@
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager

+from tests.conftest import VllmRunner
 from vllm_ascend.core.scheduler import AscendScheduler

 EOS_TOKEN_ID = 50256
@@ -394,3 +395,27 @@ def test_stop_via_update_from_output():
     assert len(scheduler.running) == 1
     assert not requests[0].is_finished()
     assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID, 10, 11]
+
+MODELS = [
+    "Qwen/Qwen3-0.6B-Base",
+]
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float16"])
+@pytest.mark.parametrize("max_tokens", [5])
+def test_models(model: str, dtype: str, max_tokens: int) -> None:
+    # Use a long prompt (about 1024 comma-separated numbers) so the run
+    # schedules a multi-block prefill through the Ascend scheduler rather
+    # than a trivial short-prompt decode.
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    with VllmRunner(model,
+                    max_model_len=8192,
+                    dtype=dtype,
+                    enforce_eager=True,
+                    gpu_memory_utilization=0.7,
+                    enable_prefix_caching=False,
+                    additional_config={'ascend_scheduler_config': {}}) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
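The new test drives the Ascend scheduler through vllm's additional_config passthrough. For reference, here is a minimal standalone sketch of the same switch, assuming vllm plus vllm-ascend are installed on an NPU host (the model name and limits simply mirror the test above, and the empty config dict keeps the scheduler's defaults):

    # Sketch: enable the Ascend scheduler outside the test harness.
    # Assumes a working vllm + vllm-ascend install; values mirror the test.
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="Qwen/Qwen3-0.6B-Base",
        max_model_len=8192,
        enforce_eager=True,
        enable_prefix_caching=False,
        # Same key the test passes; options, if any, would go inside the dict.
        additional_config={"ascend_scheduler_config": {}},
    )
    out = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=5))
    print(out[0].outputs[0].text)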
vllm_ascend/core/scheduler.py (10 additions, 10 deletions)

@@ -130,14 +130,15 @@ def skip_cur_request():

         assert num_new_tokens > 0
         watermark = getattr(self.scheduler_config, "watermark", 0.01)
-        if not self._check_watermark_for_prefill(
-                request, num_new_tokens, computed_blocks, watermark):
+        if not self._check_watermark_for_prefill(request, num_new_tokens,
+                                                 computed_blocks.blocks,
+                                                 watermark):
             # Scheduling would exceed watermark, skip.
             skip_cur_request()
             continue
 
         new_blocks = self.kv_cache_manager.allocate_slots(
-            request, num_new_tokens, computed_blocks)
+            request, num_new_tokens, new_computed_blocks=computed_blocks)
         if new_blocks is None:
             # The request cannot be scheduled.
             break
@@ -155,9 +156,8 @@

         if self.lora_config and request.lora_request:
             scheduled_loras.add(request.lora_request.lora_int_id)
-        req_to_new_block_ids[request.request_id] = [
-            b.block_id for b in computed_blocks + new_blocks
-        ]
+        req_to_new_block_ids[request.request_id] = (
+            self.kv_cache_manager.get_block_ids(request.request_id))
         # Update request info.
         num_scheduled_tokens[request.request_id] = num_new_tokens
         token_budget -= num_new_tokens
@@ -215,9 +215,8 @@ def skip_cur_request():
             # Schedule the request.
             scheduled_running_reqs.append(request)
             self.scheduled_req_ids.add(request.request_id)
-            req_to_new_block_ids[request.request_id] = [
-                b.block_id for b in new_blocks
-            ]
+            req_to_new_block_ids[request.request_id] = (
+                new_blocks.get_block_ids())
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
             req_index += 1
@@ -326,7 +325,8 @@ def _check_watermark_for_prefill(self,
                                len(computed_blocks) * self.block_size)
         num_required_blocks = cdiv(num_new_tokens + num_computed_tokens,
                                    self.block_size)
-        req_blocks = self.kv_cache_manager.req_to_blocks[request.request_id]
+        req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[
+            request.request_id]
         num_new_blocks = (num_required_blocks - len(req_blocks) -
                           len(computed_blocks))
         num_evictable_computed_blocks = sum(1 for blk in computed_blocks
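Taken together, the scheduler edits track three v0.9.0 API moves: the computed-blocks lookup now returns a wrapper object (hence computed_blocks.blocks and the new_computed_blocks= keyword), block ids come from helper methods rather than hand-rolled comprehensions over .block_id, and req_to_blocks has moved under single_type_manager. A condensed before/after sketch, using only the names visible in the diff above (schematic, not a verbatim copy of either version):

    # Pre-v0.9.0 (removed lines): plain block lists, ids collected by hand.
    #   self._check_watermark_for_prefill(request, n, computed_blocks, watermark)
    #   new_blocks = self.kv_cache_manager.allocate_slots(request, n, computed_blocks)
    #   ids = [b.block_id for b in computed_blocks + new_blocks]
    #   req_blocks = self.kv_cache_manager.req_to_blocks[request.request_id]

    # v0.9.0 (added lines): wrapper object plus id helpers.
    def _schedule_prefill_sketch(self, request, computed_blocks, num_new_tokens,
                                 watermark):
        # The watermark check now takes the wrapper's raw block list...
        if not self._check_watermark_for_prefill(request, num_new_tokens,
                                                 computed_blocks.blocks, watermark):
            return None  # would dip below the watermark; skip this request
        # ...while allocate_slots() receives the wrapper via a keyword argument.
        new_blocks = self.kv_cache_manager.allocate_slots(
            request, num_new_tokens, new_computed_blocks=computed_blocks)
        if new_blocks is None:
            return None  # no free blocks; cannot schedule this step
        # Helper methods replace the manual block_id comprehension.
        return self.kv_cache_manager.get_block_ids(request.request_id)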