diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml
index d3622dce15..19ab36a025 100644
--- a/.github/workflows/vllm_ascend_test.yaml
+++ b/.github/workflows/vllm_ascend_test.yaml
@@ -112,7 +112,7 @@ jobs:
             # pytest -sv tests/singlecard/tets_schedule.py
             # guided decoding doesn't work, fix it later
             # pytest -sv tests/singlecard/test_guided_decoding.py.py
-            pytest -sv tests/singlecard/ --ignore=tests/singlecard/test_offline_inference.py --ignore=tests/singlecard/test_scheduler.py --ignore=tests/singlecard/test_guided_decoding.py
+            pytest -sv tests/singlecard/ --ignore=tests/singlecard/test_offline_inference.py --ignore=tests/singlecard/test_guided_decoding.py
           else
             pytest -sv tests/multicard/test_ilama_lora_tp2.py
             VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py
diff --git a/tests/singlecard/test_scheduler.py b/tests/singlecard/test_scheduler.py
index 6eddd4f2ca..254f20b9d9 100644
--- a/tests/singlecard/test_scheduler.py
+++ b/tests/singlecard/test_scheduler.py
@@ -30,6 +30,7 @@
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
 
+from tests.conftest import VllmRunner
 from vllm_ascend.core.scheduler import AscendScheduler
 
 EOS_TOKEN_ID = 50256
@@ -394,3 +395,27 @@ def test_stop_via_update_from_output():
     assert len(scheduler.running) == 1
     assert not requests[0].is_finished()
     assert list(requests[0].output_token_ids) == [EOS_TOKEN_ID, 10, 11]
+
+MODELS = [
+    "Qwen/Qwen3-0.6B-Base",
+]
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float16"])
+@pytest.mark.parametrize("max_tokens", [5])
+def test_models(model: str, dtype: str, max_tokens: int) -> None:
+    # Build a long prompt (a comma-separated run of 1024 numbers) so the
+    # request spans many KV-cache blocks and exercises the AscendScheduler
+    # prefill path rather than a trivial single-block allocation.
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    with VllmRunner(model,
+                    max_model_len=8192,
+                    dtype=dtype,
+                    enforce_eager=True,
+                    gpu_memory_utilization=0.7,
+                    enable_prefix_caching=False,
+                    additional_config={'ascend_scheduler_config': {}}) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py
index 122f7b9654..bc69bad26c 100644
--- a/vllm_ascend/core/scheduler.py
+++ b/vllm_ascend/core/scheduler.py
@@ -130,14 +130,15 @@ def skip_cur_request():
 
             assert num_new_tokens > 0
             watermark = getattr(self.scheduler_config, "watermark", 0.01)
-            if not self._check_watermark_for_prefill(
-                    request, num_new_tokens, computed_blocks, watermark):
+            if not self._check_watermark_for_prefill(request, num_new_tokens,
+                                                     computed_blocks.blocks,
+                                                     watermark):
                 # Scheduling would exceed watermark, skip.
                 skip_cur_request()
                 continue
 
             new_blocks = self.kv_cache_manager.allocate_slots(
-                request, num_new_tokens, computed_blocks)
+                request, num_new_tokens, new_computed_blocks=computed_blocks)
             if new_blocks is None:
                 # The request cannot be scheduled.
                 break
@@ -155,9 +156,8 @@ def skip_cur_request():
 
             if self.lora_config and request.lora_request:
                 scheduled_loras.add(request.lora_request.lora_int_id)
-            req_to_new_block_ids[request.request_id] = [
-                b.block_id for b in computed_blocks + new_blocks
-            ]
+            req_to_new_block_ids[request.request_id] = (
+                self.kv_cache_manager.get_block_ids(request.request_id))
             # Update request info.
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
@@ -215,9 +215,8 @@ def skip_cur_request():
             # Schedule the request.
             scheduled_running_reqs.append(request)
             self.scheduled_req_ids.add(request.request_id)
-            req_to_new_block_ids[request.request_id] = [
-                b.block_id for b in new_blocks
-            ]
+            req_to_new_block_ids[request.request_id] = (
+                new_blocks.get_block_ids())
             num_scheduled_tokens[request.request_id] = num_new_tokens
             token_budget -= num_new_tokens
             req_index += 1
@@ -326,7 +325,8 @@ def _check_watermark_for_prefill(self,
                                len(computed_blocks) * self.block_size)
         num_required_blocks = cdiv(num_new_tokens + num_computed_tokens,
                                    self.block_size)
-        req_blocks = self.kv_cache_manager.req_to_blocks[request.request_id]
+        req_blocks = self.kv_cache_manager.single_type_manager.req_to_blocks[
+            request.request_id]
         num_new_blocks = (num_required_blocks - len(req_blocks) -
                           len(computed_blocks))
         num_evictable_computed_blocks = sum(1 for blk in computed_blocks
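Editor's note, not part of the patch: the scheduler hunks above stop hand-building block-id lists (b.block_id for b in computed_blocks + new_blocks) and instead ask the KV cache manager and the object returned by allocate_slots() for them (get_block_ids, computed_blocks.blocks, new_computed_blocks=...). The self-contained Python sketch below only illustrates that bookkeeping shape; ToyBlocks and ToyKVCacheManager are invented stand-ins, not vLLM or vllm-ascend APIs.

# Minimal sketch of the block-id bookkeeping pattern the diff adapts to.
# ToyBlocks / ToyKVCacheManager are hypothetical stand-ins, not real vLLM types.
from dataclasses import dataclass, field


@dataclass
class ToyBlocks:
    """Stand-in for the container returned by the prefix lookup / allocate_slots()."""
    blocks: list  # underlying blocks; plain ints serve as block ids here

    def get_block_ids(self):
        # New-style accessor: the container reports its own block ids.
        return list(self.blocks)


@dataclass
class ToyKVCacheManager:
    """Stand-in manager that owns the per-request block lists."""
    req_to_blocks: dict = field(default_factory=dict)

    def allocate_slots(self, request_id, num_new_blocks, new_computed_blocks):
        # Record the cached (computed) blocks plus freshly allocated ones.
        start = len(self.req_to_blocks.get(request_id, [])) + 100
        new = ToyBlocks(blocks=list(range(start, start + num_new_blocks)))
        self.req_to_blocks.setdefault(request_id, [])
        self.req_to_blocks[request_id] += new_computed_blocks.blocks + new.blocks
        return new

    def get_block_ids(self, request_id):
        # The manager, not the scheduler, holds the authoritative block list.
        return list(self.req_to_blocks[request_id])


manager = ToyKVCacheManager()
cached = ToyBlocks(blocks=[1, 2])  # simulated prefix-cache hit
fresh = manager.allocate_slots("req-0", 3, new_computed_blocks=cached)

# Old pattern removed by the diff: concatenate the two lists by hand.
# New pattern added by the diff: query the manager / the returned object.
assert manager.get_block_ids("req-0") == cached.get_block_ids() + fresh.get_block_ids()
print(manager.get_block_ids("req-0"))  # [1, 2, 100, 101, 102]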