 from vllm.v1.sample.sampler import Sampler
 from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
+from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
 from vllm_ascend.attention.attention import AttentionMaskBuilder
 from vllm_ascend.attention.attention_v1 import AscendAttentionState
@@ -102,7 +103,7 @@ def graph_capture(device: torch.device):
         yield graph_capture_context
 
 
-class NPUModelRunner:
+class NPUModelRunner(LoRAModelRunnerMixin):
 
     def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.vllm_config = vllm_config
@@ -507,6 +508,10 @@ def _process_reqs(
             max_num_scheduled_tokens = max(max_num_scheduled_tokens,
                                            num_tokens)
 
+        # Hot-swap the active LoRA adapters for this batch.
+        if self.lora_config:
+            self.set_active_loras(self.input_batch, num_scheduled_tokens)
+
         # Prepare positions
         req_indices = np.repeat(self.arange_np[:num_reqs],
                                 num_scheduled_tokens)
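`set_active_loras` is inherited from vLLM's `LoRAModelRunnerMixin`. Conceptually, it turns the per-request LoRA assignment held in the input batch into a per-token mapping before activating the adapters for this step. A rough, illustrative sketch of that expansion only (the helper and the `lora_ids` array below are hypothetical, not the mixin's actual code):

```python
import numpy as np

def expand_lora_ids(lora_ids: np.ndarray,
                    num_scheduled_tokens: np.ndarray) -> np.ndarray:
    """Illustrative only: map one LoRA id per request to one id per token."""
    # lora_ids:             [num_reqs], with 0 meaning "no LoRA" by convention.
    # num_scheduled_tokens: [num_reqs], tokens scheduled for each request.
    # Each request's id is repeated once per scheduled token, so the result
    # lines up with the flattened token batch the runner operates on.
    return np.repeat(lora_ids, num_scheduled_tokens)

# 3 requests; only the second uses adapter 2 and is scheduled for 2 tokens.
print(expand_lora_ids(np.array([0, 2, 0]), np.array([4, 2, 3])))
# -> [0 0 0 0 2 2 0 0 0]
```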
@@ -833,39 +838,55 @@ def _profile_multimodal(self) -> None:
 
     @torch.inference_mode()
     def _dummy_run(self, num_tokens: int) -> torch.Tensor:
-        model = self.model
-        if self.is_multimodal_model:
-            input_ids = None
-            inputs_embeds = self.inputs_embeds[:num_tokens]
-        else:
-            input_ids = self.input_ids[:num_tokens]
-            inputs_embeds = None
+        # Set num_scheduled_tokens based on num_tokens and max_num_seqs
+        # for the dummy run with LoRA, so that the num_reqs requests
+        # collectively cover num_tokens tokens in total.
+        assert num_tokens <= self.scheduler_config.max_num_batched_tokens
+        max_num_reqs = self.scheduler_config.max_num_seqs
+        num_reqs = max_num_reqs if num_tokens >= max_num_reqs else num_tokens
+        min_tokens_per_req = num_tokens // num_reqs
+        num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
+        num_scheduled_tokens_list[-1] += num_tokens % num_reqs
+        assert sum(num_scheduled_tokens_list) == num_tokens
+        assert len(num_scheduled_tokens_list) == num_reqs
+        num_scheduled_tokens = np.array(num_scheduled_tokens_list,
+                                        dtype=np.int32)
+        with self.maybe_dummy_run_with_lora(self.lora_config,
+                                            num_scheduled_tokens):
+            model = self.model
+            if self.is_multimodal_model:
+                input_ids = None
+                inputs_embeds = self.inputs_embeds[:num_tokens]
+            else:
+                input_ids = self.input_ids[:num_tokens]
+                inputs_embeds = None
 
-        if self.uses_mrope:
-            positions = self.mrope_positions[:, :num_tokens]
-        else:
-            positions = self.positions[:num_tokens]
+            if self.uses_mrope:
+                positions = self.mrope_positions[:, :num_tokens]
+            else:
+                positions = self.positions[:num_tokens]
 
-        if get_pp_group().is_first_rank:
-            intermediate_tensors = None
-        else:
-            if self.intermediate_tensors is None:
-                self.intermediate_tensors = (
-                    self.model.make_empty_intermediate_tensors(
-                        batch_size=num_tokens,
-                        dtype=self.dtype,
-                        device=self.device))
-            intermediate_tensors = IntermediateTensors({
-                k: v[:num_tokens]
-                for k, v in self.intermediate_tensors.items()
-            })
-
-        with set_forward_context(None, self.vllm_config):
-            hidden_states = model(input_ids=input_ids,
-                                  positions=positions,
-                                  intermediate_tensors=intermediate_tensors,
-                                  inputs_embeds=inputs_embeds)
-        return hidden_states
+            if get_pp_group().is_first_rank:
+                intermediate_tensors = None
+            else:
+                if self.intermediate_tensors is None:
+                    self.intermediate_tensors = (
+                        self.model.make_empty_intermediate_tensors(
+                            batch_size=num_tokens,
+                            dtype=self.dtype,
+                            device=self.device))
+                intermediate_tensors = IntermediateTensors({
+                    k: v[:num_tokens]
+                    for k, v in self.intermediate_tensors.items()
+                })
+
+            with set_forward_context(None, self.vllm_config):
+                hidden_states = model(
+                    input_ids=input_ids,
+                    positions=positions,
+                    intermediate_tensors=intermediate_tensors,
+                    inputs_embeds=inputs_embeds)
+            return hidden_states
 
     def profile_run(self) -> None:
         # Profile with multimodal encoder & encoder cache.
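The reworked `_dummy_run` spreads `num_tokens` over at most `max_num_seqs` dummy requests, so the LoRA warm-up sees a realistic batch shape instead of one giant request. The split is plain integer division with the remainder folded into the last request; a standalone sketch of just that arithmetic (function name and example values are made up for illustration):

```python
import numpy as np

def split_dummy_tokens(num_tokens: int, max_num_seqs: int) -> np.ndarray:
    """Illustrative only: split num_tokens over at most max_num_seqs requests."""
    num_reqs = min(num_tokens, max_num_seqs)
    per_req = [num_tokens // num_reqs] * num_reqs
    per_req[-1] += num_tokens % num_reqs  # remainder goes to the last request
    assert sum(per_req) == num_tokens
    return np.array(per_req, dtype=np.int32)

# e.g. 10 tokens over at most 4 requests -> [2 2 2 4]
print(split_dummy_tokens(10, 4))
```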
@@ -914,7 +935,11 @@ def load_model(self) -> None:
         with DeviceMemoryProfiler() as m:  # noqa: SIM117
             self.model = get_model(vllm_config=self.vllm_config)
             if self.lora_config:
-                raise ValueError("LoRA model is not supported on NPU now.")
+                self.model = self.load_lora_model(self.model,
+                                                  self.model_config,
+                                                  self.scheduler_config,
+                                                  self.lora_config,
+                                                  self.device)
         logger.info("Loading model weights took %.4f GB",
                     m.consumed_memory / float(2**30))
 
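With `load_lora_model` wired in instead of the old `ValueError`, LoRA adapters should be loadable on NPU through the usual vLLM entry points. A hedged usage sketch (model name and adapter path are placeholders; actual behavior on NPU depends on this change landing in vllm-ascend):

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Placeholders: substitute a real base model and a local LoRA adapter path.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", enable_lora=True)

outputs = llm.generate(
    "Give a one-sentence summary of LoRA.",
    SamplingParams(temperature=0.0, max_tokens=64),
    # LoRARequest(adapter name, unique integer id, adapter path)
    lora_request=LoRARequest("my_adapter", 1, "/path/to/lora_adapter"),
)
print(outputs[0].outputs[0].text)
```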