Skip to content

Commit a3a6c69

Browse files
authored
[Misc] Qwen MoE model supports LoRA (#20932)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
1 parent 90bd2ab commit a3a6c69

File tree

4 files changed

+20
-8
lines changed

4 files changed

+20
-8
lines changed

docs/models/supported_models.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -380,9 +380,9 @@ Specified using `--task generate`.
380380
| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | | |
381381
| `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
382382
| `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
383-
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | | ✅︎ | ✅︎ |
383+
| `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
384384
| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B`, etc. | ✅︎ | ✅︎ | ✅︎ |
385-
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | | ✅︎ | ✅︎ |
385+
| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B`, etc. | ✅︎ | ✅︎ | ✅︎ |
386386
| `StableLmForCausalLM` | StableLM | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. | | | ✅︎ |
387387
| `Starcoder2ForCausalLM` | Starcoder2 | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. | | ✅︎ | ✅︎ |
388388
| `SolarForCausalLM` | Solar Pro | `upstage/solar-pro-preview-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |

vllm/lora/models.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
get_supported_lora_modules,
3030
is_regex_target_modules,
3131
parse_fine_tuned_lora_name, replace_submodule)
32+
from vllm.model_executor.layers.fused_moe import FusedMoE
3233
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
3334
from vllm.model_executor.models import SupportsLoRA, supports_multimodal
3435
from vllm.model_executor.models.interfaces import is_pooling_model
@@ -60,6 +61,17 @@ def get_lora_id():
6061
return _GLOBAL_LORA_ID
6162

6263

64+
def is_moe_model(model: nn.Module) -> bool:
65+
"""Checks if the model contains FusedMoE layers and warns the user."""
66+
if any(isinstance(module, FusedMoE) for module in model.modules()):
67+
logger.warning_once(
68+
"For MoE models, vLLM currently does not support fused MoE LoRA "
69+
"inference. Please ensure that the loaded LoRA model does not "
70+
"contain expert weights.")
71+
return True
72+
return False
73+
74+
6375
class LoRAModel(AdapterModel):
6476
"""A LoRA fine-tuned model."""
6577

@@ -375,6 +387,7 @@ def __init__(
375387
# text modules (e.g. ChatGLM)
376388
and hasattr(self.model, "get_mm_mapping"))
377389
self.is_pooling_model = is_pooling_model(self.model)
390+
self.is_moe_model = is_moe_model(self.model)
378391
self.packed_modules: dict[str, list[str]] = {}
379392
self.modules: dict[str, BaseLayerWithLoRA] = {}
380393
# Dict instead of a set for compatibility with LRUCache.

vllm/model_executor/models/qwen2_moe.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
from vllm.model_executor.sampling_metadata import SamplingMetadata
5454
from vllm.sequence import IntermediateTensors
5555

56-
from .interfaces import SupportsPP
56+
from .interfaces import SupportsLoRA, SupportsPP
5757
from .utils import (AutoWeightsLoader, extract_layer_index,
5858
is_pp_missing_parameter,
5959
make_empty_intermediate_tensors_factory, make_layers,
@@ -448,8 +448,7 @@ def load_weights(self, weights: Iterable[tuple[str,
448448
if weight_name not in name:
449449
continue
450450
name = name.replace(weight_name, param_name)
451-
if "layers.13.mlp.experts.w2_weight" in name:
452-
pass
451+
453452
# Skip layers on other devices.
454453
if is_pp_missing_parameter(name, self):
455454
continue
@@ -494,7 +493,7 @@ def load_weights(self, weights: Iterable[tuple[str,
494493
return loaded_params
495494

496495

497-
class Qwen2MoeForCausalLM(nn.Module, SupportsPP):
496+
class Qwen2MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
498497

499498
fall_back_to_pt_during_load = False
500499
packed_modules_mapping = {

vllm/model_executor/models/qwen3_moe.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@
5050
from vllm.model_executor.sampling_metadata import SamplingMetadata
5151
from vllm.sequence import IntermediateTensors
5252

53-
from .interfaces import SupportsPP
53+
from .interfaces import SupportsLoRA, SupportsPP
5454
from .utils import (AutoWeightsLoader, extract_layer_index,
5555
is_pp_missing_parameter,
5656
make_empty_intermediate_tensors_factory, make_layers,
@@ -482,7 +482,7 @@ def load_weights(self, weights: Iterable[tuple[str,
482482
return loaded_params
483483

484484

485-
class Qwen3MoeForCausalLM(nn.Module, SupportsPP):
485+
class Qwen3MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA):
486486
packed_modules_mapping = {
487487
"qkv_proj": [
488488
"q_proj",

0 commit comments

Comments (0)