Skip to content

Commit 11d3679

Browse files
authored
feat: add qwen3 (#117)
* merge * merge * add Mistral-Small-3.1-24B-Instruct-2503 * modify qwq-32b deploy * add txgemma model; * modify model list command * fix typo * add some ecs parameters * add glm4-z1 models * modify vllm backend * add qwen3 * fix cli bugs * fix * add deepseek r1/Qwen3-235B-A22B
1 parent 09e769c commit 11d3679

File tree

4 files changed

+82
-1
lines changed

4 files changed

+82
-1
lines changed

src/emd/commands/deploy.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,7 @@ def deploy(
399399
else:
400400
gpu_num = get_gpu_num()
401401
support_gpu_num = model.supported_instances[0].gpu_num
402+
support_gpu_num = support_gpu_num or gpu_num
402403
default_gpus_str = ",".join([str(i) for i in range(min(gpu_num,support_gpu_num))])
403404
gpus_to_deploy = questionary.text(
404405
"input the local gpu ids to deploy the model (e.g. 0,1,2):",

src/emd/models/engines.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,11 @@ class KtransformersEngine(OpenAICompitableEngine):
108108

109109
vllm_deepseek_r1_distill_llama_engine071 = vllm_deepseek_r1_distill_qwen_engine071
110110

111+
vllm_deepseek_r1_engine084 = VllmEngine(**{
112+
**vllm_engine064.model_dump(),
113+
"engine_dockerfile_config": {"VERSION":"v0.8.4"},
114+
"default_cli_args": "--max_num_seq 10 --max_model_len 16000 --chat-template emd/models/chat_templates/deepseek_r1.jinja"
115+
})
111116

112117
vllm_qwen2d5_72b_engine064 = VllmEngine(**{
113118
**vllm_engine064.model_dump(),
@@ -165,7 +170,7 @@ class KtransformersEngine(OpenAICompitableEngine):
165170
"engine_dockerfile_config": {"VERSION":"v0.8.4"},
166171
"dockerfile_name":"Dockerfile_qwen25_vl",
167172
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
168-
"default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.9"
173+
"default_cli_args": " --max_model_len 32000 --disable-log-stats --limit-mm-per-prompt image=1,video=1 --max_num_seq 1 --gpu_memory_utilization 0.7"
169174
})
170175

171176
vllm_qwq_engine073 = VllmEngine(**{

src/emd/models/llms/deepseek.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
llama_cpp_deepseek_r1_distill_engineb9ab0a4,
88
tgi_deepseek_r1_llama_70b_engine301,
99
ktransformers_engine,
10+
vllm_deepseek_r1_engine084
1011
)
1112
from ..services import (
1213
sagemaker_service,
@@ -450,6 +451,31 @@
450451
)
451452
)
452453

454+
Model.register(
455+
dict(
456+
model_id = "DeepSeek-R1",
457+
supported_engines=[vllm_deepseek_r1_engine084],
458+
supported_instances=[
459+
local_instance
460+
],
461+
supported_services=[
462+
local_service
463+
],
464+
supported_frameworks=[
465+
fastapi_framework
466+
],
467+
allow_china_region=True,
468+
need_prepare_model=False,
469+
huggingface_model_id="unsloth/DeepSeek-R1",
470+
modelscope_model_id="unsloth/DeepSeek-R1",
471+
require_huggingface_token=False,
472+
application_scenario="Agent, tool use, translation, summary",
473+
description="The latest series of DeepSeek LLMs for reasoning",
474+
model_type=ModelType.LLM,
475+
model_series=DEEPSEEK_REASONING_MODEL
476+
)
477+
)
478+
453479
Model.register(
454480
dict(
455481
model_id = "deepseek-r1-671b-4bit_gguf",

src/emd/models/llms/qwen.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -773,3 +773,52 @@
773773
model_series=QWEN3_SERIES
774774
)
775775
)
776+
777+
778+
Model.register(
779+
dict(
780+
model_id = "Qwen3-235B-A22B",
781+
supported_engines=[vllm_qwen3_engin084],
782+
supported_instances=[
783+
local_instance
784+
],
785+
supported_services=[
786+
local_service
787+
],
788+
supported_frameworks=[
789+
fastapi_framework
790+
],
791+
allow_china_region=True,
792+
huggingface_model_id="Qwen/Qwen3-235B-A22B",
793+
modelscope_model_id="Qwen/Qwen3-235B-A22B",
794+
require_huggingface_token=False,
795+
application_scenario="Agent, tool use, translation, summary",
796+
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
797+
model_type=ModelType.LLM,
798+
model_series=QWEN3_SERIES
799+
)
800+
)
801+
802+
Model.register(
803+
dict(
804+
model_id = "Qwen3-235B-A22B-FP8",
805+
supported_engines=[vllm_qwen3_engin084],
806+
supported_instances=[
807+
local_instance
808+
],
809+
supported_services=[
810+
local_service
811+
],
812+
supported_frameworks=[
813+
fastapi_framework
814+
],
815+
allow_china_region=True,
816+
huggingface_model_id="Qwen/Qwen3-235B-A22B-FP8",
817+
modelscope_model_id="Qwen/Qwen3-235B-A22B-FP8",
818+
require_huggingface_token=False,
819+
application_scenario="Agent, tool use, translation, summary",
820+
description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
821+
model_type=ModelType.LLM,
822+
model_series=QWEN3_SERIES
823+
)
824+
)

0 commit comments

Comments (0)