Skip to content

Commit 82c5022

Browse files
authored
feat: add qwen3 (#114)
* merge * merge * add Mistral-Small-3.1-24B-Instruct-2503 * modify qwq-32b deploy * add txgemma model; * modify model list command * fix typo * add some ecs parameters * add glm4-z1 models * modify vllm backend * add qwen3 * fix cli bugs * fix
1 parent 36a4997 commit 82c5022

File tree

2 files changed

+41
-3
lines changed

2 files changed

+41
-3
lines changed

src/emd/models/engines.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,9 @@ class KtransformersEngine(OpenAICompitableEngine):
140140

141141
vllm_qwen3_engin084 = VllmEngine(**{
142142
**vllm_engine064.model_dump(),
143-
"engine_dockerfile_config": {"VERSION":"v0.8.4"},
143+
"engine_dockerfile_config": {"VERSION":"v0.8.5"},
144144
"environment_variables": "export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
145-
"default_cli_args": " --max_model_len 16000 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching"
145+
"default_cli_args": " --max_model_len 16000 --max_num_seq 30 --disable-log-stats --enable-reasoning --reasoning-parser deepseek_r1 --enable-auto-tool-choice --tool-call-parser hermes --enable-prefix-caching"
146146
})
147147

148148

src/emd/models/llms/qwen.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@
578578
g5d4xlarge_instance,
579579
g5d8xlarge_instance,
580580
g5d16xlarge_instance,
581-
g4dn2xlarge_instance,
581+
# g4dn2xlarge_instance,
582582
# g5d24xlarge_instance,
583583
# g5d48xlarge_instance,
584584
local_instance
@@ -671,6 +671,44 @@
671671
)
672672
)
673673

674+
675+
# ValueError("type fp8e4nv not supported in this architecture. The supported fp8 dtypes are ('fp8e4b15', 'fp8e5')")
676+
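# g5 instances (NVIDIA A10G GPUs) do not appear to support the fp8e4nv dtype (see the ValueError above), so the Qwen3-14B-FP8 registration below is left commented out.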
677+
# Model.register(
678+
# dict(
679+
# model_id = "Qwen3-14B-FP8",
680+
# supported_engines=[vllm_qwen3_engin084],
681+
# supported_instances=[
682+
# g5d2xlarge_instance,
683+
# g5d4xlarge_instance,
684+
# g5d8xlarge_instance,
685+
# g5d16xlarge_instance,
686+
# # g4dn2xlarge_instance,
687+
# # g5d24xlarge_instance,
688+
# # g5d48xlarge_instance,
689+
# local_instance
690+
# ],
691+
# supported_services=[
692+
# sagemaker_service,
693+
# sagemaker_async_service,
694+
# ecs_service,
695+
# local_service
696+
# ],
697+
# supported_frameworks=[
698+
# fastapi_framework
699+
# ],
700+
# allow_china_region=True,
701+
# huggingface_model_id="Qwen/Qwen3-14B-FP8",
702+
# modelscope_model_id="Qwen/Qwen3-14B-FP8",
703+
# require_huggingface_token=False,
704+
# application_scenario="Agent, tool use, translation, summary",
705+
# description="The latest series of Qwen LLMs, offers base and tuned models from 0.5B to 72B\n parameters, featuring enhanced knowledge, improved coding and math skills, better instruction\n following, long-text generation, structured data handling, 128K token context support, and\n multilingual capabilities for 29+ languages.",
706+
# model_type=ModelType.LLM,
707+
# model_series=QWEN3_SERIES
708+
# )
709+
# )
710+
711+
674712
Model.register(
675713
dict(
676714
model_id = "Qwen3-32B",

0 commit comments

Comments
 (0)