
Commit 4ac8546

feat:add_glm_4_0414;fix openai client create bugs (#104)
* merge
* merge
* add Mistral-Small-3.1-24B-Instruct-2503
* modify qwq-32b deploy
* add txgemma model
* modify model list command
* fix typo
* add some ecs parameters
* add glm4-z1 models
* modify vllm backend
1 parent 3b5ca4e commit 4ac8546

5 files changed, +202 -8 lines changed

src/emd/commands/deploy.py

Lines changed: 8 additions & 2 deletions
@@ -239,6 +239,9 @@ def deploy(
     dockerfile_local_path: Annotated[
         str, typer.Option("--dockerfile-local-path", help="Your custom Dockerfile path for building the model image, all files must be in the same directory")
     ] = None,
+    local_gpus: Annotated[
+        str, typer.Option("--local-gpus", help="Local GPU ids to deploy the model (e.g. `0,1,2`); only used with the local deployment mode.")
+    ] = None,
 ):
     if only_allow_local_deploy:
         allow_local_deploy = True
@@ -389,8 +392,10 @@ def deploy(
     )
     if service_type == ServiceType.LOCAL:
         if check_cuda_exists():
-            if os.environ.get('CUDA_VISIBLE_DEVICES'):
-                console.print(f"[bold blue]local gpus: {os.environ.get('CUDA_VISIBLE_DEVICES')}[/bold blue]")
+            if local_gpus is not None:
+                os.environ['CUDA_VISIBLE_DEVICES'] = local_gpus
+            elif os.environ.get('CUDA_VISIBLE_DEVICES'):
+                pass
             else:
                 gpu_num = get_gpu_num()
                 support_gpu_num = model.supported_instances[0].gpu_num
@@ -400,6 +405,7 @@ def deploy(
                     default=f"{default_gpus_str}"
                 ).ask()
                 os.environ['CUDA_VISIBLE_DEVICES'] = gpus_to_deploy
+            console.print(f"[bold blue]local gpus: {os.environ.get('CUDA_VISIBLE_DEVICES')}[/bold blue]")
             instance_type = InstanceType.LOCAL
         else:
             if instance_type is None:
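The deploy command now resolves local GPUs in three steps: an explicit --local-gpus value is written to CUDA_VISIBLE_DEVICES, an already-exported CUDA_VISIBLE_DEVICES is respected as-is, and otherwise the existing interactive prompt asks which GPUs to use; the chosen ids are then printed once for all paths. A minimal sketch of that precedence as a standalone helper (illustrative only; the real logic lives inline in deploy() and falls back to the questionary prompt shown in the diff):

import os
from typing import Optional

def resolve_local_gpus(local_gpus: Optional[str]) -> Optional[str]:
    # Mirrors the precedence added to deploy():
    # 1) --local-gpus wins and is exported,
    # 2) an existing CUDA_VISIBLE_DEVICES is left untouched,
    # 3) otherwise return None so the caller can prompt interactively.
    if local_gpus is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = local_gpus
    elif os.environ.get("CUDA_VISIBLE_DEVICES"):
        pass  # respect the user's environment
    else:
        return None
    return os.environ["CUDA_VISIBLE_DEVICES"]

With this flag, a local deployment can pin GPUs directly from the command line (for example --local-gpus 0,1) instead of exporting CUDA_VISIBLE_DEVICES beforehand.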

src/emd/models/engines.py

Lines changed: 19 additions & 0 deletions
@@ -165,6 +165,25 @@ class KtransformersEngine(OpenAICompitableEngine):
 
 vllm_glm4_engine064 = vllm_engine064
 
+
+vllm_glm4_0414_engine082 = VllmEngine(**{
+    **vllm_qwen25vl72b_engine073.model_dump(),
+    "engine_dockerfile_config": {"VERSION": "glm_z1_and_0414"},
+    "environment_variables": "export VLLM_USE_V1=0 && export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
+    # "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser pythonic"
+    "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats"
+})
+
+
+vllm_glm4_z1_engine082 = VllmEngine(**{
+    **vllm_qwen25vl72b_engine073.model_dump(),
+    "engine_dockerfile_config": {"VERSION": "glm_z1_and_0414"},
+    "environment_variables": "export VLLM_USE_V1=0 && export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True",
+    # "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-auto-tool-choice --tool-call-parser pythonic --enable-reasoning --reasoning-parser granite"
+    "default_cli_args": "--max_model_len 16000 --max_num_seq 10 --disable-log-stats --enable-reasoning --reasoning-parser granite"
+})
+
+
 vllm_glm4_wo_flashinfer_engine064 = VllmEngine(**{
     **vllm_engine064.model_dump(),
     # "engine_dockerfile_config": {"VERSION":"v0.6.0"},

src/emd/models/llms/glm.py

Lines changed: 151 additions & 1 deletion
@@ -1,5 +1,10 @@
 from .. import Model
-from ..engines import vllm_glm4_engine064,vllm_glm4_wo_flashinfer_engine064
+from ..engines import (
+    vllm_glm4_engine064,
+    vllm_glm4_wo_flashinfer_engine064,
+    vllm_glm4_0414_engine082,
+    vllm_glm4_z1_engine082
+)
 from ..services import (
     sagemaker_service,
     sagemaker_async_service,
@@ -79,3 +84,148 @@
         model_series=GLM4_SERIES
     )
 )
+
+
+Model.register(
+    dict(
+        model_id = "GLM-4-9B-0414",
+        supported_engines=[vllm_glm4_0414_engine082],
+        supported_instances=[
+            g5d12xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service,
+            sagemaker_async_service,
+            ecs_service,
+            local_service
+        ],
+        allow_china_region=True,
+        supported_frameworks=[fastapi_framework],
+        huggingface_model_id="THUDM/GLM-4-9B-0414",
+        modelscope_model_id="ZhipuAI/GLM-4-9B-0414",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="GLM-4-32B-0414 series",
+        model_type=ModelType.LLM,
+        model_series=GLM4_SERIES
+    )
+)
+
+Model.register(
+    dict(
+        model_id = "GLM-4-32B-0414",
+        supported_engines=[vllm_glm4_0414_engine082],
+        supported_instances=[
+            g5d12xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service,
+            sagemaker_async_service,
+            ecs_service,
+            local_service
+        ],
+        allow_china_region=True,
+        supported_frameworks=[fastapi_framework],
+        huggingface_model_id="THUDM/GLM-4-32B-0414",
+        modelscope_model_id="ZhipuAI/GLM-4-32B-0414",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="GLM-4-32B-0414 series",
+        model_type=ModelType.LLM,
+        model_series=GLM4_SERIES
+    )
+)
+
+
+
+Model.register(
+    dict(
+        model_id = "GLM-Z1-9B-0414",
+        supported_engines=[vllm_glm4_z1_engine082],
+        supported_instances=[
+            g5d12xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service,
+            sagemaker_async_service,
+            ecs_service,
+            local_service
+        ],
+        allow_china_region=True,
+        supported_frameworks=[fastapi_framework],
+        huggingface_model_id="THUDM/GLM-Z1-9B-0414",
+        modelscope_model_id="ZhipuAI/GLM-Z1-9B-0414",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="GLM-4-32B-0414 series",
+        model_type=ModelType.LLM,
+        model_series=GLM4_SERIES
+    )
+)
+
+
+Model.register(
+    dict(
+        model_id = "GLM-Z1-32B-0414",
+        supported_engines=[vllm_glm4_z1_engine082],
+        supported_instances=[
+            g5d12xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service,
+            sagemaker_async_service,
+            ecs_service,
+            local_service
+        ],
+        allow_china_region=True,
+        supported_frameworks=[fastapi_framework],
+        huggingface_model_id="THUDM/GLM-Z1-32B-0414",
+        modelscope_model_id="ZhipuAI/GLM-Z1-32B-0414",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="GLM-4-32B-0414 series",
+        model_type=ModelType.LLM,
+        model_series=GLM4_SERIES
+    )
+)
+
+
+Model.register(
+    dict(
+        model_id = "GLM-Z1-Rumination-32B-0414",
+        supported_engines=[vllm_glm4_z1_engine082],
+        supported_instances=[
+            g5d12xlarge_instance,
+            g5d24xlarge_instance,
+            g5d48xlarge_instance,
+            local_instance
+        ],
+        supported_services=[
+            sagemaker_service,
+            sagemaker_async_service,
+            ecs_service,
+            local_service
+        ],
+        allow_china_region=True,
+        supported_frameworks=[fastapi_framework],
+        huggingface_model_id="THUDM/GLM-Z1-Rumination-32B-0414",
+        modelscope_model_id="ZhipuAI/GLM-Z1-Rumination-32B-0414",
+        require_huggingface_token=False,
+        application_scenario="Agent, tool use, translation, summary",
+        description="GLM-4-32B-0414 series",
+        model_type=ModelType.LLM,
+        model_series=GLM4_SERIES
+    )
+)
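All five registrations share the same instances, services, frameworks, and series, and differ only in model_id and the matching Hugging Face / ModelScope repo ids. Purely as an illustration of that structure (the committed code registers each model explicitly), the same data could be expressed as a loop:

GLM_0414_ENGINES = {
    "GLM-4-9B-0414": vllm_glm4_0414_engine082,
    "GLM-4-32B-0414": vllm_glm4_0414_engine082,
    "GLM-Z1-9B-0414": vllm_glm4_z1_engine082,
    "GLM-Z1-32B-0414": vllm_glm4_z1_engine082,
    "GLM-Z1-Rumination-32B-0414": vllm_glm4_z1_engine082,
}

for model_id, engine in GLM_0414_ENGINES.items():
    Model.register(
        dict(
            model_id=model_id,
            supported_engines=[engine],
            supported_instances=[
                g5d12xlarge_instance,
                g5d24xlarge_instance,
                g5d48xlarge_instance,
                local_instance,
            ],
            supported_services=[
                sagemaker_service,
                sagemaker_async_service,
                ecs_service,
                local_service,
            ],
            allow_china_region=True,
            supported_frameworks=[fastapi_framework],
            huggingface_model_id=f"THUDM/{model_id}",
            modelscope_model_id=f"ZhipuAI/{model_id}",
            require_huggingface_token=False,
            application_scenario="Agent, tool use, translation, summary",
            description="GLM-4-32B-0414 series",
            model_type=ModelType.LLM,
            model_series=GLM4_SERIES,
        )
    )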
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+From vllm/vllm-openai:v0.8.4
+
+RUN git clone https://github.com/vllm-project/vllm.git && cd vllm && git fetch origin pull/16618/head:pr-16618 && VLLM_USE_PRECOMPILED=1 pip install --editable .
+
+EXPOSE 8080
+
+# Set the serve script as the entrypoint
+ENTRYPOINT ["/usr/bin/serve"]

src/pipeline/backend/vllm/vllm_backend.py

Lines changed: 16 additions & 5 deletions
@@ -2,7 +2,7 @@
 import sys
 import os
 from emd.models.utils.constants import ModelType
-
+import inspect
 from backend.backend import OpenAICompitableProxyBackendBase
 from emd.utils.logger_utils import get_logger
 
@@ -22,6 +22,13 @@ def create_proxy_server_start_command(self,model_path):
             serve_command += f" --api-key {self.api_key}"
         return serve_command
 
+    def openai_create_helper(self, fn: callable, request: dict):
+        sig = inspect.signature(fn)
+        extra_body = request.get("extra_body", {})
+        extra_params = {k: request.pop(k) for k in list(request.keys()) if k not in sig.parameters}
+        extra_body.update(extra_params)
+        request['extra_body'] = extra_body
+        return fn(**request)
 
     def invoke(self, request):
         # Transform input to vllm format
@@ -30,7 +37,7 @@ def invoke(self, request):
         logger.info(f"Chat request:{request}")
         if self.model_type == ModelType.EMBEDDING:
             # print('cal embedding....')
-            response = self.client.embeddings.create(**request)
+            response = self.openai_create_helper(self.client.embeddings.create, request)
             # print('end cal embedding....')
         elif self.model_type == ModelType.RERANK:
             headers = {
@@ -43,7 +50,8 @@ def invoke(self, request):
                 headers=headers
             ).json()
         else:
-            response = self.client.chat.completions.create(**request)
+            # response = self.client.chat.completions.create(**request)
+            response = self.openai_create_helper(self.client.chat.completions.create, request)
         logger.info(f"response:{response},{request}")
 
         if request.get("stream", False):
@@ -58,7 +66,7 @@ async def ainvoke(self, request):
         logger.info(f"Chat request:{request}")
         if self.model_type == ModelType.EMBEDDING:
             # print('cal embedding....')
-            response = await self.async_client.embeddings.create(**request)
+            response = await self.openai_create_helper(self.async_client.embeddings.create, request)
            # print('end cal embedding....')
         elif self.model_type == ModelType.RERANK:
             headers = {
@@ -71,7 +79,10 @@ async def ainvoke(self, request):
                 headers=headers
             ).json()
         else:
-            response = await self.async_client.chat.completions.create(**request)
+            response = await self.openai_create_helper(
+                self.async_client.chat.completions.create,
+                request
+            )
         logger.info(f"response:{response},{request}")
 
         if request.get("stream", False):
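The new openai_create_helper is the change behind the "fix openai client create bugs" part of the commit title: it inspects the signature of the OpenAI SDK method it wraps and moves any request keys the method does not accept into extra_body, which the OpenAI Python client forwards to the server as additional JSON fields, so vLLM-specific parameters no longer fail with an unexpected-keyword-argument error. A self-contained sketch of the routing, using a stub in place of client.chat.completions.create (a real call needs a running vLLM endpoint; top_k and chat_template_kwargs are example extra fields, not claims about any particular model):

import inspect

def openai_create_helper(fn, request: dict):
    # Same logic as the helper above: keys missing from fn's signature
    # are moved into extra_body instead of being passed as keyword arguments.
    sig = inspect.signature(fn)
    extra_body = request.get("extra_body", {})
    extra_params = {k: request.pop(k) for k in list(request.keys()) if k not in sig.parameters}
    extra_body.update(extra_params)
    request["extra_body"] = extra_body
    return fn(**request)

# Stub standing in for client.chat.completions.create.
def fake_create(*, model, messages, stream=False, extra_body=None):
    return {"model": model, "stream": stream, "extra_body": extra_body}

request = {
    "model": "GLM-Z1-9B-0414",
    "messages": [{"role": "user", "content": "hello"}],
    "top_k": 20,                                         # not in fake_create's signature
    "chat_template_kwargs": {"enable_thinking": True},   # hypothetical extra field
}
print(openai_create_helper(fake_create, request))
# -> {'model': 'GLM-Z1-9B-0414', 'stream': False,
#     'extra_body': {'top_k': 20, 'chat_template_kwargs': {'enable_thinking': True}}}

Note that the helper mutates the incoming request dict in place, which is why invoke and ainvoke can pass the same dict straight through to the wrapped create call.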
