
Commit d02d834

docs: modify docs (#127)
* merge
* merge
* add Mistral-Small-3.1-24B-Instruct-2503
* modify qwq-32b deploy
* add txgemma model
* modify model list command
* fix typo
* add some ecs parameters
* add glm4-z1 models
* modify vllm backend
* add qwen3
* fix cli bugs
* fix
* add deepseek r1/Qwen3-235B-A22B
* fix local deploy account bug
* add qwen 3 awq models
* fix serialize_utils bugs
* modify qwen3 deployment
* modify docs
1 parent 15182d8 commit d02d834


7 files changed: 13 additions, 13 deletions


docs/en/best_deployment_practices.md

Lines changed: 3 additions & 3 deletions

@@ -65,7 +65,7 @@ To enable longer context windows, use the `--extra-params` option with engine-sp
 ```bash
 emd deploy --model-id Qwen2.5-7B-Instruct --instance-type g5.4xlarge --engine-type vllm --service-type sagemaker_realtime --extra-params '{
   "engine_params": {
-    "vllm_cli_args": "--max_model_len 16000 --max_num_seqs 4"
+    "cli_args": "--max_model_len 16000 --max_num_seqs 4"
   }
 }'
 ```
@@ -202,13 +202,13 @@ Engine parameters control the behavior of the inference engine.
 ```json
 {
   "engine_params": {
-    "vllm_cli_args": "--max_model_len 16000 --max_num_seqs 4 --gpu_memory_utilization 0.9",
+    "cli_args": "--max_model_len 16000 --max_num_seqs 4 --gpu_memory_utilization 0.9",
     "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
   }
 }
 ```

-- `vllm_cli_args`: Command line arguments specific to vLLM
+- `cli_args`: Command line arguments specific to vLLM
 - Common vLLM parameters:
   - `--max_model_len`: Maximum context length
   - `--max_num_seqs`: Maximum number of sequences
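
The payload shape itself is untouched by this commit; only the inner key is renamed from `vllm_cli_args` to `cli_args`. A minimal Python sketch of the updated `--extra-params` payload (the `json` round-trip here is purely illustrative, not part of the repo):

```python
import json

# Same nesting as the documented --extra-params payload; only the inner
# key changed from "vllm_cli_args" to "cli_args" in this commit.
extra_params = {
    "engine_params": {
        "cli_args": "--max_model_len 16000 --max_num_seqs 4"
    }
}

# Serialized form, ready to paste after `emd deploy ... --extra-params`:
print(json.dumps(extra_params))
```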

docs/en/commands.md

Lines changed: 1 addition & 1 deletion

@@ -87,7 +87,7 @@ emd deploy --allow-local-deploy

 Deploy with custom parameters:
 ```bash
-emd deploy --model-id Qwen2.5-7B-Instruct --extra-params '{"engine_params": {"vllm_cli_args": "--max_model_len 16000 --max_num_seqs 4"}}'
+emd deploy --model-id Qwen2.5-7B-Instruct --extra-params '{"engine_params": {"cli_args": "--max_model_len 16000 --max_num_seqs 4"}}'
 ```

 ### status

src/pipeline/backend/tests/vllm_tester.py

Lines changed: 2 additions & 2 deletions

@@ -19,7 +19,7 @@ def setUpClass(self):
         service_type = "sagemaker"
         framework_type = "fastapi"
         model_s3_bucket = "emd-us-east-1-bucket-75c6f785084f4fd998da560a0a6190fc"
-        vllm_cli_args = "--max_model_len 4096"
+        cli_args = "--max_model_len 4096"
         # model_id = "Qwen2.5-0.5B-Instruct"
         model_id = "bge-m3"
         model = Model.get_model(model_id)
@@ -30,7 +30,7 @@ def setUpClass(self):
             current_service=model.find_current_service(service_type),
             current_framework=model.find_current_framework(framework_type),
             model_s3_bucket=model_s3_bucket,
-            vllm_cli_args=vllm_cli_args,
+            cli_args=cli_args,

         )
         self.execute_model = model.convert_to_execute_model(executable_config)

src/pipeline/pipeline.sh

Lines changed: 2 additions & 2 deletions

@@ -24,7 +24,7 @@ gpu_num=1
 instance_type=g5.12xlarge

 # python deploy/prepare_model.py --region $region --model_id $model_id --model_s3_bucket $model_s3_bucket || { echo "Failed to prepare model"; exit 1; }
-# python deploy/build_and_push_image.py --region $region --model_id $model_id --backend_type $backend_type --gpu_num $gpu_num --instance_type $instance_type --model_s3_bucket $model_s3_bucket --vllm_cli_args "--max_model_len 4096" || { echo "Failed to build and push image"; exit 1; }
+# python deploy/build_and_push_image.py --region $region --model_id $model_id --backend_type $backend_type --gpu_num $gpu_num --instance_type $instance_type --model_s3_bucket $model_s3_bucket --cli_args "--max_model_len 4096" || { echo "Failed to build and push image"; exit 1; }
 # python deploy/deploy.py --region $region --instance_type $instance_type --model_id $model_id --backend_type $backend_type --service $service --gpu_num $gpu_num || { echo "Failed to deploy"; exit 1; }

 python pipeline.py \
@@ -39,6 +39,6 @@ python pipeline.py \
     --role_name SageMakerExecutionRoleTest6 \
     --skip_image_build \
     --skip_deploy \
-    --vllm_cli_args "--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
+    --cli_args "--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"

 echo "Pipeline executed successfully"

tests/sdk_tests/deploy_tests/glm_test.py

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@
     service_type="sagemaker",
     region="us-west-2",
     extra_params={
-        "vllm_cli_args":"--max_num_seqs 4 --max_model_len 16000 --disable-log-stats"
+        "cli_args":"--max_num_seqs 4 --max_model_len 16000 --disable-log-stats"
     },
     force_env_stack_update=True
 )

tests/sdk_tests/deploy_tests/internlm_test.py

Lines changed: 1 addition & 1 deletion

@@ -7,6 +7,6 @@
     service_type="sagemaker",
     region="us-west-2",
     extra_params={
-        "vllm_cli_args":"--max_num_seqs 4 --max_model_len 16000 --disable-log-stats"
+        "cli_args":"--max_num_seqs 4 --max_model_len 16000 --disable-log-stats"
     }
 )

tests/sdk_tests/deploy_tests/qwen_test.py

Lines changed: 3 additions & 3 deletions

@@ -8,7 +8,7 @@
     service_type="sagemaker",
     region="us-west-2",
     # extra_params={
-    #     "vllm_cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
+    #     "cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
     # }
 )

@@ -20,7 +20,7 @@
 #     service_type="sagemaker",
 #     region="us-west-2",
 #     extra_params={
-#         "vllm_cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
+#         "cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
 #     }
 # )
 # deploy(
@@ -31,6 +31,6 @@
 #     service_type="sagemaker",
 #     region="us-west-2",
 #     extra_params={
-#         "vllm_cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
+#         "cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
 # }
 # )
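
Every hunk above is the same mechanical rename. A small, hypothetical migration helper (not part of the repo) that updates legacy `extra_params` dicts covers both shapes seen in this commit, nested under `engine_params` in the docs and top-level in the SDK tests:

```python
def migrate_extra_params(params: dict) -> dict:
    """Recursively rename the legacy "vllm_cli_args" key to "cli_args"."""
    migrated = {}
    for key, value in params.items():
        if isinstance(value, dict):
            value = migrate_extra_params(value)
        migrated["cli_args" if key == "vllm_cli_args" else key] = value
    return migrated

# Both shapes seen in this commit:
print(migrate_extra_params(
    {"engine_params": {"vllm_cli_args": "--max_model_len 16000 --max_num_seqs 4"}}
))
print(migrate_extra_params(
    {"vllm_cli_args": "--max_num_seqs 4 --max_model_len 16000 --disable-log-stats"}
))
```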
