
Commit d02d834

docs: modify docs (#127)
* merge
* merge
* add Mistral-Small-3.1-24B-Instruct-2503
* modify qwq-32b deploy
* add txgemma model
* modify model list command
* fix typo
* add some ecs parameters
* add glm4-z1 models
* modify vllm backend
* add qwen3
* fix cli bugs
* fix
* add deepseek r1/Qwen3-235B-A22B
* fix local deploy account bug
* add qwen 3 awq models
* fix serialize_utils bugs
* modify qwen3 deployment
* modify docs
1 parent 15182d8 commit d02d834


7 files changed: 13 additions, 13 deletions


docs/en/best_deployment_practices.md

Lines changed: 3 additions & 3 deletions

@@ -65,7 +65,7 @@ To enable longer context windows, use the `--extra-params` option with engine-sp
 ```bash
 emd deploy --model-id Qwen2.5-7B-Instruct --instance-type g5.4xlarge --engine-type vllm --service-type sagemaker_realtime --extra-params '{
   "engine_params": {
-    "vllm_cli_args": "--max_model_len 16000 --max_num_seqs 4"
+    "cli_args": "--max_model_len 16000 --max_num_seqs 4"
   }
 }'
 ```
@@ -202,13 +202,13 @@ Engine parameters control the behavior of the inference engine.
 ```json
 {
   "engine_params": {
-    "vllm_cli_args": "--max_model_len 16000 --max_num_seqs 4 --gpu_memory_utilization 0.9",
+    "cli_args": "--max_model_len 16000 --max_num_seqs 4 --gpu_memory_utilization 0.9",
     "environment_variables": "export VLLM_ATTENTION_BACKEND=FLASHINFER && export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
   }
 }
 ```

-- `vllm_cli_args`: Command line arguments specific to vLLM
+- `cli_args`: Command line arguments specific to vLLM
 - Common vLLM parameters:
   - `--max_model_len`: Maximum context length
   - `--max_num_seqs`: Maximum number of sequences
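
The payload shape itself is untouched by this commit; only the inner key is renamed from `vllm_cli_args` to `cli_args`. A minimal Python sketch of the updated `--extra-params` payload (the `json` round-trip here is purely illustrative, not part of the repo):

```python
import json

# Same nesting as the documented --extra-params payload; only the inner
# key changed from "vllm_cli_args" to "cli_args" in this commit.
extra_params = {
    "engine_params": {
        "cli_args": "--max_model_len 16000 --max_num_seqs 4"
    }
}

# Serialized form, ready to paste after `emd deploy ... --extra-params`:
print(json.dumps(extra_params))
```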

docs/en/commands.md

Lines changed: 1 addition & 1 deletion

@@ -87,7 +87,7 @@ emd deploy --allow-local-deploy

 Deploy with custom parameters:
 ```bash
-emd deploy --model-id Qwen2.5-7B-Instruct --extra-params '{"engine_params": {"vllm_cli_args": "--max_model_len 16000 --max_num_seqs 4"}}'
+emd deploy --model-id Qwen2.5-7B-Instruct --extra-params '{"engine_params": {"cli_args": "--max_model_len 16000 --max_num_seqs 4"}}'
 ```

 ### status

src/pipeline/backend/tests/vllm_tester.py

Lines changed: 2 additions & 2 deletions

@@ -19,7 +19,7 @@ def setUpClass(self):
         service_type = "sagemaker"
         framework_type = "fastapi"
         model_s3_bucket = "emd-us-east-1-bucket-75c6f785084f4fd998da560a0a6190fc"
-        vllm_cli_args = "--max_model_len 4096"
+        cli_args = "--max_model_len 4096"
         # model_id = "Qwen2.5-0.5B-Instruct"
         model_id = "bge-m3"
         model = Model.get_model(model_id)
@@ -30,7 +30,7 @@ def setUpClass(self):
             current_service=model.find_current_service(service_type),
             current_framework=model.find_current_framework(framework_type),
             model_s3_bucket=model_s3_bucket,
-            vllm_cli_args=vllm_cli_args,
+            cli_args=cli_args,

         )
         self.execute_model = model.convert_to_execute_model(executable_config)

src/pipeline/pipeline.sh

Lines changed: 2 additions & 2 deletions

@@ -24,7 +24,7 @@ gpu_num=1
 instance_type=g5.12xlarge

 # python deploy/prepare_model.py --region $region --model_id $model_id --model_s3_bucket $model_s3_bucket || { echo "Failed to prepare model"; exit 1; }
-# python deploy/build_and_push_image.py --region $region --model_id $model_id --backend_type $backend_type --gpu_num $gpu_num --instance_type $instance_type --model_s3_bucket $model_s3_bucket --vllm_cli_args "--max_model_len 4096" || { echo "Failed to build and push image"; exit 1; }
+# python deploy/build_and_push_image.py --region $region --model_id $model_id --backend_type $backend_type --gpu_num $gpu_num --instance_type $instance_type --model_s3_bucket $model_s3_bucket --cli_args "--max_model_len 4096" || { echo "Failed to build and push image"; exit 1; }
 # python deploy/deploy.py --region $region --instance_type $instance_type --model_id $model_id --backend_type $backend_type --service $service --gpu_num $gpu_num || { echo "Failed to deploy"; exit 1; }

 python pipeline.py \
@@ -39,6 +39,6 @@ python pipeline.py \
     --role_name SageMakerExecutionRoleTest6 \
     --skip_image_build \
     --skip_deploy \
-    --vllm_cli_args "--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
+    --cli_args "--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"

 echo "Pipeline executed successfully"

tests/sdk_tests/deploy_tests/glm_test.py

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@
     service_type="sagemaker",
     region="us-west-2",
     extra_params={
-        "vllm_cli_args":"--max_num_seqs 4 --max_model_len 16000 --disable-log-stats"
+        "cli_args":"--max_num_seqs 4 --max_model_len 16000 --disable-log-stats"
     },
     force_env_stack_update=True
 )

tests/sdk_tests/deploy_tests/internlm_test.py

Lines changed: 1 addition & 1 deletion

@@ -7,6 +7,6 @@
     service_type="sagemaker",
     region="us-west-2",
     extra_params={
-        "vllm_cli_args":"--max_num_seqs 4 --max_model_len 16000 --disable-log-stats"
+        "cli_args":"--max_num_seqs 4 --max_model_len 16000 --disable-log-stats"
     }
 )

tests/sdk_tests/deploy_tests/qwen_test.py

Lines changed: 3 additions & 3 deletions

@@ -8,7 +8,7 @@
     service_type="sagemaker",
     region="us-west-2",
     # extra_params={
-    #     "vllm_cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
+    #     "cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
     # }
 )

@@ -20,7 +20,7 @@
 #     service_type="sagemaker",
 #     region="us-west-2",
 #     extra_params={
-#         "vllm_cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
+#         "cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
 #     }
 # )
 # deploy(
@@ -31,6 +31,6 @@
 #     service_type="sagemaker",
 #     region="us-west-2",
 #     extra_params={
-#         "vllm_cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
+#         "cli_args":"--max_num_seqs 20 --max_model_len 16000 --disable-log-stats"
 # }
 # )
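
Every hunk above is the same mechanical rename. A small, hypothetical migration helper (not part of the repo) that updates legacy `extra_params` dicts covers both shapes seen in this commit, nested under `engine_params` in the docs and top-level in the SDK tests:

```python
def migrate_extra_params(params: dict) -> dict:
    """Recursively rename the legacy "vllm_cli_args" key to "cli_args"."""
    migrated = {}
    for key, value in params.items():
        if isinstance(value, dict):
            value = migrate_extra_params(value)
        migrated["cli_args" if key == "vllm_cli_args" else key] = value
    return migrated

# Both shapes seen in this commit:
print(migrate_extra_params(
    {"engine_params": {"vllm_cli_args": "--max_model_len 16000 --max_num_seqs 4"}}
))
print(migrate_extra_params(
    {"vllm_cli_args": "--max_num_seqs 4 --max_model_len 16000 --disable-log-stats"}
))
```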
