@@ -61,31 +61,24 @@ Run the following command to start the vLLM server:
 ```{code-block} bash
 :substitutions:
 export VLLM_USE_V1=1
-export MODEL="Qwen/Qwen3-0.6B"
-python -m vllm.entrypoints.api_server \
-    --model $MODEL \
+vllm serve Qwen/Qwen3-0.6B \
     --tensor-parallel-size 1 \
-    --max-num-batched-tokens 2048 \
-    --gpu-memory-utilization 0.5 \
-    --max-num-seqs 4 \
     --enforce-eager \
-    --trust-remote-code \
-    --max-model-len 1024 \
-    --disable-custom-all-reduce \
     --dtype float16 \
-    --port 8000 \
-    --compilation-config '{"custom_ops":["+rms_norm", "+rotary_embedding"]}'
+    --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}'
 ```
 
 Once your server is started, you can query the model with input prompts
 
 ```bash
-curl http://localhost:8000/generate \
+curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "prompt": "Hello, my name is ?",
-        "max_tokens": 20,
-        "temperature": 0
+        "prompt": "The future of AI is",
+        "max_tokens": 64,
+        "top_p": 0.95,
+        "top_k": 50,
+        "temperature": 0.6
     }'
 ```
 ::::
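The curl request above can also be reproduced in Python. The sketch below is illustrative only and not part of this patch; it assumes the `requests` package is installed and that the server started above is listening on vLLM's default port 8000.

```python
# Illustrative sketch: query the OpenAI-compatible /v1/completions endpoint in Python.
# Assumes `pip install requests` and a server on localhost:8000 (vLLM's default port).
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "Qwen/Qwen3-0.6B",  # must match the model passed to `vllm serve`
        "prompt": "The future of AI is",
        "max_tokens": 64,
        "top_p": 0.95,
        "top_k": 50,
        "temperature": 0.6,
    },
)
resp.raise_for_status()
# The completions API returns generated text under choices[0]["text"].
print(resp.json()["choices"][0]["text"])
```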
@@ -98,31 +91,24 @@ Run the following command to start the vLLM server:
 ```{code-block} bash
 :substitutions:
 export VLLM_USE_V1=1
-export MODEL="Qwen/Qwen2.5-7B-Instruct"
-python -m vllm.entrypoints.api_server \
-    --model $MODEL \
+vllm serve Qwen/Qwen2.5-7B-Instruct \
     --tensor-parallel-size 2 \
-    --max-num-batched-tokens 2048 \
-    --gpu-memory-utilization 0.5 \
-    --max-num-seqs 4 \
     --enforce-eager \
-    --trust-remote-code \
-    --max-model-len 1024 \
-    --disable-custom-all-reduce \
     --dtype float16 \
-    --port 8000 \
-    --compilation-config '{"custom_ops":["+rms_norm", "+rotary_embedding"]}'
+    --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}'
 ```
 
 Once your server is started, you can query the model with input prompts
 
 ```bash
-curl http://localhost:8000/generate \
+curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "prompt": "Hello, my name is ?",
-        "max_tokens": 20,
-        "temperature": 0
+        "prompt": "The future of AI is",
+        "max_tokens": 64,
+        "top_p": 0.95,
+        "top_k": 50,
+        "temperature": 0.6
    }'
 ```
 ::::
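As a side note (not part of this patch), it can be handy to confirm the server has finished loading the model before issuing completion requests. A minimal sketch, assuming the default port 8000 and the `requests` package, that polls the OpenAI-compatible `/v1/models` endpoint:

```python
# Illustrative readiness check: wait until the OpenAI-compatible server answers /v1/models.
# Assumes a server on localhost:8000; adjust the URL if you pass --port to `vllm serve`.
import time
import requests

url = "http://localhost:8000/v1/models"
for _ in range(60):  # wait up to ~5 minutes
    try:
        data = requests.get(url, timeout=2).json()
        print("Server ready, serving:", [m["id"] for m in data["data"]])
        break
    except requests.exceptions.RequestException:
        time.sleep(5)
else:
    raise RuntimeError("vLLM server did not become ready in time")
```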
@@ -206,14 +192,10 @@ sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 # Create an LLM.
 llm = LLM(
     model="Qwen/Qwen3-0.6B",
-    max_model_len=4096,
-    max_num_seqs=4,
-    trust_remote_code=True,
     tensor_parallel_size=1,
     enforce_eager=True, # For 300I series, only eager mode is supported.
     dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series
-    disable_custom_all_reduce=True, # IMPORTANT cause 300I series needed
-    compilation_config={"custom_ops":["+rms_norm", "+rotary_embedding"]}, # IMPORTANT cause 300I series needed custom ops
+    compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series
 )
 # Generate texts from the prompts.
 outputs = llm.generate(prompts, sampling_params)
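For completeness (again, illustrative rather than part of this patch): each element returned by `llm.generate(...)` is a vLLM `RequestOutput`, so the generated text is typically printed like this:

```python
# Print the prompt and the first completion of every request.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```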
@@ -253,14 +235,10 @@ sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 # Create an LLM.
 llm = LLM(
     model="Qwen/Qwen2.5-7B-Instruct",
-    max_model_len=4096,
-    max_num_seqs=4,
-    trust_remote_code=True,
     tensor_parallel_size=2,
     enforce_eager=True, # For 300I series, only eager mode is supported.
     dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series
-    disable_custom_all_reduce=True, # IMPORTANT cause 300I series needed
-    compilation_config={"custom_ops":["+rms_norm", "+rotary_embedding"]}, # IMPORTANT cause 300I series needed custom ops
+    compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series
 )
 # Generate texts from the prompts.
 outputs = llm.generate(prompts, sampling_params)