@@ -61,31 +61,24 @@ Run the following command to start the vLLM server:
 ```{code-block} bash
 :substitutions:
 export VLLM_USE_V1=1
-export MODEL="Qwen/Qwen3-0.6B"
-python -m vllm.entrypoints.api_server \
-    --model $MODEL \
+vllm serve Qwen/Qwen3-0.6B \
     --tensor-parallel-size 1 \
-    --max-num-batched-tokens 2048 \
-    --gpu-memory-utilization 0.5 \
-    --max-num-seqs 4 \
     --enforce-eager \
-    --trust-remote-code \
-    --max-model-len 1024 \
-    --disable-custom-all-reduce \
     --dtype float16 \
-    --port 8000 \
-    --compilation-config '{"custom_ops":["+rms_norm", "+rotary_embedding"]}'
+    --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}'
 ```
 
 Once your server is started, you can query the model with input prompts
 
 ```bash
-curl http://localhost:8000/generate \
+curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "prompt": "Hello, my name is ?",
-        "max_tokens": 20,
-        "temperature": 0
+        "prompt": "The future of AI is",
+        "max_tokens": 64,
+        "top_p": 0.95,
+        "top_k": 50,
+        "temperature": 0.6
     }'
 ```
 ::::
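The curl request above can also be reproduced in Python. The sketch below is illustrative only and not part of this patch; it assumes the `requests` package is installed and that the server started above is listening on vLLM's default port 8000.

```python
# Illustrative sketch: query the OpenAI-compatible /v1/completions endpoint in Python.
# Assumes `pip install requests` and a server on localhost:8000 (vLLM's default port).
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "Qwen/Qwen3-0.6B",  # must match the model passed to `vllm serve`
        "prompt": "The future of AI is",
        "max_tokens": 64,
        "top_p": 0.95,
        "top_k": 50,
        "temperature": 0.6,
    },
)
resp.raise_for_status()
# The completions API returns generated text under choices[0]["text"].
print(resp.json()["choices"][0]["text"])
```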
@@ -98,31 +91,24 @@ Run the following command to start the vLLM server:
 ```{code-block} bash
 :substitutions:
 export VLLM_USE_V1=1
-export MODEL="Qwen/Qwen2.5-7B-Instruct"
-python -m vllm.entrypoints.api_server \
-    --model $MODEL \
+vllm serve Qwen/Qwen2.5-7B-Instruct \
     --tensor-parallel-size 2 \
-    --max-num-batched-tokens 2048 \
-    --gpu-memory-utilization 0.5 \
-    --max-num-seqs 4 \
     --enforce-eager \
-    --trust-remote-code \
-    --max-model-len 1024 \
-    --disable-custom-all-reduce \
     --dtype float16 \
-    --port 8000 \
-    --compilation-config '{"custom_ops":["+rms_norm", "+rotary_embedding"]}'
+    --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}'
 ```
 
 Once your server is started, you can query the model with input prompts
 
 ```bash
-curl http://localhost:8000/generate \
+curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
     -d '{
-        "prompt": "Hello, my name is ?",
-        "max_tokens": 20,
-        "temperature": 0
+        "prompt": "The future of AI is",
+        "max_tokens": 64,
+        "top_p": 0.95,
+        "top_k": 50,
+        "temperature": 0.6
    }'
 ```
 ::::
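As a side note (not part of this patch), it can be handy to confirm the server has finished loading the model before issuing completion requests. A minimal sketch, assuming the default port 8000 and the `requests` package, that polls the OpenAI-compatible `/v1/models` endpoint:

```python
# Illustrative readiness check: wait until the OpenAI-compatible server answers /v1/models.
# Assumes a server on localhost:8000; adjust the URL if you pass --port to `vllm serve`.
import time
import requests

url = "http://localhost:8000/v1/models"
for _ in range(60):  # wait up to ~5 minutes
    try:
        data = requests.get(url, timeout=2).json()
        print("Server ready, serving:", [m["id"] for m in data["data"]])
        break
    except requests.exceptions.RequestException:
        time.sleep(5)
else:
    raise RuntimeError("vLLM server did not become ready in time")
```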
@@ -206,14 +192,10 @@ sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 # Create an LLM.
 llm = LLM(
     model="Qwen/Qwen3-0.6B",
-    max_model_len=4096,
-    max_num_seqs=4,
-    trust_remote_code=True,
     tensor_parallel_size=1,
     enforce_eager=True, # For 300I series, only eager mode is supported.
     dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series
-    disable_custom_all_reduce=True, # IMPORTANT cause 300I series needed
-    compilation_config={"custom_ops":["+rms_norm", "+rotary_embedding"]}, # IMPORTANT cause 300I series needed custom ops
+    compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series
 )
 # Generate texts from the prompts.
 outputs = llm.generate(prompts, sampling_params)
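For completeness (again, illustrative rather than part of this patch): each element returned by `llm.generate(...)` is a vLLM `RequestOutput`, so the generated text is typically printed like this:

```python
# Print the prompt and the first completion of every request.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```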
@@ -253,14 +235,10 @@ sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
 # Create an LLM.
 llm = LLM(
     model="Qwen/Qwen2.5-7B-Instruct",
-    max_model_len=4096,
-    max_num_seqs=4,
-    trust_remote_code=True,
     tensor_parallel_size=2,
     enforce_eager=True, # For 300I series, only eager mode is supported.
     dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 300I series
-    disable_custom_all_reduce=True, # IMPORTANT cause 300I series needed
-    compilation_config={"custom_ops":["+rms_norm", "+rotary_embedding"]}, # IMPORTANT cause 300I series needed custom ops
+    compilation_config={"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}, # High performance for 300I series
 )
 # Generate texts from the prompts.
 outputs = llm.generate(prompts, sampling_params)