Skip to content

Commit 21a97da

Browse files
committed
add more models to cuda and rocm benchmarks
1 parent d532ab7 commit 21a97da

File tree

2 files changed

+548
-2
lines changed

2 files changed

+548
-2
lines changed

vllm-benchmarks/benchmarks/cuda/serving-tests.json

Lines changed: 295 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,90 @@
7878
"num_prompts": 200
7979
}
8080
},
81+
{
82+
"test_name": "serving_qwen3_30b_a3b_tp8_random_in1k_out2k",
83+
"qps_list": [10],
84+
"server_parameters": {
85+
"model": "Qwen/Qwen3-30B-A3B",
86+
"tensor_parallel_size": 8,
87+
"swap_space": 16,
88+
"disable_log_stats": "",
89+
"disable_log_requests": "",
90+
"load_format": "dummy",
91+
"max_model_len": 8192
92+
},
93+
"client_parameters": {
94+
"model": "Qwen/Qwen3-30B-A3B",
95+
"backend": "vllm",
96+
"dataset_name": "random",
97+
"num_prompts": 200,
98+
"random_input_len": 1024,
99+
"random_output_len": 2048
100+
}
101+
},
102+
{
103+
"test_name": "serving_gemma_3_27b_it_tp8_random_in1k_out2k",
104+
"qps_list": [10],
105+
"server_parameters": {
106+
"model": "google/gemma-3-27b-it",
107+
"tensor_parallel_size": 8,
108+
"swap_space": 16,
109+
"disable_log_stats": "",
110+
"disable_log_requests": "",
111+
"load_format": "dummy",
112+
"max_model_len": 8192
113+
},
114+
"client_parameters": {
115+
"model": "google/gemma-3-27b-it",
116+
"backend": "vllm",
117+
"dataset_name": "random",
118+
"num_prompts": 200,
119+
"random_input_len": 1024,
120+
"random_output_len": 2048
121+
}
122+
},
123+
{
124+
"test_name": "serving_gemma_3_4b_it_tp1_random_in1k_out2k",
125+
"qps_list": [10],
126+
"server_parameters": {
127+
"model": "google/gemma-3-4b-it",
128+
"tensor_parallel_size": 1,
129+
"swap_space": 16,
130+
"disable_log_stats": "",
131+
"disable_log_requests": "",
132+
"load_format": "dummy",
133+
"max_model_len": 8192
134+
},
135+
"client_parameters": {
136+
"model": "google/gemma-3-4b-it",
137+
"backend": "vllm",
138+
"dataset_name": "random",
139+
"num_prompts": 200,
140+
"random_input_len": 1024,
141+
"random_output_len": 2048
142+
}
143+
},
144+
{
145+
"test_name": "serving_qwen3_8b_tp1_random_in1k_out2k",
146+
"qps_list": [10],
147+
"server_parameters": {
148+
"model": "Qwen/Qwen3-8B",
149+
"tensor_parallel_size": 1,
150+
"swap_space": 16,
151+
"disable_log_stats": "",
152+
"disable_log_requests": "",
153+
"load_format": "dummy",
154+
"max_model_len": 8192
155+
},
156+
"client_parameters": {
157+
"model": "Qwen/Qwen3-8B",
158+
"backend": "vllm",
159+
"dataset_name": "random",
160+
"num_prompts": 200,
161+
"random_input_len": 1024,
162+
"random_output_len": 2048
163+
}
164+
},
81165
{
82166
"test_name": "serving_llama4_scout_tp4_sharegpt",
83167
"qps_list": [1, 4, 16, "inf"],
@@ -99,7 +183,112 @@
99183
}
100184
},
101185
{
102-
"test_name": "serving_llama4_maverick_fp8_tp8",
186+
"test_name": "serving_llama4_scout_tp4_random_in200_out200",
187+
"qps_list": [10],
188+
"server_parameters": {
189+
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
190+
"tensor_parallel_size": 4,
191+
"swap_space": 16,
192+
"disable_log_stats": "",
193+
"disable_log_requests": "",
194+
"load_format": "dummy",
195+
"max_model_len": 8192
196+
},
197+
"client_parameters": {
198+
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
199+
"backend": "vllm",
200+
"dataset_name": "random",
201+
"num_prompts": 200,
202+
"random_input_len": 200,
203+
"random_output_len": 200
204+
}
205+
},
206+
{
207+
"test_name": "serving_llama4_scout_tp4_random_in1k_out2k",
208+
"qps_list": [10],
209+
"server_parameters": {
210+
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
211+
"tensor_parallel_size": 4,
212+
"swap_space": 16,
213+
"disable_log_stats": "",
214+
"disable_log_requests": "",
215+
"load_format": "dummy",
216+
"max_model_len": 8192
217+
},
218+
"client_parameters": {
219+
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
220+
"backend": "vllm",
221+
"dataset_name": "random",
222+
"num_prompts": 200,
223+
"random_input_len": 1024,
224+
"random_output_len": 2048
225+
}
226+
},
227+
{
228+
"test_name": "serving_llama4_scout_tp4_random_in5k_out1k",
229+
"qps_list": [10],
230+
"server_parameters": {
231+
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
232+
"tensor_parallel_size": 4,
233+
"swap_space": 16,
234+
"disable_log_stats": "",
235+
"disable_log_requests": "",
236+
"load_format": "dummy",
237+
"max_model_len": 8192
238+
},
239+
"client_parameters": {
240+
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
241+
"backend": "vllm",
242+
"dataset_name": "random",
243+
"num_prompts": 200,
244+
"random_input_len": 5120,
245+
"random_output_len": 1024
246+
}
247+
},
248+
{
249+
"test_name": "serving_llama4_scout_tp4_random_in10k_out500",
250+
"qps_list": [10],
251+
"server_parameters": {
252+
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
253+
"tensor_parallel_size": 4,
254+
"swap_space": 16,
255+
"disable_log_stats": "",
256+
"disable_log_requests": "",
257+
"load_format": "dummy",
258+
"max_model_len": 11264
259+
},
260+
"client_parameters": {
261+
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
262+
"backend": "vllm",
263+
"dataset_name": "random",
264+
"num_prompts": 200,
265+
"random_input_len": 10240,
266+
"random_output_len": 500
267+
}
268+
},
269+
{
270+
"test_name": "serving_llama4_scout_tp4_random_in30k_out100",
271+
"qps_list": [10],
272+
"server_parameters": {
273+
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
274+
"tensor_parallel_size": 4,
275+
"swap_space": 16,
276+
"disable_log_stats": "",
277+
"disable_log_requests": "",
278+
"load_format": "dummy",
279+
"max_model_len": 31744
280+
},
281+
"client_parameters": {
282+
"model": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
283+
"backend": "vllm",
284+
"dataset_name": "random",
285+
"num_prompts": 200,
286+
"random_input_len": 30720,
287+
"random_output_len": 100
288+
}
289+
},
290+
{
291+
"test_name": "serving_llama4_maverick_fp8_tp8_sharegpt",
103292
"qps_list": [1, 4, 16, "inf"],
104293
"server_parameters": {
105294
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
@@ -117,5 +306,110 @@
117306
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
118307
"num_prompts": 200
119308
}
309+
},
310+
{
311+
"test_name": "serving_llama4_maverick_fp8_tp8_random_in200_out200",
312+
"qps_list": [10],
313+
"server_parameters": {
314+
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
315+
"tensor_parallel_size": 8,
316+
"swap_space": 16,
317+
"disable_log_stats": "",
318+
"disable_log_requests": "",
319+
"load_format": "dummy",
320+
"max_model_len": 8192
321+
},
322+
"client_parameters": {
323+
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
324+
"backend": "vllm",
325+
"dataset_name": "random",
326+
"num_prompts": 200,
327+
"random_input_len": 200,
328+
"random_output_len": 200
329+
}
330+
},
331+
{
332+
"test_name": "serving_llama4_maverick_fp8_tp8_random_in1k_out2k",
333+
"qps_list": [10],
334+
"server_parameters": {
335+
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
336+
"tensor_parallel_size": 8,
337+
"swap_space": 16,
338+
"disable_log_stats": "",
339+
"disable_log_requests": "",
340+
"load_format": "dummy",
341+
"max_model_len": 8192
342+
},
343+
"client_parameters": {
344+
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
345+
"backend": "vllm",
346+
"dataset_name": "random",
347+
"num_prompts": 200,
348+
"random_input_len": 1024,
349+
"random_output_len": 2048
350+
}
351+
},
352+
{
353+
"test_name": "serving_llama4_maverick_fp8_tp8_random_in5k_out1k",
354+
"qps_list": [10],
355+
"server_parameters": {
356+
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
357+
"tensor_parallel_size": 8,
358+
"swap_space": 16,
359+
"disable_log_stats": "",
360+
"disable_log_requests": "",
361+
"load_format": "dummy",
362+
"max_model_len": 8192
363+
},
364+
"client_parameters": {
365+
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
366+
"backend": "vllm",
367+
"dataset_name": "random",
368+
"num_prompts": 200,
369+
"random_input_len": 5120,
370+
"random_output_len": 1024
371+
}
372+
},
373+
{
374+
"test_name": "serving_llama4_maverick_fp8_tp8_random_in10k_out500",
375+
"qps_list": [10],
376+
"server_parameters": {
377+
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
378+
"tensor_parallel_size": 8,
379+
"swap_space": 16,
380+
"disable_log_stats": "",
381+
"disable_log_requests": "",
382+
"load_format": "dummy",
383+
"max_model_len": 11264
384+
},
385+
"client_parameters": {
386+
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
387+
"backend": "vllm",
388+
"dataset_name": "random",
389+
"num_prompts": 200,
390+
"random_input_len": 10240,
391+
"random_output_len": 500
392+
}
393+
},
394+
{
395+
"test_name": "serving_llama4_maverick_fp8_tp8_random_in30k_out100",
396+
"qps_list": [10],
397+
"server_parameters": {
398+
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
399+
"tensor_parallel_size": 8,
400+
"swap_space": 16,
401+
"disable_log_stats": "",
402+
"disable_log_requests": "",
403+
"load_format": "dummy",
404+
"max_model_len": 31744
405+
},
406+
"client_parameters": {
407+
"model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
408+
"backend": "vllm",
409+
"dataset_name": "random",
410+
"num_prompts": 200,
411+
"random_input_len": 30720,
412+
"random_output_len": 100
413+
}
120414
}
121415
]

0 commit comments

Comments
 (0)