diff --git a/vllm-benchmarks/benchmarks/cuda/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json index 9456bb8..66b7c4d 100644 --- a/vllm-benchmarks/benchmarks/cuda/serving-tests.json +++ b/vllm-benchmarks/benchmarks/cuda/serving-tests.json @@ -78,6 +78,90 @@ "num_prompts": 200 } }, + { + "test_name": "serving_qwen3_30b_a3b_tp8_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "Qwen/Qwen3-30B-A3B", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "Qwen/Qwen3-30B-A3B", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } + }, + { + "test_name": "serving_gemma_3_27b_it_tp8_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "google/gemma-3-27b-it", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "google/gemma-3-27b-it", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } + }, + { + "test_name": "serving_gemma_3_4b_it_tp1_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "google/gemma-3-4b-it", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "google/gemma-3-4b-it", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } + }, + { + "test_name": "serving_qwen3_8b_tp1_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "Qwen/Qwen3-8B", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } + }, { "test_name": "serving_llama4_scout_tp4_sharegpt", "qps_list": [1, 4, 16, "inf"], @@ -99,7 +183,112 @@ } }, { - "test_name": "serving_llama4_maverick_fp8_tp8", + "test_name": "serving_llama4_scout_tp4_random_in200_out200", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 200, + "random_output_len": 200 + } + }, + { + "test_name": "serving_llama4_scout_tp4_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } + }, + { + "test_name": "serving_llama4_scout_tp4_random_in5k_out1k", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5120, + "random_output_len": 1024 + } + }, + { + "test_name": "serving_llama4_scout_tp4_random_in10k_out500", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 11264 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 500 + } + }, + { + "test_name": "serving_llama4_scout_tp4_random_in30k_out100", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 31744 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 30720, + "random_output_len": 100 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_sharegpt", "qps_list": [1, 4, 16, "inf"], "server_parameters": { "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", @@ -117,5 +306,110 @@ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_random_in200_out200", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 200, + "random_output_len": 200 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_random_in5k_out1k", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5120, + "random_output_len": 1024 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_random_in10k_out500", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 11264 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 10240, + "random_output_len": 500 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_random_in30k_out100", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 31744 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 30720, + "random_output_len": 100 + } } ] diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json index 9456bb8..7b32d38 100644 --- a/vllm-benchmarks/benchmarks/rocm/serving-tests.json +++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json @@ -99,7 +99,112 @@ } }, { - "test_name": "serving_llama4_maverick_fp8_tp8", + "test_name": "serving_llama4_scout_tp4_random_in200_out200", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 200, + "random_output_len": 200 + } + }, + { + "test_name": "serving_llama4_scout_tp4_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } + }, + { + "test_name": "serving_llama4_scout_tp4_random_in5k_out1k", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5120, + "random_output_len": 1024 + } + }, + { + "test_name": "serving_llama4_scout_tp4_random_in10k_out500", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 11264 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 500 + } + }, + { + "test_name": "serving_llama4_scout_tp4_random_in30k_out100", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 31744 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 30720, + "random_output_len": 100 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_sharegpt", "qps_list": [1, 4, 16, "inf"], "server_parameters": { "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", @@ -117,5 +222,193 @@ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_random_in200_out200", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 200, + "random_output_len": 200 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_random_in5k_out1k", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 5120, + "random_output_len": 1024 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_random_in10k_out500", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 11264 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 10240, + "random_output_len": 500 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8_random_in30k_out100", + "qps_list": [10], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 31744 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 30720, + "random_output_len": 100 + } + }, + { + "test_name": "serving_gemma_3_27b_it_tp8_in1k_out2k", + "qps_list": ["inf"], + "server_parameters": { + "model": "google/gemma-3-27b-it", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "google/gemma-3-27b-it", + "backend": "vllm", + "dataset_name": "random", + "random_input_len": 1024, + "random_output_len": 2048 + } + }, + { + "test_name": "serving_gemma_3_4b_it_tp1_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "google/gemma-3-4b-it", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "google/gemma-3-4b-it", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } + }, + { + "test_name": "serving_qwen3_30b_a3b_tp8_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "Qwen/Qwen3-30B-A3B", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "Qwen/Qwen3-30B-A3B", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } + }, + { + "test_name": "serving_qwen3_8b_tp1_random_in1k_out2k", + "qps_list": [10], + "server_parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "Qwen/Qwen3-8B", + "backend": "vllm", + "dataset_name": "random", + "num_prompts": 200, + "random_input_len": 1024, + "random_output_len": 2048 + } } ]