[Benchmark] Download model from modelscope (#634)

Potabk · web-flow · commit 866ce7168cc2 · 2025-04-24T14:48:24.000+08:00
### What this PR does / why we need it?
- Running the benchmark scripts will now download the model from ModelScope.

Signed-off-by: wangli <wangli858794774@gmail.com>
diff --git a/benchmarks/scripts/run-performance-benchmarks.sh b/benchmarks/scripts/run-performance-benchmarks.sh
@@ -264,6 +264,10 @@ main() {
   # turn of the reporting of the status of each request, to clean up the terminal output
   export VLLM_LOG_LEVEL="WARNING"
 
+  # set env
+  export VLLM_USE_MODELSCOPE="True"
+  export HF_ENDPOINT="https://hf-mirror.com"
+
   # prepare for benchmarking
   cd benchmarks || exit 1
   get_benchmarks_scripts
diff --git a/benchmarks/tests/latency-tests.json b/benchmarks/tests/latency-tests.json
@@ -2,7 +2,7 @@
   {
     "test_name": "latency_llama8B_tp1",
     "parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "load_format": "dummy",
       "num_iters_warmup": 5,
diff --git a/benchmarks/tests/serving-tests.json b/benchmarks/tests/serving-tests.json
@@ -8,15 +8,15 @@
       "inf"
     ],
     "server_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "swap_space": 16,
       "disable_log_stats": "",
       "disable_log_requests": "",
       "load_format": "dummy"
     },
     "client_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
       "backend": "vllm",
       "dataset_name": "sharegpt",
       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
diff --git a/benchmarks/tests/throughput-tests.json b/benchmarks/tests/throughput-tests.json
@@ -2,7 +2,7 @@
   {
     "test_name": "throughput_llama8B_tp1",
     "parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",
       "tensor_parallel_size": 1,
       "load_format": "dummy",
       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`	`{`
`3`	`3`	`"test_name": "latency_llama8B_tp1",`
`4`	`4`	`"parameters": {`
`5`		`- "model": "meta-llama/Llama-3.1-8B-Instruct",`
	`5`	`+ "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",`
`6`	`6`	`"tensor_parallel_size": 1,`
`7`	`7`	`"load_format": "dummy",`
`8`	`8`	`"num_iters_warmup": 5,`
Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`	`{`
`3`	`3`	`"test_name": "throughput_llama8B_tp1",`
`4`	`4`	`"parameters": {`
`5`		`- "model": "meta-llama/Llama-3.1-8B-Instruct",`
	`5`	`+ "model": "LLM-Research/Meta-Llama-3.1-8B-Instruct",`
`6`	`6`	`"tensor_parallel_size": 1,`
`7`	`7`	`"load_format": "dummy",`
`8`	`8`	`"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",`