 from typing import Optional

 import datasets
+import logging
 import matplotlib.pyplot as plt
 import numpy as np
 import requests
 from tqdm.contrib.concurrent import thread_map


+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("server-bench")
+
+
 def get_prompts(n_prompts: int) -> list[str]:
-    print("Loading MMLU dataset...")
+    logger.info("Loading MMLU dataset...")
     ret = datasets.load_dataset("cais/mmlu", "all")["test"]["question"]
     if n_prompts >= 0:
         ret = ret[:n_prompts]
     return ret


 def get_server(path_server: str, path_model: str, path_log: Optional[str], port: int, n_gpu_layers: int, parallel: int, ctx_size: int) -> dict:
-    print("Starting the llama.cpp server...")
+    logger.info("Starting the llama.cpp server...")
     address = f"http://localhost:{port}"

     popen_args: list[str] = [
@@ -121,11 +126,10 @@ def benchmark(path_server: str, path_model: str, path_log: Optional[str], port:
         for i, p in enumerate(prompts):
             data.append({"session": session, "server_address": server_address, "prompt": p, "n_predict": n_predict, "seed": i})

-        print("Getting the prompt lengths...")
+        logger.info("Getting the prompt lengths...")
         prompt_n: list[int] = [get_prompt_length(d) for d in data]

-        print("Starting the benchmark...")
-        print()
+        logger.info("Starting the benchmark...\n")
         t0 = time()
         results: list[tuple[int, list[float]]] = thread_map(send_prompt, data, max_workers=parallel + 1, chunksize=1)
     finally:
@@ -149,17 +153,17 @@ def benchmark(path_server: str, path_model: str, path_log: Optional[str], port:
     token_t -= t0
     token_t_last = np.max(token_t)

-    print()
-    print(f"Benchmark duration: {token_t_last:.2f} s")
-    print(f"Request throughput: {n_prompts / token_t_last:.2f} requests/s = {n_prompts / (token_t_last / 60):.2f} requests/min")
-    print(f"Total prompt length: {np.sum(prompt_n)} tokens")
-    print(f"Average prompt length: {np.mean(prompt_n):.2f} tokens")
-    print(f"Average prompt latency: {np.mean(prompt_ms):.2f} ms")
-    print(f"Average prompt speed: {np.sum(prompt_n) / (1e-3 * np.sum(prompt_ms)):.2f} tokens/s")
-    print(f"Total generated tokens: {token_t.shape[0]}")
-    print(f"Average generation depth: {depth_sum / token_t.shape[0]:.2f} tokens")
-    print(f"Average total generation speed: {token_t.shape[0] / token_t_last:.2f} tokens/s")
-    print(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot")
+    logger.info("")
+    logger.info(f"Benchmark duration: {token_t_last:.2f} s")
+    logger.info(f"Request throughput: {n_prompts / token_t_last:.2f} requests/s = {n_prompts / (token_t_last / 60):.2f} requests/min")
+    logger.info(f"Total prompt length: {np.sum(prompt_n)} tokens")
+    logger.info(f"Average prompt length: {np.mean(prompt_n):.2f} tokens")
+    logger.info(f"Average prompt latency: {np.mean(prompt_ms):.2f} ms")
+    logger.info(f"Average prompt speed: {np.sum(prompt_n) / (1e-3 * np.sum(prompt_ms)):.2f} tokens/s")
+    logger.info(f"Total generated tokens: {token_t.shape[0]}")
+    logger.info(f"Average generation depth: {depth_sum / token_t.shape[0]:.2f} tokens")
+    logger.info(f"Average total generation speed: {token_t.shape[0] / token_t_last:.2f} tokens/s")
+    logger.info(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot")

     plt.figure()
     plt.scatter(prompt_n, prompt_ms, s=10.0, marker=".", alpha=0.25)
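For reference, the change above is the standard Python logging pattern: a root handler configured at INFO via logging.basicConfig() and a named "server-bench" logger whose info() calls replace the old print() statements. A minimal standalone sketch of how that pattern behaves (illustrative messages and level changes only, not part of the patch):

import logging

# Same setup as in the patch: root handler at INFO, plus a named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("server-bench")

logger.info("Benchmark duration: %.2f s", 12.34)  # emitted as INFO:server-bench:...
logger.debug("per-request details")               # suppressed at INFO level

# Hypothetical tweak (not in the script): silence the INFO summary lines
# by raising this logger's threshold.
logger.setLevel(logging.WARNING)
logger.info("now suppressed")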