@@ -36,6 +36,7 @@
 @dataclass
 class RequestFuncInput:
     """Input for requesting LLMs via API"""
+    no: int
     prompt: str
     history_QA: Optional[dict]
     hyper_parameters: dict
@@ -54,6 +55,7 @@ class RequestFuncInput:
 @dataclass
 class RequestFuncOutput:
     """Output for requesting LLMs via API"""
+    no: int = 0
     generated_text: str = ""
     reasoning_content: str = ""
     success: bool = False
@@ -84,7 +86,7 @@ async def async_request_eb_openai_chat_completions(
         if request_func_input.multi_modal_content:
             content.append(request_func_input.multi_modal_content)
         payload = {
-            "model": "default",
+            "model": request_func_input.model,
             "messages": request_func_input.history_QA,
             "stream": True,
             "stream_options": {
@@ -97,13 +99,17 @@ async def async_request_eb_openai_chat_completions(

         if request_func_input.ignore_eos:
             payload["ignore_eos"] = request_func_input.ignore_eos
+
+        print("payload:{}".format(json.dumps(payload, ensure_ascii=False)))
+
         headers = {
             "Content-Type": "application/json",
             "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
         }

         output = RequestFuncOutput()
         output.prompt_len = 0
+        output.no = request_func_input.no

         ttft = 0.0
         st = time.perf_counter()
@@ -132,7 +138,8 @@ async def async_request_eb_openai_chat_completions(
                                     ttft = timestamp - st
                                     output.ttft = ttft
                                     # cached_tokens
-                                    output.prompt_len = data["usage"]["prompt_tokens_details"]["cached_tokens"]
+                                    output.prompt_len = data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
+

                                 # Decoding phase
                                 else:
@@ -141,12 +148,12 @@ async def async_request_eb_openai_chat_completions(

                                 output.generated_text += content or ""
                                 output.reasoning_content += reason_content or ""
-                                output.arrival_time.append(choices[0].get("arrival_time"))
-                            elif usage := data.get("usage"):
+                                output.arrival_time.append(choices[0].get("arrival_time", timestamp))
+                            elif usage := data.get("usage", {}):
                                 output.output_tokens = usage.get(
-                                    "completion_tokens")
+                                    "completion_tokens", 0)
                                 output.prompt_tokens = usage.get(
-                                    "prompt_tokens")
+                                    "prompt_tokens", 0)

                             most_recent_timestamp = timestamp

@@ -173,6 +180,7 @@ async def async_request_eb_openai_chat_completions(
                 f.write(str(output) + "\n")
     if pbar:
         pbar.update(1)
+    print("#####final_output:", output)
     return output


@@ -189,7 +197,7 @@ async def async_request_eb_openai_completions(
     async with aiohttp.ClientSession(trust_env=True,
                                      timeout=AIOHTTP_TIMEOUT) as session:
         payload = {
-            "model": "default",
+            "model": request_func_input.model,
             "prompt": request_func_input.prompt,
             "stream": True,
             "stream_options": {
@@ -202,14 +210,20 @@ async def async_request_eb_openai_completions(

         if request_func_input.ignore_eos:
             payload["ignore_eos"] = request_func_input.ignore_eos
+
+        print("payload:", json.dumps(payload, ensure_ascii=False))
+
         headers = {
-            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+            "Content-Type": "application/json"
         }

         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
+        output.no = request_func_input.no

         generated_text = ""
+        ttft = 0.0
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
@@ -226,6 +240,7 @@ async def async_request_eb_openai_completions(
                             "data: ")
                         if chunk != "[DONE]":
                             # print("####chunk:", chunk, chunk.usage)
+                            timestamp = time.perf_counter()
                             data = json.loads(chunk)

                             # NOTE: Some completion API might have a last
@@ -235,21 +250,22 @@ async def async_request_eb_openai_completions(
                                 # Note that text could be empty here
                                 # e.g. for special tokens
                                 text = choices[0].get("text")
-                                timestamp = time.perf_counter()
+
                                 # First token
                                 if not first_chunk_received:
                                     first_chunk_received = True
-                                    ttft = time.perf_counter() - st
+                                    ttft = timestamp - st
                                     output.ttft = ttft

                                 # Decoding phase
                                 else:
                                     output.itl.append(timestamp -
                                                       most_recent_timestamp)
+
+                                generated_text += text or ""

                                 most_recent_timestamp = timestamp
-                                output.arrival_time.append(choices[0].get("arrival_time"))
-                                generated_text += text or ""
+                                output.arrival_time.append(choices[0].get("arrival_time", timestamp))
                             elif usage := data.get("usage"):
                                 output.prompt_tokens = usage.get(
                                     "prompt_tokens")
@@ -262,15 +278,24 @@ async def async_request_eb_openai_completions(
                         output.error = (
                             "Never received a valid chunk to calculate TTFT."
                             "This response will be marked as failed!")
+
                     output.generated_text = generated_text
                     output.latency = most_recent_timestamp - st
+
+                    if output.generated_text == "":
+                        output.success = False
+                        output.error = "No generated text found!"
+                    else:
+                        output.success = True
                 else:
                     output.error = response.reason or ""
                     output.success = False
         except Exception:
             output.success = False
             exc_info = sys.exc_info()
             output.error = "".join(traceback.format_exception(*exc_info))
+
+        print("final_output:{}".format(output))

         if pbar:
             pbar.update(1)
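A minimal usage sketch (not part of the patch, using trimmed-down stand-ins for the dataclasses above): it illustrates why the new `no` field is threaded from RequestFuncInput into RequestFuncOutput. When many requests are fired concurrently, completion order is arbitrary, so the index is what lets each streamed output be matched back to its originating input. The `fake_request` coroutine is a hypothetical placeholder for async_request_eb_openai_chat_completions.

import asyncio
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class RequestFuncInput:
    # Trimmed to the fields relevant to this sketch.
    no: int
    prompt: str
    history_QA: Optional[dict] = None
    hyper_parameters: dict = field(default_factory=dict)


@dataclass
class RequestFuncOutput:
    no: int = 0
    generated_text: str = ""
    success: bool = False


async def fake_request(inp: RequestFuncInput) -> RequestFuncOutput:
    # Hypothetical stand-in for the real streaming API call.
    out = RequestFuncOutput()
    out.no = inp.no  # propagate the request index, as the patch does
    out.generated_text = f"echo: {inp.prompt}"
    out.success = True
    return out


async def main() -> None:
    inputs = [RequestFuncInput(no=i, prompt=f"question {i}") for i in range(3)]
    outputs = await asyncio.gather(*(fake_request(inp) for inp in inputs))
    # Re-associate each output with its input by `no`, regardless of completion order.
    by_no = {out.no: out for out in outputs}
    for inp in inputs:
        print(inp.no, by_no[inp.no].generated_text)


if __name__ == "__main__":
    asyncio.run(main())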