@@ -25,6 +25,7 @@
 #include <thread>
 #include <mutex>
 #include <chrono>
+#include <condition_variable>
 
 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -541,7 +542,9 @@ struct llama_server_context
     std::vector<task_result> queue_results;
     std::vector<task_multi> queue_multitasks;
     std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
+    std::condition_variable condition_tasks;
     std::mutex mutex_results;
+    std::condition_variable condition_results;
 
     ~llama_server_context()
     {
@@ -1169,14 +1172,15 @@ struct llama_server_context
 
     void send_error(task_server& task, std::string error)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
+        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = task.id;
         res.multitask_id = task.multitask_id;
         res.stop = false;
         res.error = true;
         res.result_json = { { "content", error } };
         queue_results.push_back(res);
+        condition_results.notify_all();
     }
 
     void add_multi_task(int id, std::vector<int>& sub_ids)
@@ -1186,6 +1190,7 @@ struct llama_server_context
         multi.id = id;
         std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
         queue_multitasks.push_back(multi);
+        condition_tasks.notify_one();
     }
 
     void update_multi_task(int multitask_id, int subtask_id, task_result& result)
@@ -1197,6 +1202,7 @@ struct llama_server_context
             {
                 multitask.subtasks_remaining.erase(subtask_id);
                 multitask.results.push_back(result);
+                condition_tasks.notify_one();
             }
         }
     }
@@ -1244,7 +1250,7 @@ struct llama_server_context
 
     void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
+        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.multitask_id = slot.multitask_id;
@@ -1280,11 +1286,12 @@ struct llama_server_context
         }
 
         queue_results.push_back(res);
+        condition_results.notify_all();
     }
 
     void send_final_response(llama_client_slot &slot)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
+        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.multitask_id = slot.multitask_id;
@@ -1340,11 +1347,12 @@ struct llama_server_context
         }
 
         queue_results.push_back(res);
+        condition_results.notify_all();
     }
 
     void send_embedding(llama_client_slot &slot)
     {
-        std::lock_guard<std::mutex> lock(mutex_results);
+        std::unique_lock<std::mutex> lock(mutex_results);
         task_result res;
         res.id = slot.task_id;
         res.multitask_id = slot.multitask_id;
@@ -1372,6 +1380,7 @@ struct llama_server_context
             };
         }
         queue_results.push_back(res);
+        condition_results.notify_all();
     }
 
     int request_completion(json data, bool infill, bool embedding, int multitask_id)
@@ -1395,20 +1404,18 @@ struct llama_server_context
 
         // otherwise, it's a single-prompt task, we actually queue it
         queue_tasks.push_back(task);
+        condition_tasks.notify_one();
         return task.id;
     }
 
     task_result next_result(int task_id)
     {
         while (true)
         {
-            std::this_thread::sleep_for(std::chrono::microseconds(5));
-            std::lock_guard<std::mutex> lock(mutex_results);
-
-            if (queue_results.empty())
-            {
-                continue;
-            }
+            std::unique_lock<std::mutex> lock(mutex_results);
+            condition_results.wait(lock, [&]{
+                return !queue_results.empty();
+            });
 
             for (int i = 0; i < (int) queue_results.size(); i++)
             {
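For readers unfamiliar with the idiom, the `next_result` hunk above is the heart of the change: a 5 µs sleep-poll loop becomes a blocking `condition_variable::wait` on a predicate. The switch from `std::lock_guard` to `std::unique_lock` is required because `std::condition_variable::wait` takes a `std::unique_lock<std::mutex>&` (it must be able to release and re-acquire the lock). A minimal standalone sketch of this consumer side, with illustrative names (`mtx`, `cv`, `results`) rather than the actual `server.cpp` members:

```cpp
#include <condition_variable>
#include <deque>
#include <mutex>

std::mutex              mtx;
std::condition_variable cv;
std::deque<int>         results;

int next_result_sketch()
{
    std::unique_lock<std::mutex> lock(mtx);
    // wait() atomically releases the lock and blocks until notified; the
    // predicate is re-checked on every wakeup, which guards against both
    // spurious wakeups and notifications that arrived before the wait.
    cv.wait(lock, [&]{ return !results.empty(); });
    int r = results.front();
    results.pop_front();
    return r;
}
```

Unlike the old loop, the thread consumes no CPU while the queue is empty, and it wakes as soon as a producer notifies instead of on the next polling tick.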
@@ -1504,12 +1511,13 @@ struct llama_server_context
 
     void request_cancel(int task_id)
     {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
+        std::unique_lock<std::mutex> lock(mutex_tasks);
         task_server task;
         task.id = id_gen++;
         task.type = CANCEL_TASK;
         task.target_id = task_id;
         queue_tasks.push_back(task);
+        condition_tasks.notify_one();
     }
 
     int split_multiprompt_task(task_server& multiprompt_task)
@@ -1535,7 +1543,7 @@ struct llama_server_context
 
     void process_tasks()
     {
-        std::lock_guard<std::mutex> lock(mutex_tasks);
+        std::unique_lock<std::mutex> lock(mutex_tasks);
         while (!queue_tasks.empty())
         {
             task_server task = queue_tasks.front();
@@ -1607,6 +1615,7 @@ struct llama_server_context
 
                 std::lock_guard<std::mutex> lock(mutex_results);
                 queue_results.push_back(aggregate_result);
+                condition_results.notify_all();
 
                 queue_iterator = queue_multitasks.erase(queue_iterator);
             }
@@ -1637,8 +1646,10 @@ struct llama_server_context
                 LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
                 kv_cache_clear();
             }
-            // avoid 100% usage of cpu all time
-            std::this_thread::sleep_for(std::chrono::milliseconds(5));
+            std::unique_lock<std::mutex> lock(mutex_tasks);
+            condition_tasks.wait(lock, [&]{
+                return !queue_tasks.empty();
+            });
         }
 
         for (llama_client_slot &slot : slots)
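On the producer side, every site that pushes into `queue_results` or `queue_tasks` now notifies after the push: `notify_all()` for results, where several HTTP handler threads may be blocked in `next_result()`, and `notify_one()` for tasks, which are drained by a single processing loop. A minimal, self-contained sketch of this producer/consumer handshake under those assumptions (the names `mtx`, `cv`, `tasks`, `producer`, `consumer` are illustrative, not from `server.cpp`):

```cpp
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <thread>

std::mutex              mtx;
std::condition_variable cv;
std::deque<int>         tasks;

void producer()
{
    for (int i = 0; i < 3; i++) {
        {
            // mutate the shared queue only while holding the mutex
            std::lock_guard<std::mutex> lock(mtx);
            tasks.push_back(i);
        }
        // wake one waiter; with a single consumer thread, notify_one
        // suffices (the patch uses notify_all where many threads wait)
        cv.notify_one();
    }
}

void consumer()
{
    for (int received = 0; received < 3; ) {
        std::unique_lock<std::mutex> lock(mtx);
        cv.wait(lock, [&]{ return !tasks.empty(); });
        // drain everything available under the lock
        while (!tasks.empty()) {
            printf("got task %d\n", tasks.front());
            tasks.pop_front();
            received++;
        }
    }
}

int main()
{
    std::thread t1(producer);
    std::thread t2(consumer);
    t1.join();
    t2.join();
}
```

Because `wait(lock, pred)` re-checks the predicate before blocking, a notification sent before the consumer starts waiting is never lost: the consumer sees the non-empty queue and proceeds without sleeping, which is what lets the patch drop the fixed 5 ms/5 µs polling intervals entirely.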