Skip to content

Commit db49ff8

Browse files
authored
server : replace sleep with condition variables (#4673)
The server currently schedules tasks using a sleep(5ms) busy loop. This adds unnecessary latency, since most sleep implementations round up to the system scheduling quantum (usually 10ms). Other libc sleep implementations spin for smaller time intervals, which results in the server's busy loop consuming all available CPU. Having explicit notify()/wait() code also aids the readability of the server code. See Mozilla-Ocho/llamafile@711344b
1 parent 60f55e8 commit db49ff8

File tree

1 file changed

+26
-15
lines changed

1 file changed

+26
-15
lines changed

examples/server/server.cpp

Lines changed: 26 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <thread>
2626
#include <mutex>
2727
#include <chrono>
28+
#include <condition_variable>
2829

2930
#ifndef SERVER_VERBOSE
3031
#define SERVER_VERBOSE 1
@@ -541,7 +542,9 @@ struct llama_server_context
541542
std::vector<task_result> queue_results;
542543
std::vector<task_multi> queue_multitasks;
543544
std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks
545+
std::condition_variable condition_tasks;
544546
std::mutex mutex_results;
547+
std::condition_variable condition_results;
545548

546549
~llama_server_context()
547550
{
@@ -1169,14 +1172,15 @@ struct llama_server_context
11691172

11701173
void send_error(task_server& task, std::string error)
11711174
{
1172-
std::lock_guard<std::mutex> lock(mutex_results);
1175+
std::unique_lock<std::mutex> lock(mutex_results);
11731176
task_result res;
11741177
res.id = task.id;
11751178
res.multitask_id = task.multitask_id;
11761179
res.stop = false;
11771180
res.error = true;
11781181
res.result_json = { { "content", error } };
11791182
queue_results.push_back(res);
1183+
condition_results.notify_all();
11801184
}
11811185

11821186
void add_multi_task(int id, std::vector<int>& sub_ids)
@@ -1186,6 +1190,7 @@ struct llama_server_context
11861190
multi.id = id;
11871191
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
11881192
queue_multitasks.push_back(multi);
1193+
condition_tasks.notify_one();
11891194
}
11901195

11911196
void update_multi_task(int multitask_id, int subtask_id, task_result& result)
@@ -1197,6 +1202,7 @@ struct llama_server_context
11971202
{
11981203
multitask.subtasks_remaining.erase(subtask_id);
11991204
multitask.results.push_back(result);
1205+
condition_tasks.notify_one();
12001206
}
12011207
}
12021208
}
@@ -1244,7 +1250,7 @@ struct llama_server_context
12441250

12451251
void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
12461252
{
1247-
std::lock_guard<std::mutex> lock(mutex_results);
1253+
std::unique_lock<std::mutex> lock(mutex_results);
12481254
task_result res;
12491255
res.id = slot.task_id;
12501256
res.multitask_id = slot.multitask_id;
@@ -1280,11 +1286,12 @@ struct llama_server_context
12801286
}
12811287

12821288
queue_results.push_back(res);
1289+
condition_results.notify_all();
12831290
}
12841291

12851292
void send_final_response(llama_client_slot &slot)
12861293
{
1287-
std::lock_guard<std::mutex> lock(mutex_results);
1294+
std::unique_lock<std::mutex> lock(mutex_results);
12881295
task_result res;
12891296
res.id = slot.task_id;
12901297
res.multitask_id = slot.multitask_id;
@@ -1340,11 +1347,12 @@ struct llama_server_context
13401347
}
13411348

13421349
queue_results.push_back(res);
1350+
condition_results.notify_all();
13431351
}
13441352

13451353
void send_embedding(llama_client_slot &slot)
13461354
{
1347-
std::lock_guard<std::mutex> lock(mutex_results);
1355+
std::unique_lock<std::mutex> lock(mutex_results);
13481356
task_result res;
13491357
res.id = slot.task_id;
13501358
res.multitask_id = slot.multitask_id;
@@ -1372,6 +1380,7 @@ struct llama_server_context
13721380
};
13731381
}
13741382
queue_results.push_back(res);
1383+
condition_results.notify_all();
13751384
}
13761385

13771386
int request_completion(json data, bool infill, bool embedding, int multitask_id)
@@ -1395,20 +1404,18 @@ struct llama_server_context
13951404

13961405
// otherwise, it's a single-prompt task, we actually queue it
13971406
queue_tasks.push_back(task);
1407+
condition_tasks.notify_one();
13981408
return task.id;
13991409
}
14001410

14011411
task_result next_result(int task_id)
14021412
{
14031413
while (true)
14041414
{
1405-
std::this_thread::sleep_for(std::chrono::microseconds(5));
1406-
std::lock_guard<std::mutex> lock(mutex_results);
1407-
1408-
if (queue_results.empty())
1409-
{
1410-
continue;
1411-
}
1415+
std::unique_lock<std::mutex> lock(mutex_results);
1416+
condition_results.wait(lock, [&]{
1417+
return !queue_results.empty();
1418+
});
14121419

14131420
for (int i = 0; i < (int) queue_results.size(); i++)
14141421
{
@@ -1504,12 +1511,13 @@ struct llama_server_context
15041511

15051512
void request_cancel(int task_id)
15061513
{
1507-
std::lock_guard<std::mutex> lock(mutex_tasks);
1514+
std::unique_lock<std::mutex> lock(mutex_tasks);
15081515
task_server task;
15091516
task.id = id_gen++;
15101517
task.type = CANCEL_TASK;
15111518
task.target_id = task_id;
15121519
queue_tasks.push_back(task);
1520+
condition_tasks.notify_one();
15131521
}
15141522

15151523
int split_multiprompt_task(task_server& multiprompt_task)
@@ -1535,7 +1543,7 @@ struct llama_server_context
15351543

15361544
void process_tasks()
15371545
{
1538-
std::lock_guard<std::mutex> lock(mutex_tasks);
1546+
std::unique_lock<std::mutex> lock(mutex_tasks);
15391547
while (!queue_tasks.empty())
15401548
{
15411549
task_server task = queue_tasks.front();
@@ -1607,6 +1615,7 @@ struct llama_server_context
16071615

16081616
std::lock_guard<std::mutex> lock(mutex_results);
16091617
queue_results.push_back(aggregate_result);
1618+
condition_results.notify_all();
16101619

16111620
queue_iterator = queue_multitasks.erase(queue_iterator);
16121621
}
@@ -1637,8 +1646,10 @@ struct llama_server_context
16371646
LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
16381647
kv_cache_clear();
16391648
}
1640-
// avoid 100% usage of cpu all time
1641-
std::this_thread::sleep_for(std::chrono::milliseconds(5));
1649+
std::unique_lock<std::mutex> lock(mutex_tasks);
1650+
condition_tasks.wait(lock, [&]{
1651+
return !queue_tasks.empty();
1652+
});
16421653
}
16431654

16441655
for (llama_client_slot &slot : slots)

0 commit comments

Comments
 (0)