Skip to content

Update server.cpp example with correct startup sequence #6739

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 103 additions & 99 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3009,6 +3009,108 @@ int main(int argc, char ** argv) {
log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
}

const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) {
server_state current_state = state.load();
switch (current_state) {
case SERVER_STATE_READY:
{
// request slots data using task queue
server_task task;
task.id = ctx_server.queue_tasks.get_new_id();
task.type = SERVER_TASK_TYPE_METRICS;
task.id_target = -1;

ctx_server.queue_results.add_waiting_task_id(task.id);
ctx_server.queue_tasks.post(task);

// get the result
server_task_result result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);

const int n_idle_slots = result.data["idle"];
const int n_processing_slots = result.data["processing"];

json health = {
{"status", "ok"},
{"slots_idle", n_idle_slots},
{"slots_processing", n_processing_slots}
};

res.status = 200; // HTTP OK
if (sparams.slots_endpoint && req.has_param("include_slots")) {
health["slots"] = result.data["slots"];
}

if (n_idle_slots == 0) {
health["status"] = "no slot available";
if (req.has_param("fail_on_no_slot")) {
res.status = 503; // HTTP Service Unavailable
}
}

res.set_content(health.dump(), "application/json");
break;
}
case SERVER_STATE_LOADING_MODEL:
{
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
} break;
case SERVER_STATE_ERROR:
{
res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
} break;
}
};

// register Health API routes
svr->Get ("/health", handle_health);

auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
return false;
};
};

//
// Router
//

// register static assets routes
if (!sparams.public_path.empty()) {
// Set the base directory for serving static files
svr->set_base_dir(sparams.public_path);
}

// using embedded static files
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));

//
// Start the server
//
if (sparams.n_threads_http < 1) {
// +2 threads for monitoring endpoints
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
}
log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };

LOG_INFO("HTTP server listening", log_data);

// run the HTTP server in a thread - see comment below
std::thread t([&]() {
if (!svr->listen_after_bind()) {
state.store(SERVER_STATE_ERROR);
return 1;
}

return 0;
});

// load the model
if (!ctx_server.load_model(params)) {
state.store(SERVER_STATE_ERROR);
Expand Down Expand Up @@ -3110,59 +3212,6 @@ int main(int argc, char ** argv) {
// Route handlers (or controllers)
//

const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) {
server_state current_state = state.load();
switch (current_state) {
case SERVER_STATE_READY:
{
// request slots data using task queue
server_task task;
task.id = ctx_server.queue_tasks.get_new_id();
task.type = SERVER_TASK_TYPE_METRICS;
task.id_target = -1;

ctx_server.queue_results.add_waiting_task_id(task.id);
ctx_server.queue_tasks.post(task);

// get the result
server_task_result result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);

const int n_idle_slots = result.data["idle"];
const int n_processing_slots = result.data["processing"];

json health = {
{"status", "ok"},
{"slots_idle", n_idle_slots},
{"slots_processing", n_processing_slots}
};

res.status = 200; // HTTP OK
if (sparams.slots_endpoint && req.has_param("include_slots")) {
health["slots"] = result.data["slots"];
}

if (n_idle_slots == 0) {
health["status"] = "no slot available";
if (req.has_param("fail_on_no_slot")) {
res.status = 503; // HTTP Service Unavailable
}
}

res.set_content(health.dump(), "application/json");
break;
}
case SERVER_STATE_LOADING_MODEL:
{
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
} break;
case SERVER_STATE_ERROR:
{
res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
} break;
}
};

const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
if (!sparams.slots_endpoint) {
res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
Expand Down Expand Up @@ -3715,32 +3764,8 @@ int main(int argc, char ** argv) {
return res.set_content(root.dump(), "application/json; charset=utf-8");
};

auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
return false;
};
};

//
// Router
//

// register static assets routes
if (!sparams.public_path.empty()) {
// Set the base directory for serving static files
svr->set_base_dir(sparams.public_path);
}

// using embedded static files
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));

// register API routes
svr->Get ("/health", handle_health);
svr->Get ("/slots", handle_slots);
svr->Get ("/metrics", handle_metrics);
svr->Get ("/props", handle_props);
Expand All @@ -3756,33 +3781,12 @@ int main(int argc, char ** argv) {
svr->Post("/v1/embeddings", handle_embeddings);
svr->Post("/tokenize", handle_tokenize);
svr->Post("/detokenize", handle_detokenize);

if (!sparams.slot_save_path.empty()) {
// only enable slot endpoints if slot_save_path is set
svr->Post("/slots/:id_slot", handle_slots_action);
}

//
// Start the server
//
if (sparams.n_threads_http < 1) {
// +2 threads for monitoring endpoints
sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
}
log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };

LOG_INFO("HTTP server listening", log_data);

// run the HTTP server in a thread - see comment below
std::thread t([&]() {
if (!svr->listen_after_bind()) {
state.store(SERVER_STATE_ERROR);
return 1;
}

return 0;
});

ctx_server.queue_tasks.on_new_task(std::bind(
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
ctx_server.queue_tasks.on_finish_multitask(std::bind(
Expand Down