From d9157cdf345bc0b878d20416044508f000a6d61b Mon Sep 17 00:00:00 2001
From: ManniX-ITA <20623405+mann1x@users.noreply.github.com>
Date: Wed, 17 Apr 2024 18:55:09 +0200
Subject: [PATCH 1/5] Update server.cpp example with correct startup sequence

The HTTP listener start and the health API endpoint are moved before the
model loading starts, so the server can correctly report that it is
loading the model.

---
 examples/server/server.cpp | 202 +++++++++++++++++++------------------
 1 file changed, 103 insertions(+), 99 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 634e653ada284..6dcdc7c8902f7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3009,6 +3009,108 @@ int main(int argc, char ** argv) {
         log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded";
     }

+    const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) {
+        server_state current_state = state.load();
+        switch (current_state) {
+            case SERVER_STATE_READY:
+                {
+                    // request slots data using task queue
+                    server_task task;
+                    task.id = ctx_server.queue_tasks.get_new_id();
+                    task.type = SERVER_TASK_TYPE_METRICS;
+                    task.id_target = -1;
+
+                    ctx_server.queue_results.add_waiting_task_id(task.id);
+                    ctx_server.queue_tasks.post(task);
+
+                    // get the result
+                    server_task_result result = ctx_server.queue_results.recv(task.id);
+                    ctx_server.queue_results.remove_waiting_task_id(task.id);
+
+                    const int n_idle_slots = result.data["idle"];
+                    const int n_processing_slots = result.data["processing"];
+
+                    json health = {
+                        {"status", "ok"},
+                        {"slots_idle", n_idle_slots},
+                        {"slots_processing", n_processing_slots}
+                    };
+
+                    res.status = 200; // HTTP OK
+                    if (sparams.slots_endpoint && req.has_param("include_slots")) {
+                        health["slots"] = result.data["slots"];
+                    }
+
+                    if (n_idle_slots == 0) {
+                        health["status"] = "no slot available";
+                        if (req.has_param("fail_on_no_slot")) {
+                            res.status = 503; // HTTP Service Unavailable
+                        }
+                    }
+
+                    res.set_content(health.dump(), "application/json");
+                    break;
+                }
+            case SERVER_STATE_LOADING_MODEL:
+                {
+                    res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+                } break;
+            case SERVER_STATE_ERROR:
+                {
+                    res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
+                } break;
+        }
+    };
+
+    // register Health API routes
+    svr->Get ("/health", handle_health);
+
+    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
+        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
+            res.set_content(reinterpret_cast<const char *>(content), len, mime_type);
+            return false;
+        };
+    };
+
+    //
+    // Router
+    //
+
+    // register static assets routes
+    if (!sparams.public_path.empty()) {
+        // Set the base directory for serving static files
+        svr->set_base_dir(sparams.public_path);
+    }
+
+    // using embedded static files
+    svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+    svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+    svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
+        json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+
+    //
+    // Start the server
+    //
+    if (sparams.n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
+    svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+
+    LOG_INFO("HTTP server listening", log_data);
+
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]() {
+        if (!svr->listen_after_bind()) {
+            state.store(SERVER_STATE_ERROR);
+            return 1;
+        }
+
+        return 0;
+    });
+
     // load the model
     if (!ctx_server.load_model(params)) {
         state.store(SERVER_STATE_ERROR);
@@ -3110,59 +3212,6 @@ int main(int argc, char ** argv) {
     // Route handlers (or controllers)
     //

-    const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) {
-        server_state current_state = state.load();
-        switch (current_state) {
-            case SERVER_STATE_READY:
-                {
-                    // request slots data using task queue
-                    server_task task;
-                    task.id = ctx_server.queue_tasks.get_new_id();
-                    task.type = SERVER_TASK_TYPE_METRICS;
-                    task.id_target = -1;
-
-                    ctx_server.queue_results.add_waiting_task_id(task.id);
-                    ctx_server.queue_tasks.post(task);
-
-                    // get the result
-                    server_task_result result = ctx_server.queue_results.recv(task.id);
-                    ctx_server.queue_results.remove_waiting_task_id(task.id);
-
-                    const int n_idle_slots = result.data["idle"];
-                    const int n_processing_slots = result.data["processing"];
-
-                    json health = {
-                        {"status", "ok"},
-                        {"slots_idle", n_idle_slots},
-                        {"slots_processing", n_processing_slots}
-                    };
-
-                    res.status = 200; // HTTP OK
-                    if (sparams.slots_endpoint && req.has_param("include_slots")) {
-                        health["slots"] = result.data["slots"];
-                    }
-
-                    if (n_idle_slots == 0) {
-                        health["status"] = "no slot available";
-                        if (req.has_param("fail_on_no_slot")) {
-                            res.status = 503; // HTTP Service Unavailable
-                        }
-                    }
-
-                    res.set_content(health.dump(), "application/json");
-                    break;
-                }
-            case SERVER_STATE_LOADING_MODEL:
-                {
-                    res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
-                } break;
-            case SERVER_STATE_ERROR:
-                {
-                    res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
-                } break;
-        }
-    };
-
     const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) {
         if (!sparams.slots_endpoint) {
             res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED));
@@ -3715,32 +3764,8 @@ int main(int argc, char ** argv) {
         return res.set_content(root.dump(), "application/json; charset=utf-8");
     };

-    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
-        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
-            res.set_content(reinterpret_cast<const char *>(content), len, mime_type);
-            return false;
-        };
-    };
-
-    //
-    // Router
-    //
-
-    // register static assets routes
-    if (!sparams.public_path.empty()) {
-        // Set the base directory for serving static files
-        svr->set_base_dir(sparams.public_path);
-    }
-
-    // using embedded static files
-    svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
-    svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
-        json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
-
     // register API routes
-    svr->Get ("/health", handle_health);
     svr->Get ("/slots", handle_slots);
     svr->Get ("/metrics", handle_metrics);
     svr->Get ("/props", handle_props);
@@ -3756,33 +3781,12 @@ int main(int argc, char ** argv) {
     svr->Post("/v1/embeddings", handle_embeddings);
     svr->Post("/tokenize", handle_tokenize);
     svr->Post("/detokenize", handle_detokenize);
+
     if (!sparams.slot_save_path.empty()) {
         // only enable slot endpoints if slot_save_path is set
         svr->Post("/slots/:id_slot", handle_slots_action);
     }

-    //
-    // Start the server
-    //
-    if (sparams.n_threads_http < 1) {
-        // +2 threads for monitoring endpoints
-        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
-    }
-    log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
-    svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
-
-    LOG_INFO("HTTP server listening", log_data);
-
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]() {
-        if (!svr->listen_after_bind()) {
-            state.store(SERVER_STATE_ERROR);
-            return 1;
-        }
-
-        return 0;
-    });
-
     ctx_server.queue_tasks.on_new_task(std::bind(
         &server_context::process_single_task, &ctx_server, std::placeholders::_1));
     ctx_server.queue_tasks.on_finish_multitask(std::bind(

From 52a4d5974739160c2bdb5e2cd0893a15cb19f3b4 Mon Sep 17 00:00:00 2001
From: ManniX-ITA <20623405+mann1x@users.noreply.github.com>
Date: Thu, 18 Apr 2024 18:14:29 +0200
Subject: [PATCH 2/5] Moved endpoints registration before listener, plus fixes

- Moved endpoints registration before the HTTP listener starts
- Endpoints return the correct error when the model is loading or
  failed to load
- Server exits if it fails to bind the port

---
 examples/server/server.cpp | 271 +++++++++++++++++++++++--------------
 1 file changed, 166 insertions(+), 105 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 6dcdc7c8902f7..cc455d8ce9150 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2282,6 +2282,17 @@ struct server_context {
             {"size", llama_model_size (model)},
         };
     }
+
+    json empty_model_meta() const {
+        return json {
+            {"vocab_type", llama_vocab_type (0)},
+            {"n_vocab", llama_n_vocab (0)},
+            {"n_ctx_train", llama_n_ctx_train (0)},
+            {"n_embd", llama_n_embd (0)},
+            {"n_params", llama_model_n_params(0)},
+            {"size", llama_model_size (0)},
+        };
+    }
 };

 static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
@@ -3062,92 +3073,6 @@ int main(int argc, char ** argv) {
         }
     };

-    // register Health API routes
-    svr->Get ("/health", handle_health);
-
-    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
-        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
-            res.set_content(reinterpret_cast<const char *>(content), len, mime_type);
-            return false;
-        };
-    };
-
-    //
-    // Router
-    //
-
-    // register static assets routes
-    if (!sparams.public_path.empty()) {
-        // Set the base directory for serving static files
-        svr->set_base_dir(sparams.public_path);
-    }
-
-    // using embedded static files
-    svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
-    svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
-    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
-        json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
-
-    //
-    // Start the server
-    //
-    if (sparams.n_threads_http < 1) {
-        // +2 threads for monitoring endpoints
-        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
-    }
-    log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
-    svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
-
-    LOG_INFO("HTTP server listening", log_data);
-
-    // run the HTTP server in a thread - see comment below
-    std::thread t([&]() {
-        if (!svr->listen_after_bind()) {
-            state.store(SERVER_STATE_ERROR);
-            return 1;
-        }
-
-        return 0;
-    });
-
-    // load the model
-    if (!ctx_server.load_model(params)) {
-        state.store(SERVER_STATE_ERROR);
-        return 1;
-    } else {
-        ctx_server.init();
-        state.store(SERVER_STATE_READY);
-    }
-
-    LOG_INFO("model loaded", {});
-
-    const auto model_meta = ctx_server.model_meta();
-
-    // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
-    if (sparams.chat_template.empty()) {
-        if (!ctx_server.validate_model_chat_template()) {
-            LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
-            sparams.chat_template = "chatml";
-        }
-    }
-
-    // print sample chat example to make it clear which template is used
-    {
-        json chat;
-        chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
-        chat.push_back({{"role", "user"}, {"content", "Hello"}});
-        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
-        chat.push_back({{"role", "user"}, {"content", "How are you?"}});
-
-        const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat);
-
-        LOG_INFO("chat template", {
-            {"chat_example", chat_example},
-            {"built_in", sparams.chat_template.empty()},
-        });
-    }
-
     //
     // Middlewares
     //
@@ -3208,6 +3133,52 @@ int main(int argc, char ** argv) {
         return httplib::Server::HandlerResponse::Unhandled;
     });

+    auto middleware_model_loading = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res, server_state current_state) {
+        static const std::set<std::string> invalid_endpoints = {
+            "/slots",
+            "/metrics",
+            "/props",
+            "/v1/models",
+            "/completion",
+            "/completions",
+            "/v1/completions",
+            "/chat/completions",
+            "/v1/chat/completions",
+            "/infill",
+            "/tokenize",
+            "/detokenize",
+            "/embedding",
+            "/embeddings",
+            "/v1/embeddings",
+        };
+
+        // If path is not in invalid_endpoints list, skip validation
+        if (invalid_endpoints.find(req.path) == invalid_endpoints.end()) {
+            return true;
+        }
+        switch (current_state) {
+            case SERVER_STATE_LOADING_MODEL:
+                {
+                    res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+                    return false;
+                } break;
+            case SERVER_STATE_ERROR:
+                {
+                    res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
+                    return false;
+                } break;
+        }
+        return true;
+    };
+
+    svr->set_pre_routing_handler([&state, &middleware_model_loading](const httplib::Request & req, httplib::Response & res) {
+        server_state current_state = state.load();
+        if (!middleware_model_loading(req, res, current_state)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        return httplib::Server::HandlerResponse::Unhandled;
+    });
+
     //
     // Route handlers (or controllers)
     //
@@ -3531,25 +3502,6 @@ int main(int argc, char ** argv) {
         }
     };

-    const auto handle_models = [&params, &model_meta](const httplib::Request & req, httplib::Response & res) {
-        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
-
-        json models = {
-            {"object", "list"},
-            {"data", {
-                {
-                    {"id", params.model_alias},
-                    {"object", "model"},
-                    {"created", std::time(0)},
-                    {"owned_by", "llamacpp"},
-                    {"meta", model_meta}
-                },
-            }}
-        };
-
-        res.set_content(models.dump(), "application/json; charset=utf-8");
-    };
-
     const auto handle_chat_completions = [&ctx_server, &sparams, &res_error](const httplib::Request & req, httplib::Response & res) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
         json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template);
@@ -3763,9 +3715,30 @@ int main(int argc, char ** argv) {
             : responses[0];
         return res.set_content(root.dump(), "application/json; charset=utf-8");
     };
+
+    const auto handle_models = [&](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        json model_meta = ctx_server.model_meta();
+
+        json models = {
+            {"object", "list"},
+            {"data", {
+                {
+                    {"id", params.model_alias},
+                    {"object", "model"},
+                    {"created", std::time(0)},
+                    {"owned_by", "llamacpp"},
+                    {"meta", model_meta}
+                },
+            }}
+        };
+
+        res.set_content(models.dump(), "application/json; charset=utf-8");
+    };

     // register API routes
+    svr->Get ("/health", handle_health);
     svr->Get ("/slots", handle_slots);
     svr->Get ("/metrics", handle_metrics);
     svr->Get ("/props", handle_props);
@@ -3787,6 +3760,94 @@ int main(int argc, char ** argv) {
         svr->Post("/slots/:id_slot", handle_slots_action);
     }

+    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
+        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
+            res.set_content(reinterpret_cast<const char *>(content), len, mime_type);
+            return false;
+        };
+    };
+
+    //
+    // Router
+    //
+
+    // register static assets routes
+    if (!sparams.public_path.empty()) {
+        // Set the base directory for serving static files
+        svr->set_base_dir(sparams.public_path);
+    }
+
+    // using embedded static files
+    svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
+    svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));
+    svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
+    svr->Get("/json-schema-to-grammar.mjs", handle_static_file(
+        json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
+
+    //
+    // Start the server
+    //
+    if (sparams.n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
+    svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+
+    LOG_INFO("HTTP server listening", log_data);
+
+    // run the HTTP server in a thread - see comment below
+    std::thread t([&]() {
+        if (!svr->listen_after_bind()) {
+            state.store(SERVER_STATE_ERROR);
+            return 1;
+        }
+
+        return 0;
+    });
+
+    if (state.load() == SERVER_STATE_ERROR) {
+        // HTTP Server could not bind the port
+        return 1;
+    }
+
+    // load the model
+    if (!ctx_server.load_model(params)) {
+        state.store(SERVER_STATE_ERROR);
+        return 1;
+    } else {
+        ctx_server.init();
+        state.store(SERVER_STATE_READY);
+    }
+
+    LOG_INFO("model loaded", {});
+
+    const auto model_meta = ctx_server.model_meta();
+
+    // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
+    if (sparams.chat_template.empty()) {
+        if (!ctx_server.validate_model_chat_template()) {
+            LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {});
+            sparams.chat_template = "chatml";
+        }
+    }
+
+    // print sample chat example to make it clear which template is used
+    {
+        json chat;
+        chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
+        chat.push_back({{"role", "user"}, {"content", "Hello"}});
+        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
+        chat.push_back({{"role", "user"}, {"content", "How are you?"}});
+
+        const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat);
+
+        LOG_INFO("chat template", {
+            {"chat_example", chat_example},
+            {"built_in", sparams.chat_template.empty()},
+        });
+    }
+
     ctx_server.queue_tasks.on_new_task(std::bind(
         &server_context::process_single_task, &ctx_server, std::placeholders::_1));
     ctx_server.queue_tasks.on_finish_multitask(std::bind(

From b9613ef11a748ba3b8961fd0504aca89134d705e Mon Sep 17 00:00:00 2001
From: ManniX-ITA <20623405+mann1x@users.noreply.github.com>
Date: Thu, 18 Apr 2024 22:14:33 +0200
Subject: [PATCH 3/5] Removed leftover

---
 examples/server/server.cpp | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index cc455d8ce9150..2cc8025fe149e 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2282,17 +2282,6 @@ struct server_context {
             {"size", llama_model_size (model)},
         };
     }
-
-    json empty_model_meta() const {
-        return json {
-            {"vocab_type", llama_vocab_type (0)},
-            {"n_vocab", llama_n_vocab (0)},
-            {"n_ctx_train", llama_n_ctx_train (0)},
-            {"n_embd", llama_n_embd (0)},
-            {"n_params", llama_model_n_params(0)},
-            {"size", llama_model_size (0)},
-        };
-    }
 };

 static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {

From 61b483d3a8482334442a83a29164f7633bac5c3e Mon Sep 17 00:00:00 2001
From: ManniX-ITA <20623405+mann1x@users.noreply.github.com>
Date: Fri, 19 Apr 2024 19:03:23 +0200
Subject: [PATCH 4/5] Update server.cpp after code review

https://github.com/ggerganov/llama.cpp/pull/6739/files/b9613ef11a748ba3b8961fd0504aca89134d705e

---
 examples/server/server.cpp | 53 +++++++------------------------------
 1 file changed, 10 insertions(+), 43 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 2cc8025fe149e..4286682d4b648 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2282,6 +2282,7 @@ struct server_context {
             {"size", llama_model_size (model)},
         };
     }
+
 };

 static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) {
@@ -3114,37 +3115,13 @@ int main(int argc, char ** argv) {
         return false;
     };

-    // register server middlewares
-    svr->set_pre_routing_handler([&middleware_validate_api_key](const httplib::Request & req, httplib::Response & res) {
-        if (!middleware_validate_api_key(req, res)) {
-            return httplib::Server::HandlerResponse::Handled;
-        }
-        return httplib::Server::HandlerResponse::Unhandled;
-    });
-
     auto middleware_model_loading = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res, server_state current_state) {
-        static const std::set<std::string> invalid_endpoints = {
-            "/slots",
-            "/metrics",
-            "/props",
-            "/v1/models",
-            "/completion",
-            "/completions",
-            "/v1/completions",
-            "/chat/completions",
-            "/v1/chat/completions",
-            "/infill",
-            "/tokenize",
-            "/detokenize",
-            "/embedding",
-            "/embeddings",
-            "/v1/embeddings",
-        };
-
-        // If path is not in invalid_endpoints list, skip validation
-        if (invalid_endpoints.find(req.path) == invalid_endpoints.end()) {
+        // Skip validation if the path is a health check
+        if (req.path == "/health" || req.path == "/v1/health") {
             return true;
         }
+
         switch (current_state) {
             case SERVER_STATE_LOADING_MODEL:
                 {
@@ -3160,9 +3137,10 @@ int main(int argc, char ** argv) {
         return true;
     };

-    svr->set_pre_routing_handler([&state, &middleware_model_loading](const httplib::Request & req, httplib::Response & res) {
+    // register server middlewares
+    svr->set_pre_routing_handler([&middleware_validate_api_key, &state, &middleware_model_loading](const httplib::Request & req, httplib::Response & res) {
         server_state current_state = state.load();
-        if (!middleware_model_loading(req, res, current_state)) {
+        if (!middleware_model_loading(req, res, current_state) || !middleware_validate_api_key(req, res)) {
             return httplib::Server::HandlerResponse::Handled;
         }
         return httplib::Server::HandlerResponse::Unhandled;
     });
@@ -3705,7 +3683,7 @@ int main(int argc, char ** argv) {
         return res.set_content(root.dump(), "application/json; charset=utf-8");
     };

-    const auto handle_models = [&](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_models = [&params, &ctx_server](const httplib::Request & req, httplib::Response & res) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
         json model_meta = ctx_server.model_meta();

@@ -3728,6 +3706,7 @@ int main(int argc, char ** argv) {

     // register API routes
     svr->Get ("/health", handle_health);
+    svr->Get ("/v1/health", handle_health);
     svr->Get ("/slots", handle_slots);
     svr->Get ("/metrics", handle_metrics);
     svr->Get ("/props", handle_props);
@@ -3756,11 +3735,6 @@ int main(int argc, char ** argv) {
         };
     };

-    //
-    // Router
-    //
-
-    // register static assets routes
     if (!sparams.public_path.empty()) {
         // Set the base directory for serving static files
         svr->set_base_dir(sparams.public_path);
     }
@@ -3794,12 +3768,7 @@ int main(int argc, char ** argv) {
         return 0;
     });
-
-    if (state.load() == SERVER_STATE_ERROR) {
-        // HTTP Server could not bind the port
-        return 1;
-    }
-
+
     // load the model
     if (!ctx_server.load_model(params)) {
         state.store(SERVER_STATE_ERROR);
@@ -3811,8 +3780,6 @@ int main(int argc, char ** argv) {

     LOG_INFO("model loaded", {});

-    const auto model_meta = ctx_server.model_meta();
-
     // if a custom chat template is not supplied, we will use the one that comes with the model (if any)
     if (sparams.chat_template.empty()) {
         if (!ctx_server.validate_model_chat_template()) {

From 942f023930ee7b5877034de52fabd3e67aed3589 Mon Sep 17 00:00:00 2001
From: ManniX-ITA <20623405+mann1x@users.noreply.github.com>
Date: Sat, 20 Apr 2024 09:05:57 +0200
Subject: [PATCH 5/5] Fixes unhandled ready state with a default switch case

Co-authored-by: Xuan Son Nguyen
---
 examples/server/server.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 4286682d4b648..a19d4babd9acb 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3133,6 +3133,8 @@ int main(int argc, char ** argv) {
                     res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER));
                     return false;
                 } break;
+            default:
+                break;
         }
         return true;
     };
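
A note on exercising the new startup sequence end to end: with the listener
started before the model loads, a client can poll /health and tell "still
loading" apart from "ready" and "failed". The sketch below is a minimal
standalone probe using cpp-httplib, the same library the server embeds. The
host, port, and poll interval are placeholder assumptions, and the 503/500
status codes follow the ERROR_TYPE_UNAVAILABLE/ERROR_TYPE_SERVER mapping
used by res_error in the patches above; adjust to your setup.

#include <chrono>
#include <iostream>
#include <thread>

#include "httplib.h"

int main() {
    httplib::Client cli("localhost", 8080); // assumed server host/port

    for (;;) {
        auto res = cli.Get("/health");
        if (!res) {
            // no listener yet; before these patches this was the only signal
            // clients got while the model was loading
            std::cout << "server not reachable, retrying\n";
        } else if (res->status == 503) {
            // listener is up, model still loading ("Loading model")
            std::cout << "loading: " << res->body << "\n";
        } else if (res->status == 200) {
            // SERVER_STATE_READY: slot counts are included in the body
            std::cout << "ready: " << res->body << "\n";
            return 0;
        } else {
            // e.g. 500 when the model failed to load
            std::cout << "error " << res->status << ": " << res->body << "\n";
            return 1;
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(250));
    }
}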