Skip to content

Commit e75c627

Browse files
authored
server : enhanced health endpoint (#5548)
* server: enrich health endpoint with available slots, return 503 if no slots are available * server: document new status "no slot available" in the README.md
1 parent 36376ab commit e75c627

File tree

2 files changed

+30
-2
lines changed

2 files changed

+30
-2
lines changed

examples/server/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ node index.js
136136
- `{"status": "loading model"}` if the model is still being loaded.
137137
- `{"status": "error"}` if the model failed to load.
138138
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
139+
- `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
139140

140141
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
141142

examples/server/server.cpp

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2578,8 +2578,35 @@ int main(int argc, char **argv)
25782578
server_state current_state = state.load();
25792579
switch(current_state) {
25802580
case SERVER_STATE_READY:
2581-
res.set_content(R"({"status": "ok"})", "application/json");
2582-
res.status = 200; // HTTP OK
2581+
if (llama.all_slots_are_idle) {
2582+
res.set_content(R"({"status": "ok"})", "application/json");
2583+
res.status = 200; // HTTP OK
2584+
} else {
2585+
int available_slots = 0;
2586+
int processing_slots = 0;
2587+
for (llama_client_slot & slot : llama.slots) {
2588+
if (slot.available()) {
2589+
available_slots++;
2590+
} else {
2591+
processing_slots++;
2592+
}
2593+
}
2594+
if (available_slots > 0) {
2595+
json health = {
2596+
{"status", "ok"},
2597+
{"slots_idle", available_slots},
2598+
{"slots_processing", processing_slots}};
2599+
res.set_content(health.dump(), "application/json");
2600+
res.status = 200; // HTTP OK
2601+
} else {
2602+
json health = {
2603+
{"status", "no slot available"},
2604+
{"slots_idle", available_slots},
2605+
{"slots_processing", processing_slots}};
2606+
res.set_content(health.dump(), "application/json");
2607+
res.status = 503; // HTTP Service Unavailable
2608+
}
2609+
}
25832610
break;
25842611
case SERVER_STATE_LOADING_MODEL:
25852612
res.set_content(R"({"status": "loading model"})", "application/json");

0 commit comments

Comments
 (0)