server : health endpoint configurable failure on no slot (#5594)

Pierrick Hymbert 2024-02-20 08:48:19 +01:00 committed by GitHub
parent b9111bd209
commit c0a8c6db37
2 changed files with 31 additions and 30 deletions


@@ -134,10 +134,11 @@ node index.js
 ## API Endpoints
 - **GET** `/health`: Returns the current state of the server:
-  - `{"status": "loading model"}` if the model is still being loaded.
-  - `{"status": "error"}` if the model failed to load.
-  - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
-  - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available
+  - 503 -> `{"status": "loading model"}` if the model is still being loaded.
+  - 500 -> `{"status": "error"}` if the model failed to load.
+  - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2}` if the model is successfully loaded and the server is ready for further requests mentioned below.
+  - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
+  - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.
 - **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
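For a quick manual check of the two busy-server behaviours documented above, the endpoint can be probed once without and once with the `fail_on_no_slot` query parameter. The sketch below uses cpp-httplib, the same HTTP library the server is built on, but the standalone client program and the assumption of a server already listening on `localhost:8080` are illustrative only, not part of this commit:

```cpp
// Hypothetical client-side probe of the /health endpoint.
// Assumes a llama.cpp server is already running on localhost:8080.
#include <cstdio>
#include <httplib.h>

int main() {
    httplib::Client cli("localhost", 8080);

    // Default behaviour: even with every slot busy, the server answers 200
    // with {"status": "no slot available", ...}.
    if (auto res = cli.Get("/health")) {
        std::printf("GET /health                 -> %d %s\n", res->status, res->body.c_str());
    }

    // Opt-in behaviour: with ?fail_on_no_slot, a fully busy server answers 503,
    // so a load balancer can treat "no free slot" as unhealthy.
    if (auto res = cli.Get("/health?fail_on_no_slot")) {
        std::printf("GET /health?fail_on_no_slot -> %d %s\n", res->status, res->body.c_str());
    }
    return 0;
}
```

Making the 503 an opt-in keeps the default `/health` semantics backwards compatible for existing monitoring setups.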


@@ -2582,40 +2582,40 @@ int main(int argc, char **argv)
         res.set_header("Access-Control-Allow-Headers", "*");
     });
-    svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
+    svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
         server_state current_state = state.load();
         switch(current_state) {
-            case SERVER_STATE_READY:
-                if (llama.all_slots_are_idle) {
-                    res.set_content(R"({"status": "ok"})", "application/json");
-                    res.status = 200; // HTTP OK
-                } else {
-                    int available_slots = 0;
-                    int processing_slots = 0;
-                    for (llama_client_slot & slot : llama.slots) {
-                        if (slot.available()) {
-                            available_slots++;
-                        } else {
-                            processing_slots++;
-                        }
-                    }
-                    if (available_slots > 0) {
-                        json health = {
-                            {"status", "ok"},
-                            {"slots_idle", available_slots},
-                            {"slots_processing", processing_slots}};
-                        res.set_content(health.dump(), "application/json");
-                        res.status = 200; // HTTP OK
-                    } else {
-                        json health = {
-                            {"status", "no slot available"},
-                            {"slots_idle", available_slots},
-                            {"slots_processing", processing_slots}};
-                        res.set_content(health.dump(), "application/json");
-                        res.status = 503; // HTTP Service Unavailable
-                    }
-                }
-                break;
+            case SERVER_STATE_READY: {
+                int available_slots = 0;
+                int processing_slots = 0;
+                for (llama_client_slot &slot: llama.slots) {
+                    if (slot.available()) {
+                        available_slots++;
+                    } else {
+                        processing_slots++;
+                    }
+                }
+                if (available_slots > 0) {
+                    json health = {
+                        {"status", "ok"},
+                        {"slots_idle", available_slots},
+                        {"slots_processing", processing_slots}};
+                    res.set_content(health.dump(), "application/json");
+                    res.status = 200; // HTTP OK
+                } else {
+                    json health = {
+                        {"status", "no slot available"},
+                        {"slots_idle", available_slots},
+                        {"slots_processing", processing_slots}};
+                    res.set_content(health.dump(), "application/json");
+                    if (req.has_param("fail_on_no_slot")) {
+                        res.status = 503; // HTTP Service Unavailable
+                    } else {
+                        res.status = 200; // HTTP OK
+                    }
+                }
+                break;
+            }
             case SERVER_STATE_LOADING_MODEL:
                 res.set_content(R"({"status": "loading model"})", "application/json");
                 res.status = 503; // HTTP Service Unavailable
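The server-side change boils down to one decision: when the server is ready but no slot is idle, report 503 only if the request carried `fail_on_no_slot`, otherwise keep reporting 200. Below is a minimal sketch of that decision pulled out into a standalone helper so the three cases are easy to see; the helper name `pick_health_status` is hypothetical and not part of the server code.

```cpp
#include <cassert>

// Hypothetical helper mirroring the status selection in the /health handler:
// a ready server with idle slots is always 200; a fully busy server is 503
// only when the caller explicitly asked for failure via fail_on_no_slot.
static int pick_health_status(int available_slots, bool fail_on_no_slot) {
    if (available_slots > 0) {
        return 200;                        // at least one idle slot
    }
    return fail_on_no_slot ? 503 : 200;    // busy: 503 only on request
}

int main() {
    assert(pick_health_status(1, false) == 200); // idle slot available
    assert(pick_health_status(0, false) == 200); // busy, default stays OK
    assert(pick_health_status(0, true)  == 503); // busy, opt-in failure
    return 0;
}
```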