From c0a8c6db371cb3e4379900867b948879f5842201 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Tue, 20 Feb 2024 08:48:19 +0100 Subject: [PATCH] server : health endpoint configurable failure on no slot (#5594) --- examples/server/README.md | 9 ++++--- examples/server/server.cpp | 52 +++++++++++++++++++------------------- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 809e2d37c..f6b9c7402 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -134,10 +134,11 @@ node index.js ## API Endpoints - **GET** `/health`: Returns the current state of the server: - - `{"status": "loading model"}` if the model is still being loaded. - - `{"status": "error"}` if the model failed to load. - - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. - - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available + - 503 -> `{"status": "loading model"}` if the model is still being loaded. + - 500 -> `{"status": "error"}` if the model failed to load. + - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. + - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available. + - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available. - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 22c344dd4..23482ed95 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2582,40 +2582,40 @@ int main(int argc, char **argv) res.set_header("Access-Control-Allow-Headers", "*"); }); - svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) { + svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) { server_state current_state = state.load(); switch(current_state) { - case SERVER_STATE_READY: - if (llama.all_slots_are_idle) { - res.set_content(R"({"status": "ok"})", "application/json"); + case SERVER_STATE_READY: { + int available_slots = 0; + int processing_slots = 0; + for (llama_client_slot &slot: llama.slots) { + if (slot.available()) { + available_slots++; + } else { + processing_slots++; + } + } + if (available_slots > 0) { + json health = { + {"status", "ok"}, + {"slots_idle", available_slots}, + {"slots_processing", processing_slots}}; + res.set_content(health.dump(), "application/json"); res.status = 200; // HTTP OK } else { - int available_slots = 0; - int processing_slots = 0; - for (llama_client_slot & slot : llama.slots) { - if (slot.available()) { - available_slots++; - } else { - processing_slots++; - } - } - if (available_slots > 0) { - json health = { - {"status", "ok"}, - {"slots_idle", available_slots}, - {"slots_processing", processing_slots}}; - res.set_content(health.dump(), "application/json"); - res.status = 200; // HTTP OK - } else { - json health = { - {"status", "no slot available"}, - {"slots_idle", available_slots}, - {"slots_processing", processing_slots}}; - res.set_content(health.dump(), "application/json"); + json health = { + {"status", "no slot available"}, + {"slots_idle", available_slots}, + {"slots_processing", processing_slots}}; + res.set_content(health.dump(), "application/json"); + if (req.has_param("fail_on_no_slot")) { res.status = 503; // HTTP Service Unavailable + } else { + res.status = 200; // HTTP OK } } break; + } case SERVER_STATE_LOADING_MODEL: res.set_content(R"({"status": "loading model"})", "application/json"); res.status = 503; // HTTP Service Unavailable