From e75c6279d1c8e7abb82a331f5de7124eed402de2 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 18 Feb 2024 17:31:28 +0100 Subject: [PATCH] server : enhanced health endpoint (#5548) * server: enrich health endpoint with available slots, return 503 if no slots are available * server: document new status no slot available in the README.md --- examples/server/README.md | 1 + examples/server/server.cpp | 31 +++++++++++++++++++++++++++++-- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index fe5cd8d5d..5e3ae833b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -136,6 +136,7 @@ node index.js - `{"status": "loading model"}` if the model is still being loaded. - `{"status": "error"}` if the model failed to load. - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. + - `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 7aa706e95..8145af867 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2578,8 +2578,35 @@ int main(int argc, char **argv) server_state current_state = state.load(); switch(current_state) { case SERVER_STATE_READY: - res.set_content(R"({"status": "ok"})", "application/json"); - res.status = 200; // HTTP OK + if (llama.all_slots_are_idle) { + res.set_content(R"({"status": "ok"})", "application/json"); + res.status = 200; // HTTP OK + } else { + int available_slots = 0; + int processing_slots = 0; + for (llama_client_slot & slot : llama.slots) { + if (slot.available()) { + available_slots++; + } else { + processing_slots++; + } + } + if (available_slots > 0) { + json health = { + {"status", "ok"}, + {"slots_idle", available_slots}, + {"slots_processing", processing_slots}}; + res.set_content(health.dump(), "application/json"); + res.status = 200; // HTTP OK + } else { + json health = { + {"status", "no slot available"}, + {"slots_idle", available_slots}, + {"slots_processing", processing_slots}}; + res.set_content(health.dump(), "application/json"); + res.status = 503; // HTTP Service Unavailable + } + } break; case SERVER_STATE_LOADING_MODEL: res.set_content(R"({"status": "loading model"})", "application/json");