mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 02:44:36 +00:00
server : health endpoint configurable failure on no slot (#5594)
This commit is contained in:
parent
b9111bd209
commit
c0a8c6db37
@ -134,10 +134,11 @@ node index.js
|
|||||||
## API Endpoints
|
## API Endpoints
|
||||||
|
|
||||||
- **GET** `/health`: Returns the current state of the server:
|
- **GET** `/health`: Returns the current state of the server:
|
||||||
- `{"status": "loading model"}` if the model is still being loaded.
|
- 503 -> `{"status": "loading model"}` if the model is still being loaded.
|
||||||
- `{"status": "error"}` if the model failed to load.
|
- 500 -> `{"status": "error"}` if the model failed to load.
|
||||||
- `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below.
|
- 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below.
|
||||||
- `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available
|
- 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available.
|
||||||
|
- 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available.
|
||||||
|
|
||||||
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
|
- **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
|
||||||
|
|
||||||
|
@ -2582,17 +2582,13 @@ int main(int argc, char **argv)
|
|||||||
res.set_header("Access-Control-Allow-Headers", "*");
|
res.set_header("Access-Control-Allow-Headers", "*");
|
||||||
});
|
});
|
||||||
|
|
||||||
svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) {
|
svr.Get("/health", [&](const httplib::Request& req, httplib::Response& res) {
|
||||||
server_state current_state = state.load();
|
server_state current_state = state.load();
|
||||||
switch(current_state) {
|
switch(current_state) {
|
||||||
case SERVER_STATE_READY:
|
case SERVER_STATE_READY: {
|
||||||
if (llama.all_slots_are_idle) {
|
|
||||||
res.set_content(R"({"status": "ok"})", "application/json");
|
|
||||||
res.status = 200; // HTTP OK
|
|
||||||
} else {
|
|
||||||
int available_slots = 0;
|
int available_slots = 0;
|
||||||
int processing_slots = 0;
|
int processing_slots = 0;
|
||||||
for (llama_client_slot & slot : llama.slots) {
|
for (llama_client_slot &slot: llama.slots) {
|
||||||
if (slot.available()) {
|
if (slot.available()) {
|
||||||
available_slots++;
|
available_slots++;
|
||||||
} else {
|
} else {
|
||||||
@ -2612,10 +2608,14 @@ int main(int argc, char **argv)
|
|||||||
{"slots_idle", available_slots},
|
{"slots_idle", available_slots},
|
||||||
{"slots_processing", processing_slots}};
|
{"slots_processing", processing_slots}};
|
||||||
res.set_content(health.dump(), "application/json");
|
res.set_content(health.dump(), "application/json");
|
||||||
|
if (req.has_param("fail_on_no_slot")) {
|
||||||
res.status = 503; // HTTP Service Unavailable
|
res.status = 503; // HTTP Service Unavailable
|
||||||
|
} else {
|
||||||
|
res.status = 200; // HTTP OK
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case SERVER_STATE_LOADING_MODEL:
|
case SERVER_STATE_LOADING_MODEL:
|
||||||
res.set_content(R"({"status": "loading model"})", "application/json");
|
res.set_content(R"({"status": "loading model"})", "application/json");
|
||||||
res.status = 503; // HTTP Service Unavailable
|
res.status = 503; // HTTP Service Unavailable
|
||||||
|
Loading…
Reference in New Issue
Block a user