Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-25 02:44:36 +00:00)
server: health: fix race condition on slots data using tasks queue (#5634)
* server: health: fix race condition on slots data using tasks queue
* server: health:
  * include_slots only if slots_endpoint
  * fix compile warning task.target_id not initialized.
This commit is contained in:
parent a00a35cef9
commit 1ecea255eb
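For context on the race being fixed: before this change, the `/health` and `/slots` handlers iterated `llama.slots` directly on the HTTP thread while the main generation loop could be mutating the same slots. The commit instead posts a `TASK_TYPE_SLOTS_DATA` task and waits for the result, so the snapshot is built on the thread that owns the slots. Below is a minimal, self-contained sketch of that pattern only; the names (`SlotsTaskQueue`, `SlotsSnapshot`, `request_snapshot`) are illustrative and are not the actual llama.cpp types.

// Sketch only: shared state is read by posting a task to the owning thread
// and blocking on the result, instead of touching it from the HTTP thread.
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <deque>
#include <map>
#include <mutex>
#include <thread>
#include <vector>

struct SlotsSnapshot {
    int idle       = 0;
    int processing = 0;
};

class SlotsTaskQueue {
public:
    // Called from HTTP threads: enqueue a request and block until the
    // owning thread has produced a snapshot for this task id.
    SlotsSnapshot request_snapshot() {
        std::unique_lock<std::mutex> lock(mutex_);
        const int id = next_id_++;
        pending_.push_back(id);
        cv_.wait(lock, [&] { return results_.count(id) != 0; });
        SlotsSnapshot snapshot = results_[id];
        results_.erase(id);
        return snapshot;
    }

    // Called from the owning thread only: answer every pending request
    // using slot state that no other thread reads directly.
    void process_pending(const std::vector<bool> &slot_busy) {
        std::lock_guard<std::mutex> lock(mutex_);
        for (int id : pending_) {
            SlotsSnapshot snapshot;
            for (bool busy : slot_busy) {
                (busy ? snapshot.processing : snapshot.idle)++;
            }
            results_[id] = snapshot;
        }
        pending_.clear();
        cv_.notify_all();
    }

private:
    std::mutex                   mutex_;
    std::condition_variable      cv_;
    std::deque<int>              pending_;
    std::map<int, SlotsSnapshot> results_;
    int                          next_id_ = 0;
};

int main() {
    SlotsTaskQueue queue;
    std::vector<bool> slot_busy = {true, false, false}; // owned by the main loop
    std::atomic<bool> done{false};

    // "HTTP" thread: asks for a snapshot instead of reading slot_busy itself.
    std::thread http([&] {
        SlotsSnapshot s = queue.request_snapshot();
        std::printf("slots_idle=%d slots_processing=%d\n", s.idle, s.processing);
        done = true;
    });

    // Main loop: services snapshot requests between its own work items.
    while (!done) {
        queue.process_pending(slot_busy);
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
    }
    http.join();
    return 0;
}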
@@ -140,6 +140,8 @@ node index.js
   - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available.
   - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available.
+
+  If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set.
 
 - **POST** `/completion`: Given a `prompt`, it returns the predicted completion.
 
     *Options:*
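As a rough usage sketch (not part of the commit), the query parameters documented above can be exercised with cpp-httplib, which the server example already ships with; the host, port, and exact response bodies shown in comments are assumptions based on the README text.

// Hypothetical client for the /health endpoint described above.
#include <iostream>
#include "httplib.h"

int main() {
    httplib::Client cli("localhost", 8080); // assumed default host/port

    // Basic check: 200 with {"status": "ok", "slots_idle": ..., "slots_processing": ...}
    if (auto res = cli.Get("/health")) {
        std::cout << res->status << " " << res->body << "\n";
    }

    // Ask for 503 instead of 200 when no slot is available.
    if (auto res = cli.Get("/health?fail_on_no_slot=1")) {
        std::cout << res->status << " " << res->body << "\n";
    }

    // Include per-slot data; not returned if the server was started with --slots-endpoint-disable.
    if (auto res = cli.Get("/health?include_slots=1")) {
        std::cout << res->status << " " << res->body << "\n";
    }
    return 0;
}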
@@ -1394,6 +1394,46 @@ struct llama_server_context
             case TASK_TYPE_NEXT_RESPONSE: {
                 // do nothing
             } break;
+            case TASK_TYPE_SLOTS_DATA: {
+                json slots_data = json::array();
+                int n_idle_slots = 0;
+                int n_processing_slots = 0;
+
+                for (llama_client_slot &slot: slots) {
+                    if (slot.available()) {
+                        n_idle_slots++;
+                    } else {
+                        n_processing_slots++;
+                    }
+                    json slot_data = get_formated_generation(slot);
+                    slot_data["id"] = slot.id;
+                    slot_data["task_id"] = slot.task_id;
+                    slot_data["state"] = slot.state;
+                    slot_data["prompt"] = slot.prompt;
+                    slot_data["next_token"] = {
+                            {"has_next_token",       slot.has_next_token},
+                            {"n_remain",             slot.n_remaining},
+                            {"num_tokens_predicted", slot.n_decoded},
+                            {"stopped_eos",          slot.stopped_eos},
+                            {"stopped_word",         slot.stopped_word},
+                            {"stopped_limit",        slot.stopped_limit},
+                            {"stopping_word",        slot.stopping_word},
+                    };
+                    slots_data.push_back(slot_data);
+                }
+                LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots);
+                task_result res;
+                res.id = task.id;
+                res.multitask_id = task.multitask_id;
+                res.stop = true;
+                res.error = false;
+                res.result_json = {
+                        { "idle",       n_idle_slots },
+                        { "processing", n_processing_slots },
+                        { "slots",      slots_data }
+                };
+                queue_results.send(res);
+            } break;
         }
     }
@@ -2557,34 +2597,38 @@ int main(int argc, char **argv)
         server_state current_state = state.load();
         switch(current_state) {
             case SERVER_STATE_READY: {
-                int available_slots = 0;
-                int processing_slots = 0;
-                for (llama_client_slot &slot: llama.slots) {
-                    if (slot.available()) {
-                        available_slots++;
-                    } else {
-                        processing_slots++;
-                    }
-                }
-                if (available_slots > 0) {
-                    json health = {
-                            {"status",           "ok"},
-                            {"slots_idle",       available_slots},
-                            {"slots_processing", processing_slots}};
-                    res.set_content(health.dump(), "application/json");
-                    res.status = 200; // HTTP OK
-                } else {
-                    json health = {
-                            {"status",           "no slot available"},
-                            {"slots_idle",       available_slots},
-                            {"slots_processing", processing_slots}};
-                    res.set_content(health.dump(), "application/json");
-                    if (req.has_param("fail_on_no_slot")) {
-                        res.status = 503; // HTTP Service Unavailable
-                    } else {
-                        res.status = 200; // HTTP OK
-                    }
-                }
+                // request slots data using task queue
+                task_server task;
+                task.id = llama.queue_tasks.get_new_id();
+                task.type = TASK_TYPE_SLOTS_DATA;
+                task.target_id = -1;
+
+                llama.queue_results.add_waiting_task_id(task.id);
+                llama.queue_tasks.post(task);
+
+                // get the result
+                task_result result = llama.queue_results.recv(task.id);
+                llama.queue_results.remove_waiting_task_id(task.id);
+
+                int n_idle_slots = result.result_json["idle"];
+                int n_processing_slots = result.result_json["processing"];
+
+                json health = {
+                        {"status",           "ok"},
+                        {"slots_idle",       n_idle_slots},
+                        {"slots_processing", n_processing_slots}};
+                res.status = 200; // HTTP OK
+                if (sparams.slots_endpoint && req.has_param("include_slots")) {
+                    health["slots"] = result.result_json["slots"];
+                }
+
+                if (n_idle_slots == 0) {
+                    health["status"] = "no slot available";
+                    if (req.has_param("fail_on_no_slot")) {
+                        res.status = 503; // HTTP Service Unavailable
+                    }
+                }
+                res.set_content(health.dump(), "application/json");
                 break;
             }
             case SERVER_STATE_LOADING_MODEL:
@@ -2600,26 +2644,20 @@ int main(int argc, char **argv)
 
     if (sparams.slots_endpoint) {
         svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
-            json slots;
-            for (llama_client_slot & slot : llama.slots) {
-                json slot_data = llama.get_formated_generation(slot);
-                slot_data["id"] = slot.id;
-                slot_data["task_id"] = slot.task_id;
-                slot_data["state"] = slot.state;
-                slot_data["prompt"] = slot.prompt;
-                slot_data["next_token"] = {
-                        {"has_next_token",       slot.has_next_token},
-                        {"n_remain",             slot.n_remaining},
-                        {"num_tokens_predicted", slot.n_decoded},
-                        {"stopped_eos",          slot.stopped_eos},
-                        {"stopped_word",         slot.stopped_word},
-                        {"stopped_limit",        slot.stopped_limit},
-                        {"stopping_word",        slot.stopping_word},
-                };
-
-                slots.push_back(slot_data);
-            }
-            res.set_content(slots.dump(), "application/json");
+            // request slots data using task queue
+            task_server task;
+            task.id = llama.queue_tasks.get_new_id();
+            task.type = TASK_TYPE_SLOTS_DATA;
+            task.target_id = -1;
+
+            llama.queue_results.add_waiting_task_id(task.id);
+            llama.queue_tasks.post(task);
+
+            // get the result
+            task_result result = llama.queue_results.recv(task.id);
+            llama.queue_results.remove_waiting_task_id(task.id);
+
+            res.set_content(result.result_json["slots"].dump(), "application/json");
             res.status = 200; // HTTP OK
         });
     }
@@ -49,7 +49,8 @@ enum server_state {
 enum task_type {
     TASK_TYPE_COMPLETION,
     TASK_TYPE_CANCEL,
-    TASK_TYPE_NEXT_RESPONSE
+    TASK_TYPE_NEXT_RESPONSE,
+    TASK_TYPE_SLOTS_DATA
 };
 
 struct task_server {