server : defer tasks when "slot unavailable" (#5018)

* server: defer task when no slot is available

* remove unnecessary log

---------

Co-authored-by: Xuan Son Nguyen <xuanson.nguyen@snowpack.eu>
This commit is contained in:
Xuan Son Nguyen 2024-01-18 21:33:05 +01:00 committed by GitHub
parent 96d7f56d29
commit 821f0a271e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1558,6 +1558,7 @@ struct llama_server_context
void process_tasks() void process_tasks()
{ {
std::unique_lock<std::mutex> lock(mutex_tasks); std::unique_lock<std::mutex> lock(mutex_tasks);
std::vector<task_server> deferred_tasks;
while (!queue_tasks.empty()) while (!queue_tasks.empty())
{ {
task_server task = queue_tasks.front(); task_server task = queue_tasks.front();
@@ -1568,9 +1569,8 @@ struct llama_server_context
llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
if (slot == nullptr) if (slot == nullptr)
{ {
LOG_TEE("slot unavailable\n"); // if no slot is available, we defer this task for processing later
// send error result deferred_tasks.push_back(task);
send_error(task, "slot unavailable");
break; break;
} }
@@ -1616,6 +1616,12 @@ struct llama_server_context
} }
} }
// add all the deferred tasks back to the queue
for (task_server &task : deferred_tasks)
{
queue_tasks.push_back(task);
}
// remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue
std::vector<task_result> agg_results; std::vector<task_result> agg_results;
auto queue_iterator = queue_multitasks.begin(); auto queue_iterator = queue_multitasks.begin();