diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 93f999298..0462fbd24 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1558,6 +1558,7 @@ struct llama_server_context void process_tasks() { std::unique_lock lock(mutex_tasks); + std::vector deferred_tasks; while (!queue_tasks.empty()) { task_server task = queue_tasks.front(); @@ -1568,9 +1569,8 @@ struct llama_server_context llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); if (slot == nullptr) { - LOG_TEE("slot unavailable\n"); - // send error result - send_error(task, "slot unavailable"); + // if no slot is available, we defer this task for processing later + deferred_tasks.push_back(task); break; } @@ -1616,6 +1616,12 @@ struct llama_server_context } } + // add all the deferred tasks back the the queue + for (task_server &task : deferred_tasks) + { + queue_tasks.push_back(task); + } + // remove finished multitasks from the queue of multitasks, and add the corresponding result to the result queue std::vector agg_results; auto queue_iterator = queue_multitasks.begin();