mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 20:14:29 +00:00
server: Add standby-timeout
Add standby-timeout: a timeout that automatically terminates the server after it has been unused for a given amount of time.
This commit is contained in:
parent
26a8406ba9
commit
9a8df14d5c
@ -1785,6 +1785,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||||||
params.n_cache_reuse = value;
|
params.n_cache_reuse = value;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
|
||||||
|
add_opt(common_arg(
|
||||||
|
{"--standby-timeout"}, "N",
|
||||||
|
string_format("seconds that must pass since a request has been served, before the server stops automatically; values <= 0 disable the timeout (default: %d)", params.standby_timeout),
|
||||||
|
[](common_params & params, int value) {
|
||||||
|
params.standby_timeout = value;
|
||||||
|
}
|
||||||
|
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
|
||||||
add_opt(common_arg(
|
add_opt(common_arg(
|
||||||
{"--metrics"},
|
{"--metrics"},
|
||||||
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
|
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
|
||||||
|
@ -306,6 +306,7 @@ struct common_params {
|
|||||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||||
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
|
||||||
|
int32_t standby_timeout = 0; // seconds that must pass since a request has been processed before the server terminates in order to save resources; values <= 0 disable automatic termination
|
||||||
|
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::string public_path = ""; // NOLINT
|
std::string public_path = ""; // NOLINT
|
||||||
|
@ -29,6 +29,8 @@
|
|||||||
#include <thread>
|
#include <thread>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <unordered_set>
|
#include <unordered_set>
|
||||||
|
#include <chrono>
|
||||||
|
#include <variant>
|
||||||
|
|
||||||
using json = nlohmann::ordered_json;
|
using json = nlohmann::ordered_json;
|
||||||
|
|
||||||
@ -1162,6 +1164,16 @@ struct server_metrics {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Shutdown reason: a signal was delivered to the process (built by signal_handler).
struct Signal {
|
||||||
|
int number; // signal number as passed to signal_handler
|
||||||
|
};
|
||||||
|
|
||||||
|
// Shutdown reason: the task queue wait in server_queue::start_loop timed out after standby_timeout seconds.
struct StandbyTimeout {};
|
||||||
|
|
||||||
|
// Why the server is shutting down: either a received signal or an expired standby timeout.
using ShutdownReason = std::variant<Signal, StandbyTimeout>;
|
||||||
|
|
||||||
|
// Global shutdown callback, assigned in main(). It is invoked by signal_handler with a Signal
// and by server_queue::start_loop with StandbyTimeout once the standby timeout is reached.
std::function<void(ShutdownReason)> shutdown_handler;
|
||||||
|
|
||||||
struct server_queue {
|
struct server_queue {
|
||||||
int id = 0;
|
int id = 0;
|
||||||
bool running;
|
bool running;
|
||||||
@ -1258,7 +1270,7 @@ struct server_queue {
|
|||||||
* - Check if multitask is finished
|
* - Check if multitask is finished
|
||||||
* - Update all slots
|
* - Update all slots
|
||||||
*/
|
*/
|
||||||
void start_loop() {
|
void start_loop(int standby_timeout) {
|
||||||
running = true;
|
running = true;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
@ -1291,9 +1303,19 @@ struct server_queue {
|
|||||||
QUE_DBG("%s", "terminate\n");
|
QUE_DBG("%s", "terminate\n");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
condition_tasks.wait(lock, [&]{
|
const auto pred = [&] {
|
||||||
return (!queue_tasks.empty() || !running);
|
return (!queue_tasks.empty() || !running);
|
||||||
});
|
};
|
||||||
|
if (standby_timeout > 0) {
|
||||||
|
if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
|
||||||
|
lock.release()->unlock(); // unlock the unique_lock, before calling the shutdown_handler, as it tries to lock it
|
||||||
|
QUE_INF("%s", "stand-by timeout reached\n");
|
||||||
|
shutdown_handler(StandbyTimeout{});
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
condition_tasks.wait(lock, pred);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2884,7 +2906,6 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
|
|||||||
LOG_DBG("response: %s\n", res.body.c_str());
|
LOG_DBG("response: %s\n", res.body.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::function<void(int)> shutdown_handler;
|
|
||||||
std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
|
std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
|
||||||
|
|
||||||
inline void signal_handler(int signal) {
|
inline void signal_handler(int signal) {
|
||||||
@ -2895,7 +2916,7 @@ inline void signal_handler(int signal) {
|
|||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
shutdown_handler(signal);
|
shutdown_handler(Signal{ signal });
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
@ -3935,13 +3956,13 @@ int main(int argc, char ** argv) {
|
|||||||
ctx_server.queue_tasks.on_update_slots(std::bind(
|
ctx_server.queue_tasks.on_update_slots(std::bind(
|
||||||
&server_context::update_slots, &ctx_server));
|
&server_context::update_slots, &ctx_server));
|
||||||
|
|
||||||
shutdown_handler = [&](int) {
|
shutdown_handler = [&](ShutdownReason) {
|
||||||
ctx_server.queue_tasks.terminate();
|
ctx_server.queue_tasks.terminate();
|
||||||
};
|
};
|
||||||
|
|
||||||
LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
|
LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
|
||||||
|
|
||||||
ctx_server.queue_tasks.start_loop();
|
ctx_server.queue_tasks.start_loop(params.standby_timeout);
|
||||||
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||||
struct sigaction sigint_action;
|
struct sigaction sigint_action;
|
||||||
|
Loading…
Reference in New Issue
Block a user