From 9a8df14d5c6deca4d9dc4125e549dce3f17630e7 Mon Sep 17 00:00:00 2001 From: johannes Date: Mon, 9 Dec 2024 22:56:27 +0100 Subject: [PATCH 1/7] server: Add standby-timeout Add standby-timeout. A timeout for automatically terminating the server after being unused for a certain amount of time --- common/arg.cpp | 7 +++++++ common/common.h | 1 + examples/server/server.cpp | 37 +++++++++++++++++++++++++++++-------- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0db59f701..733947930 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1785,6 +1785,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.n_cache_reuse = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); + add_opt(common_arg( + {"--standby-timeout"}, "N", + string_format("time that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout), + [](common_params & params, int value) { + params.standby_timeout = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT")); add_opt(common_arg( {"--metrics"}, string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), diff --git a/common/common.h b/common/common.h index 95d20401d..b1f19eb32 100644 --- a/common/common.h +++ b/common/common.h @@ -306,6 +306,7 @@ struct common_params { int32_t timeout_write = timeout_read; // http write timeout in seconds int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting + int32_t standby_timeout = 0; // time that must pass since a request has been processed before server terminates in order to save resources. If -1, then never terminate automatically. 
std::string hostname = "127.0.0.1"; std::string public_path = ""; // NOLINT diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 47bfd6c4a..b3b321a47 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -29,6 +29,8 @@ #include #include #include +#include +#include using json = nlohmann::ordered_json; @@ -1162,6 +1164,16 @@ struct server_metrics { } }; +struct Signal { + int number; +}; + +struct StandbyTimeout {}; + +using ShutdownReason = std::variant; + +std::function shutdown_handler; + struct server_queue { int id = 0; bool running; @@ -1258,7 +1270,7 @@ struct server_queue { * - Check if multitask is finished * - Update all slots */ - void start_loop() { + void start_loop(int standby_timeout) { running = true; while (true) { @@ -1291,9 +1303,19 @@ struct server_queue { QUE_DBG("%s", "terminate\n"); return; } - condition_tasks.wait(lock, [&]{ - return (!queue_tasks.empty() || !running); - }); + const auto pred = [&] { + return (!queue_tasks.empty() || !running); + }; + if (standby_timeout > 0) { + if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) { + lock.release()->unlock(); // unlock the unique_lock, before calling the shutdown_handler, as it tries to lock it + QUE_INF("%s", "stand-by timeout reached\n"); + shutdown_handler(StandbyTimeout{}); + break; + } + } else { + condition_tasks.wait(lock, pred); + } } } } @@ -2884,7 +2906,6 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp LOG_DBG("response: %s\n", res.body.c_str()); } -std::function shutdown_handler; std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; inline void signal_handler(int signal) { @@ -2895,7 +2916,7 @@ inline void signal_handler(int signal) { exit(1); } - shutdown_handler(signal); + shutdown_handler(Signal{ signal }); } int main(int argc, char ** argv) { @@ -3935,13 +3956,13 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.on_update_slots(std::bind( 
&server_context::update_slots, &ctx_server)); - shutdown_handler = [&](int) { + shutdown_handler = [&](ShutdownReason) { ctx_server.queue_tasks.terminate(); }; LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); - ctx_server.queue_tasks.start_loop(); + ctx_server.queue_tasks.start_loop(params.standby_timeout); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; From 0468a01c9c6da13ed833ddb8a5519d2e6e9bde26 Mon Sep 17 00:00:00 2001 From: johannes Date: Mon, 9 Dec 2024 23:55:22 +0100 Subject: [PATCH 2/7] server: Improve wording to make clear that standby-timeout is measured in seconds --- common/arg.cpp | 2 +- common/common.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 733947930..1e20d7464 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1787,7 +1787,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); add_opt(common_arg( {"--standby-timeout"}, "N", - string_format("time that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout), + string_format("seconds that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout), [](common_params & params, int value) { params.standby_timeout = value; } diff --git a/common/common.h b/common/common.h index b1f19eb32..9f0582365 100644 --- a/common/common.h +++ b/common/common.h @@ -306,7 +306,7 @@ struct common_params { int32_t timeout_write = timeout_read; // http write timeout in seconds int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting - int32_t standby_timeout = 0; // time 
that must pass since a request has been processed before server terminates in order to save resources. If -1, then never terminate automatically. + int32_t standby_timeout = 0; // seconds that must pass since a request has been processed before the server terminates in order to save resources. Values <= 0 (including the default) disable automatic termination. std::string hostname = "127.0.0.1"; std::string public_path = ""; // NOLINT From 4fd985af9144676dc711429372845d64a2cd4b61 Mon Sep 17 00:00:00 2001 From: johannes Date: Mon, 9 Dec 2024 23:55:36 +0100 Subject: [PATCH 3/7] server: Update README to include standby-timeout --- examples/server/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server/README.md b/examples/server/README.md index 117c52d3f..7f32d0d37 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -155,6 +155,7 @@ The project is under active development, and we are [looking for feedback and co | `-to, --timeout N` | server read/write timeout in seconds (default: 600)
(env: LLAMA_ARG_TIMEOUT) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
(env: LLAMA_ARG_CACHE_REUSE) | +| `--standby-timeout N` | seconds that must pass since a request has been served, before the server stops automatically; values <= 0 disable the timeout (default: 0)
(env: LLAMA_ARG_STANDBY_TIMEOUT) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--slots` | enable slots monitoring endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | From a4108f59bd688b94ac3c99321edf21c58a7ec651 Mon Sep 17 00:00:00 2001 From: johannes Date: Mon, 9 Dec 2024 23:55:51 +0100 Subject: [PATCH 4/7] server: Adhere to naming conventions for shutdown_reasons --- examples/server/server.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b3b321a47..8b58a275b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1164,15 +1164,15 @@ struct server_metrics { } }; -struct Signal { +struct termination_signal { int number; }; -struct StandbyTimeout {}; +struct standby_timeout {}; -using ShutdownReason = std::variant; +using shutdown_reason = std::variant; -std::function shutdown_handler; +std::function shutdown_handler; struct server_queue { int id = 0; @@ -1310,7 +1310,7 @@ struct server_queue { if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) { lock.release()->unlock(); // unlock the unique_lock, before calling the shutdown_handler, as it tries to lock it QUE_INF("%s", "stand-by timeout reached\n"); - shutdown_handler(StandbyTimeout{}); + shutdown_handler(::standby_timeout{}); break; } } else { @@ -2916,7 +2916,7 @@ inline void signal_handler(int signal) { exit(1); } - shutdown_handler(Signal{ signal }); + shutdown_handler(termination_signal{ signal }); } int main(int argc, char ** argv) { @@ -3956,7 +3956,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.on_update_slots(std::bind( &server_context::update_slots, &ctx_server)); - shutdown_handler = [&](ShutdownReason) { + shutdown_handler = [&](shutdown_reason) { ctx_server.queue_tasks.terminate(); }; From acbac00f0d6e1a0e42f2d5d0fc391a7a628a3b27 Mon Sep 17 00:00:00 2001 From: johannes Date: Wed, 11 Dec 2024 08:32:12 +0100 Subject: [PATCH 5/7] server: Return shutdown_handler to its initial state and use running = false for termination --- examples/server/server.cpp | 18 
++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8b58a275b..fe2d0bd1a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1164,16 +1164,6 @@ struct server_metrics { } }; -struct termination_signal { - int number; -}; - -struct standby_timeout {}; - -using shutdown_reason = std::variant; - -std::function shutdown_handler; - struct server_queue { int id = 0; bool running; @@ -1308,9 +1298,8 @@ struct server_queue { }; if (standby_timeout > 0) { if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) { - lock.release()->unlock(); // unlock the unique_lock, before calling the shutdown_handler, as it tries to lock it QUE_INF("%s", "stand-by timeout reached\n"); - shutdown_handler(::standby_timeout{}); + running = false; break; } } else { @@ -2906,6 +2895,7 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp LOG_DBG("response: %s\n", res.body.c_str()); } +std::function shutdown_handler; std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; inline void signal_handler(int signal) { @@ -2916,7 +2906,7 @@ inline void signal_handler(int signal) { exit(1); } - shutdown_handler(termination_signal{ signal }); + shutdown_handler(signal); } int main(int argc, char ** argv) { @@ -3956,7 +3946,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.on_update_slots(std::bind( &server_context::update_slots, &ctx_server)); - shutdown_handler = [&](shutdown_reason) { + shutdown_handler = [&](int) { ctx_server.queue_tasks.terminate(); }; From 4fd58a8013c9bdacf86e817ffd80e0136f5846f8 Mon Sep 17 00:00:00 2001 From: johannes Date: Wed, 11 Dec 2024 08:33:24 +0100 Subject: [PATCH 6/7] server: Initialize standby_timeout over constructor instead of passing as argument --- examples/server/server.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp 
b/examples/server/server.cpp index fe2d0bd1a..a9ba966b7 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1179,6 +1179,8 @@ struct server_queue { std::function callback_new_task; std::function callback_update_slots; + int standby_timeout; + // Add a new task to the end of the queue int post(server_task task, bool front = false) { std::unique_lock lock(mutex_tasks); @@ -1260,7 +1262,7 @@ struct server_queue { * - Check if multitask is finished * - Update all slots */ - void start_loop(int standby_timeout) { + void start_loop() { running = true; while (true) { @@ -1428,6 +1430,10 @@ struct server_context { // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; + server_context() { + queue_tasks.standby_timeout = params_base.standby_timeout; + } + ~server_context() { if (ctx) { llama_free(ctx); @@ -3952,7 +3958,7 @@ int main(int argc, char ** argv) { LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port); - ctx_server.queue_tasks.start_loop(params.standby_timeout); + ctx_server.queue_tasks.start_loop(); #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) struct sigaction sigint_action; From 7006dd784c1facc919cd5cbe649308a7a45a0bf7 Mon Sep 17 00:00:00 2001 From: johannes Date: Wed, 11 Dec 2024 08:41:51 +0100 Subject: [PATCH 7/7] server: Propagate standby_timeout after it has been initialized --- examples/server/server.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a9ba966b7..e009d305b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1430,10 +1430,6 @@ struct server_context { // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; - server_context() { - queue_tasks.standby_timeout = params_base.standby_timeout; - } - ~server_context() { if (ctx) { llama_free(ctx); @@ 
-1485,6 +1481,8 @@ struct server_context { n_ctx = llama_n_ctx(ctx); + queue_tasks.standby_timeout = params.standby_timeout; + add_bos_token = llama_add_bos_token(model); has_eos_token = !llama_add_eos_token(model);