server: Add standby-timeout

Add a `--standby-timeout` option: a timeout, in seconds, after which the server terminates automatically if it has not served any request in the meantime.
2024-12-26 11:24:35 +00:00 · 2024-12-09 22:56:27 +01:00 · 2024-12-09 22:56:27 +01:00 · 9a8df14d5c
commit 9a8df14d5c
parent 26a8406ba9
3 changed files with 37 additions and 8 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -1785,6 +1785,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.n_cache_reuse = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
+    add_opt(common_arg( // server-only option: automatic shutdown after a period without requests
+        {"--standby-timeout"}, "N",
+        string_format("time in seconds that must pass since a request has been served, before the server stops automatically, 0 or less = disabled (default: %d)", params.standby_timeout),
+        [](common_params & params, int value) {
+            params.standby_timeout = value; // seconds; values <= 0 turn the automatic shutdown off
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
    add_opt(common_arg(
        {"--metrics"},
        string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
--- a/common/common.h
+++ b/common/common.h
@ -306,6 +306,7 @@ struct common_params {
    int32_t timeout_write  = timeout_read; // http write timeout in seconds
    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
    int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
+    int32_t standby_timeout  = 0;          // idle time in seconds (no queued tasks) after which the server shuts itself down to save resources; values <= 0 disable the automatic shutdown

    std::string hostname      = "127.0.0.1";
    std::string public_path   = "";                                                                         // NOLINT
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -29,6 +29,8 @@
 #include <thread>
 #include <unordered_map>
 #include <unordered_set>
+#include <chrono>
+#include <variant>

 using json = nlohmann::ordered_json;

@ -1162,6 +1164,16 @@ struct server_metrics {
    }
 };

+struct Signal { // shutdown reason: a signal was delivered to the process
+    int number; // signal number as received by signal_handler
+};
+
+struct StandbyTimeout {}; // shutdown reason: the task queue stayed empty for the whole standby timeout
+
+using ShutdownReason = std::variant<Signal, StandbyTimeout>; // tagged reason handed to shutdown_handler
+
+std::function<void(ShutdownReason)> shutdown_handler; // assigned in main(); invoked by signal_handler and by server_queue::start_loop on standby timeout
+
 struct server_queue {
    int id = 0;
    bool running;
@ -1258,7 +1270,7 @@ struct server_queue {
     * - Check if multitask is finished
     * - Update all slots
     */
-    void start_loop() {
+    void start_loop(int standby_timeout) {
        running = true;

        while (true) {
@ -1291,9 +1303,19 @@ struct server_queue {
                        QUE_DBG("%s", "terminate\n");
                        return;
                    }
-                    condition_tasks.wait(lock, [&]{
-                        return (!queue_tasks.empty() || !running);
-                    });
+                    const auto pred = [&] {
+                            return (!queue_tasks.empty() || !running);
+                    };
+                    if (standby_timeout > 0) {
+                        if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
+                            lock.release()->unlock(); // unlock the unique_lock, before calling the shutdown_handler, as it tries to lock it
+                            QUE_INF("%s", "stand-by timeout reached\n");
+                            shutdown_handler(StandbyTimeout{});
+                            break;
+                        }
+                    } else {
+                        condition_tasks.wait(lock, pred);
+                    }
                }
            }
        }
@ -2884,7 +2906,6 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
    LOG_DBG("response: %s\n", res.body.c_str());
 }

-std::function<void(int)> shutdown_handler;
 std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;

 inline void signal_handler(int signal) {
@ -2895,7 +2916,7 @@ inline void signal_handler(int signal) {
        exit(1);
    }

-    shutdown_handler(signal);
+    shutdown_handler(Signal{ signal });
 }

 int main(int argc, char ** argv) {
@ -3935,13 +3956,13 @@ int main(int argc, char ** argv) {
    ctx_server.queue_tasks.on_update_slots(std::bind(
                &server_context::update_slots, &ctx_server));

-    shutdown_handler = [&](int) {
+    shutdown_handler = [&](ShutdownReason) {
        ctx_server.queue_tasks.terminate();
    };

    LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);

-    ctx_server.queue_tasks.start_loop();
+    ctx_server.queue_tasks.start_loop(params.standby_timeout);

 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
    struct sigaction sigint_action;