From 9a8df14d5c6deca4d9dc4125e549dce3f17630e7 Mon Sep 17 00:00:00 2001
From: johannes <johannesmiesenhardt@gmail.com>
Date: Mon, 9 Dec 2024 22:56:27 +0100
Subject: [PATCH 1/7] server: Add standby-timeout

Add a --standby-timeout option: the server terminates automatically once
it has gone the given number of seconds without receiving a new task.
Values <= 0 (the default is 0) disable the timeout.
---
 common/arg.cpp             |  7 +++++++
 common/common.h            |  1 +
 examples/server/server.cpp | 37 +++++++++++++++++++++++++++++--------
 3 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/common/arg.cpp b/common/arg.cpp
index 0db59f701..733947930 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1785,6 +1785,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_cache_reuse = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
+    add_opt(common_arg(
+        {"--standby-timeout"}, "N",
+        string_format("time that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout),
+        [](common_params & params, int value) {
+            params.standby_timeout = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
     add_opt(common_arg(
         {"--metrics"},
         string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),
diff --git a/common/common.h b/common/common.h
index 95d20401d..b1f19eb32 100644
--- a/common/common.h
+++ b/common/common.h
@@ -306,6 +306,7 @@ struct common_params {
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
+    int32_t standby_timeout  = 0;          // time that must pass since a request has been processed before server terminates in order to save resources. If -1, then never terminate automatically.
 
     std::string hostname      = "127.0.0.1";
     std::string public_path   = "";                                                                         // NOLINT
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 47bfd6c4a..b3b321a47 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -29,6 +29,8 @@
 #include <thread>
 #include <unordered_map>
 #include <unordered_set>
+#include <chrono>
+#include <variant>
 
 using json = nlohmann::ordered_json;
 
@@ -1162,6 +1164,16 @@ struct server_metrics {
     }
 };
 
+struct Signal {
+    int number;
+};
+
+struct StandbyTimeout {};
+
+using ShutdownReason = std::variant<Signal, StandbyTimeout>;
+
+std::function<void(ShutdownReason)> shutdown_handler;
+
 struct server_queue {
     int id = 0;
     bool running;
@@ -1258,7 +1270,7 @@ struct server_queue {
      * - Check if multitask is finished
      * - Update all slots
      */
-    void start_loop() {
+    void start_loop(int standby_timeout) {
         running = true;
 
         while (true) {
@@ -1291,9 +1303,19 @@ struct server_queue {
                         QUE_DBG("%s", "terminate\n");
                         return;
                     }
-                    condition_tasks.wait(lock, [&]{
-                        return (!queue_tasks.empty() || !running);
-                    });
+                    const auto pred = [&] {
+                            return (!queue_tasks.empty() || !running);
+                    };
+                    if (standby_timeout > 0) {
+                        if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
+                            lock.release()->unlock(); // unlock the unique_lock, before calling the shutdown_handler, as it tries to lock it
+                            QUE_INF("%s", "stand-by timeout reached\n");
+                            shutdown_handler(StandbyTimeout{});
+                            break;
+                        }
+                    } else {
+                        condition_tasks.wait(lock, pred);
+                    }
                 }
             }
         }
@@ -2884,7 +2906,6 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
     LOG_DBG("response: %s\n", res.body.c_str());
 }
 
-std::function<void(int)> shutdown_handler;
 std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
 
 inline void signal_handler(int signal) {
@@ -2895,7 +2916,7 @@ inline void signal_handler(int signal) {
         exit(1);
     }
 
-    shutdown_handler(signal);
+    shutdown_handler(Signal{ signal });
 }
 
 int main(int argc, char ** argv) {
@@ -3935,13 +3956,13 @@ int main(int argc, char ** argv) {
     ctx_server.queue_tasks.on_update_slots(std::bind(
                 &server_context::update_slots, &ctx_server));
 
-    shutdown_handler = [&](int) {
+    shutdown_handler = [&](ShutdownReason) {
         ctx_server.queue_tasks.terminate();
     };
 
     LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
 
-    ctx_server.queue_tasks.start_loop();
+    ctx_server.queue_tasks.start_loop(params.standby_timeout);
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
     struct sigaction sigint_action;

From 0468a01c9c6da13ed833ddb8a5519d2e6e9bde26 Mon Sep 17 00:00:00 2001
From: johannes <johannesmiesenhardt@gmail.com>
Date: Mon, 9 Dec 2024 23:55:22 +0100
Subject: [PATCH 2/7] server: Improve wording to make clear that
 standby-timeout is measured in seconds

---
 common/arg.cpp  | 2 +-
 common/common.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 733947930..1e20d7464 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1787,7 +1787,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
     add_opt(common_arg(
         {"--standby-timeout"}, "N",
-        string_format("time that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout),
+        string_format("seconds that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout),
         [](common_params & params, int value) {
             params.standby_timeout = value;
         }
diff --git a/common/common.h b/common/common.h
index b1f19eb32..9f0582365 100644
--- a/common/common.h
+++ b/common/common.h
@@ -306,7 +306,7 @@ struct common_params {
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
     int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting
-    int32_t standby_timeout  = 0;          // time that must pass since a request has been processed before server terminates in order to save resources. If -1, then never terminate automatically.
+    int32_t standby_timeout  = 0;          // seconds that must pass since a request has been processed before server terminates in order to save resources. Values <= 0 disable the timeout (never terminate automatically).
 
     std::string hostname      = "127.0.0.1";
     std::string public_path   = "";                                                                         // NOLINT

From 4fd985af9144676dc711429372845d64a2cd4b61 Mon Sep 17 00:00:00 2001
From: johannes <johannesmiesenhardt@gmail.com>
Date: Mon, 9 Dec 2024 23:55:36 +0100
Subject: [PATCH 3/7] server: Update README to include standby-timeout

---
 examples/server/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/server/README.md b/examples/server/README.md
index 117c52d3f..7f32d0d37 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -155,6 +155,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
+| `--standby-timeout N` | seconds that must pass since a request has been served, before the server stops automatically (default: 0)<br/>(env: LLAMA_ARG_STANDBY_TIMEOUT) |
 | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
 | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
 | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |

From a4108f59bd688b94ac3c99321edf21c58a7ec651 Mon Sep 17 00:00:00 2001
From: johannes <johannesmiesenhardt@gmail.com>
Date: Mon, 9 Dec 2024 23:55:51 +0100
Subject: [PATCH 4/7] server: Adhere to naming conventions for shutdown_reasons

---
 examples/server/server.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index b3b321a47..8b58a275b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1164,15 +1164,15 @@ struct server_metrics {
     }
 };
 
-struct Signal {
+struct termination_signal {
     int number;
 };
 
-struct StandbyTimeout {};
+struct standby_timeout {};
 
-using ShutdownReason = std::variant<Signal, StandbyTimeout>;
+using shutdown_reason = std::variant<termination_signal, standby_timeout>;
 
-std::function<void(ShutdownReason)> shutdown_handler;
+std::function<void(shutdown_reason)> shutdown_handler;
 
 struct server_queue {
     int id = 0;
@@ -1310,7 +1310,7 @@ struct server_queue {
                         if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
                             lock.release()->unlock(); // unlock the unique_lock, before calling the shutdown_handler, as it tries to lock it
                             QUE_INF("%s", "stand-by timeout reached\n");
-                            shutdown_handler(StandbyTimeout{});
+                            shutdown_handler(::standby_timeout{});
                             break;
                         }
                     } else {
@@ -2916,7 +2916,7 @@ inline void signal_handler(int signal) {
         exit(1);
     }
 
-    shutdown_handler(Signal{ signal });
+    shutdown_handler(termination_signal{ signal });
 }
 
 int main(int argc, char ** argv) {
@@ -3956,7 +3956,7 @@ int main(int argc, char ** argv) {
     ctx_server.queue_tasks.on_update_slots(std::bind(
                 &server_context::update_slots, &ctx_server));
 
-    shutdown_handler = [&](ShutdownReason) {
+    shutdown_handler = [&](shutdown_reason) {
         ctx_server.queue_tasks.terminate();
     };
 

From acbac00f0d6e1a0e42f2d5d0fc391a7a628a3b27 Mon Sep 17 00:00:00 2001
From: johannes <johannesmiesenhardt@gmail.com>
Date: Wed, 11 Dec 2024 08:32:12 +0100
Subject: [PATCH 5/7] server: Return shutdown_handler to its initial state and
 use running = false for termination

---
 examples/server/server.cpp | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 8b58a275b..fe2d0bd1a 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1164,16 +1164,6 @@ struct server_metrics {
     }
 };
 
-struct termination_signal {
-    int number;
-};
-
-struct standby_timeout {};
-
-using shutdown_reason = std::variant<termination_signal, standby_timeout>;
-
-std::function<void(shutdown_reason)> shutdown_handler;
-
 struct server_queue {
     int id = 0;
     bool running;
@@ -1308,9 +1298,8 @@ struct server_queue {
                     };
                     if (standby_timeout > 0) {
                         if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
-                            lock.release()->unlock(); // unlock the unique_lock, before calling the shutdown_handler, as it tries to lock it
                             QUE_INF("%s", "stand-by timeout reached\n");
-                            shutdown_handler(::standby_timeout{});
+                            running = false;
                             break;
                         }
                     } else {
@@ -2906,6 +2895,7 @@ static void log_server_request(const httplib::Request & req, const httplib::Resp
     LOG_DBG("response: %s\n", res.body.c_str());
 }
 
+std::function<void(int)> shutdown_handler;
 std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
 
 inline void signal_handler(int signal) {
@@ -2916,7 +2906,7 @@ inline void signal_handler(int signal) {
         exit(1);
     }
 
-    shutdown_handler(termination_signal{ signal });
+    shutdown_handler(signal);
 }
 
 int main(int argc, char ** argv) {
@@ -3956,7 +3946,7 @@ int main(int argc, char ** argv) {
     ctx_server.queue_tasks.on_update_slots(std::bind(
                 &server_context::update_slots, &ctx_server));
 
-    shutdown_handler = [&](shutdown_reason) {
+    shutdown_handler = [&](int) {
         ctx_server.queue_tasks.terminate();
     };
 

From 4fd58a8013c9bdacf86e817ffd80e0136f5846f8 Mon Sep 17 00:00:00 2001
From: johannes <johannesmiesenhardt@gmail.com>
Date: Wed, 11 Dec 2024 08:33:24 +0100
Subject: [PATCH 6/7] server: Initialize standby_timeout via constructor
 instead of passing as argument

---
 examples/server/server.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index fe2d0bd1a..a9ba966b7 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1179,6 +1179,8 @@ struct server_queue {
     std::function<void(server_task)> callback_new_task;
     std::function<void(void)>        callback_update_slots;
 
+    int standby_timeout;
+
     // Add a new task to the end of the queue
     int post(server_task task, bool front = false) {
         std::unique_lock<std::mutex> lock(mutex_tasks);
@@ -1260,7 +1262,7 @@ struct server_queue {
      * - Check if multitask is finished
      * - Update all slots
      */
-    void start_loop(int standby_timeout) {
+    void start_loop() {
         running = true;
 
         while (true) {
@@ -1428,6 +1430,10 @@ struct server_context {
     // Necessary similarity of prompt for slot selection
     float slot_prompt_similarity = 0.0f;
 
+    server_context() {
+        queue_tasks.standby_timeout = params_base.standby_timeout;
+    }
+
     ~server_context() {
         if (ctx) {
             llama_free(ctx);
@@ -3952,7 +3958,7 @@ int main(int argc, char ** argv) {
 
     LOG_INF("%s: server is listening on http://%s:%d - starting the main loop\n", __func__, params.hostname.c_str(), params.port);
 
-    ctx_server.queue_tasks.start_loop(params.standby_timeout);
+    ctx_server.queue_tasks.start_loop();
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
     struct sigaction sigint_action;

From 7006dd784c1facc919cd5cbe649308a7a45a0bf7 Mon Sep 17 00:00:00 2001
From: johannes <johannesmiesenhardt@gmail.com>
Date: Wed, 11 Dec 2024 08:41:51 +0100
Subject: [PATCH 7/7] server: Propagate standby_timeout after it has been
 initialized

---
 examples/server/server.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index a9ba966b7..e009d305b 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1430,10 +1430,6 @@ struct server_context {
     // Necessary similarity of prompt for slot selection
     float slot_prompt_similarity = 0.0f;
 
-    server_context() {
-        queue_tasks.standby_timeout = params_base.standby_timeout;
-    }
-
     ~server_context() {
         if (ctx) {
             llama_free(ctx);
@@ -1485,6 +1481,8 @@ struct server_context {
 
         n_ctx = llama_n_ctx(ctx);
 
+        queue_tasks.standby_timeout = params.standby_timeout;
+
         add_bos_token = llama_add_bos_token(model);
         has_eos_token = !llama_add_eos_token(model);