Compare commits

...

8 Commits

Author SHA1 Message Date
Sumandora
1e21e2a734
Merge 7006dd784c into 30caac3a68 2024-12-24 15:28:34 +03:00
johannes
7006dd784c server: Propagate standby_timeout after it has been initialized 2024-12-11 08:41:51 +01:00
johannes
4fd58a8013 server: Initialize standby_timeout over constructor instead of passing as argument 2024-12-11 08:33:24 +01:00
johannes
acbac00f0d server: Return shutdown_handler to its initial state and use running = false for termination 2024-12-11 08:32:12 +01:00
johannes
a4108f59bd server: Adhere to naming conventions for shutdown_reasons 2024-12-09 23:55:51 +01:00
johannes
4fd985af91 server: Update README to include standby-timeout 2024-12-09 23:55:36 +01:00
johannes
0468a01c9c server: Improve wording to make clear that standby-timeout is measured in seconds 2024-12-09 23:55:22 +01:00
johannes
9a8df14d5c server: Add standby-timeout
Add standby-timeout: a timeout for automatically terminating the server
after it has been unused for a certain amount of time.
2024-12-09 22:56:27 +01:00
4 changed files with 27 additions and 3 deletions

View File

@ -1850,6 +1850,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_cache_reuse = value; params.n_cache_reuse = value;
} }
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE")); ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CACHE_REUSE"));
add_opt(common_arg(
{"--standby-timeout"}, "N",
string_format("seconds that must pass since a request has been served, before the server stops automatically (default: %d)", params.standby_timeout),
[](common_params & params, int value) {
params.standby_timeout = value;
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_STANDBY_TIMEOUT"));
add_opt(common_arg( add_opt(common_arg(
{"--metrics"}, {"--metrics"},
string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"), string_format("enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled"),

View File

@ -320,6 +320,7 @@ struct common_params {
int32_t timeout_write = timeout_read; // http write timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting
int32_t standby_timeout = 0; // seconds that must pass since a request has been processed before server terminates in order to save resources. If -1, then never terminate automatically.
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT std::string public_path = ""; // NOLINT

View File

@ -156,6 +156,7 @@ The project is under active development, and we are [looking for feedback and co
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) | | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) | | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--standby-timeout N` | seconds that must pass since a request has been served, before the server stops automatically (default: 0)<br/>(env: LLAMA_ARG_STANDBY_TIMEOUT) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) | | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |

View File

@ -29,6 +29,8 @@
#include <thread> #include <thread>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <chrono>
#include <variant>
using json = nlohmann::ordered_json; using json = nlohmann::ordered_json;
@ -1281,6 +1283,8 @@ struct server_queue {
std::function<void(server_task)> callback_new_task; std::function<void(server_task)> callback_new_task;
std::function<void(void)> callback_update_slots; std::function<void(void)> callback_update_slots;
int standby_timeout;
// Add a new task to the end of the queue // Add a new task to the end of the queue
int post(server_task task, bool front = false) { int post(server_task task, bool front = false) {
std::unique_lock<std::mutex> lock(mutex_tasks); std::unique_lock<std::mutex> lock(mutex_tasks);
@ -1395,9 +1399,18 @@ struct server_queue {
QUE_DBG("%s", "terminate\n"); QUE_DBG("%s", "terminate\n");
return; return;
} }
condition_tasks.wait(lock, [&]{ const auto pred = [&] {
return (!queue_tasks.empty() || !running); return (!queue_tasks.empty() || !running);
}); };
if (standby_timeout > 0) {
if (!condition_tasks.wait_for(lock, std::chrono::seconds(standby_timeout), pred)) {
QUE_INF("%s", "stand-by timeout reached\n");
running = false;
break;
}
} else {
condition_tasks.wait(lock, pred);
}
} }
} }
} }
@ -1572,6 +1585,8 @@ struct server_context {
n_ctx = llama_n_ctx(ctx); n_ctx = llama_n_ctx(ctx);
queue_tasks.standby_timeout = params.standby_timeout;
add_bos_token = llama_add_bos_token(model); add_bos_token = llama_add_bos_token(model);
has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL; has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL;