Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-24 10:24:35 +00:00)

server : slots monitoring endpoint (#5550)

commit c145f8a132 (parent 689a091bbe)
@@ -40,6 +40,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend (default: 1 = disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend (default: 512), used together with group attention factor `--grp-attn-n`
 - `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
+- `--slots-endpoint-disable`: Disable the slots state monitoring endpoint. Slots state may contain user data, prompts included.

 ## Build

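The new flag is opt-out: the endpoint is served unless explicitly disabled. As a standalone illustration of that pattern, here is a sketch with hypothetical names (the real wiring is in the server.cpp hunks further down):

```cpp
// Sketch of the opt-out flag pattern this commit uses: a bool that
// defaults to true and is flipped off by a --*-disable argument.
// Struct and function names here are illustrative, not the server's.
#include <cstring>

struct demo_params {
    bool slots_endpoint = true; // endpoint is on by default
};

static void demo_parse(int argc, char ** argv, demo_params & p) {
    for (int i = 1; i < argc; i++) {
        if (std::strcmp(argv[i], "--slots-endpoint-disable") == 0) {
            p.slots_endpoint = false; // privacy opt-out: slots can expose prompts
        }
    }
}
```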
@@ -381,6 +382,69 @@ Notice that each `probs` is an array of length `n_probs`.
 }'
 ```

+- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
+
+### Result JSON
+
+```json
+[
+    {
+        "dynatemp_exponent": 1.0,
+        "dynatemp_range": 0.0,
+        "frequency_penalty": 0.0,
+        "grammar": "",
+        "id": 0,
+        "ignore_eos": false,
+        "logit_bias": [],
+        "min_p": 0.05000000074505806,
+        "mirostat": 0,
+        "mirostat_eta": 0.10000000149011612,
+        "mirostat_tau": 5.0,
+        "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
+        "n_ctx": 2048,
+        "n_keep": 0,
+        "n_predict": 100000,
+        "n_probs": 0,
+        "next_token": {
+            "has_next_token": true,
+            "n_remain": -1,
+            "num_tokens_predicted": 0,
+            "stopped_eos": false,
+            "stopped_limit": false,
+            "stopped_word": false,
+            "stopping_word": ""
+        },
+        "penalize_nl": true,
+        "penalty_prompt_tokens": [],
+        "presence_penalty": 0.0,
+        "prompt": "Say hello to llama.cpp",
+        "repeat_last_n": 64,
+        "repeat_penalty": 1.100000023841858,
+        "samplers": [
+            "top_k",
+            "tfs_z",
+            "typical_p",
+            "top_p",
+            "min_p",
+            "temperature"
+        ],
+        "seed": 42,
+        "state": 1,
+        "stop": [
+            "\n"
+        ],
+        "stream": false,
+        "task_id": 0,
+        "temperature": 0.0,
+        "tfs_z": 1.0,
+        "top_k": 40,
+        "top_p": 0.949999988079071,
+        "typical_p": 1.0,
+        "use_penalty_prompt_tokens": false
+    }
+]
+```
+
 ## More examples

 ### Change system prompt on runtime

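For context (not part of the commit): a minimal client-side poller for the new endpoint, sketched against cpp-httplib and nlohmann/json, the same libraries the server example already vendors. Host, port, and field names follow the sample response above.

```cpp
// Minimal /slots poller sketch (assumes cpp-httplib and nlohmann/json;
// not part of this commit, field names taken from the sample response above).
#include <cstdio>
#include "httplib.h"
#include "json.hpp"

int main() {
    httplib::Client cli("localhost", 8080); // the server's default --port
    auto res = cli.Get("/slots");
    if (!res || res->status != 200) {
        std::fprintf(stderr, "/slots unavailable (server down or started with --slots-endpoint-disable)\n");
        return 1;
    }
    // The endpoint returns a JSON array with one object per slot.
    for (const auto & slot : nlohmann::json::parse(res->body)) {
        std::printf("slot %d: state=%d task_id=%d predicted=%d\n",
                    slot["id"].get<int>(),
                    slot["state"].get<int>(),
                    slot["task_id"].get<int>(),
                    slot["next_token"]["num_tokens_predicted"].get<int>());
    }
    return 0;
}
```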
@@ -41,6 +41,7 @@ struct server_params
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool slots_endpoint = true;
 };

 bool server_verbose = false;
@@ -1926,6 +1927,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
     printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
     printf("  --log-disable             disables logging to a file.\n");
+    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
     printf("\n");
     printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
@@ -2374,6 +2376,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             log_set_target(stdout);
             LOG_INFO("logging to file is disabled.", {});
         }
+        else if (arg == "--slots-endpoint-disable")
+        {
+            sparams.slots_endpoint = false;
+        }
         else if (arg == "--chat-template")
         {
             if (++i >= argc)
@@ -2619,6 +2625,32 @@ int main(int argc, char **argv)
         }
     });

+    if (sparams.slots_endpoint) {
+        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
+            json slots;
+            for (llama_client_slot & slot : llama.slots) {
+                json slot_data = llama.get_formated_generation(slot);
+                slot_data["id"] = slot.id;
+                slot_data["task_id"] = slot.task_id;
+                slot_data["state"] = slot.state;
+                slot_data["prompt"] = slot.prompt;
+                slot_data["next_token"] = {
+                    {"has_next_token",       slot.has_next_token},
+                    {"n_remain",             slot.n_remaining},
+                    {"num_tokens_predicted", slot.n_decoded},
+                    {"stopped_eos",          slot.stopped_eos},
+                    {"stopped_word",         slot.stopped_word},
+                    {"stopped_limit",        slot.stopped_limit},
+                    {"stopping_word",        slot.stopping_word},
+                };
+
+                slots.push_back(slot_data);
+            }
+            res.set_content(slots.dump(), "application/json");
+            res.status = 200; // HTTP OK
+        });
+    }
+
     svr.set_logger(log_server_request);

     svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
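One note for reading the response: the `state` integer the handler serializes is the server's internal `slot_state`. Around this revision of server.cpp the enum looks like the following (a reference sketch, worth verifying against the tree), which is why `"state": 1` in the sample response above means the slot is busy processing.

```cpp
// slot_state as defined in server.cpp around this revision (for reference;
// the /slots handler serializes these values as plain integers).
enum slot_state
{
    IDLE,       // 0: slot is free and waiting for a task
    PROCESSING, // 1: slot is generating tokens for a task
};
```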