mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
server : slots monitoring endpoint (#5550)
This commit is contained in:
parent
689a091bbe
commit
c145f8a132
@ -40,6 +40,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
|
|||||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
||||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
||||||
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
|
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
|
||||||
|
- `--slots-endpoint-disable`: Disable the slots state monitoring endpoint. The slots state may contain user data, including prompts.
|
||||||
|
|
||||||
## Build
|
## Build
|
||||||
|
|
||||||
@ -381,6 +382,69 @@ Notice that each `probs` is an array of length `n_probs`.
|
|||||||
}'
|
}'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
- **GET** `/slots`: Returns the current processing state of the slots. Can be disabled with `--slots-endpoint-disable`.
|
||||||
|
|
||||||
|
### Result JSON
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"dynatemp_exponent": 1.0,
|
||||||
|
"dynatemp_range": 0.0,
|
||||||
|
"frequency_penalty": 0.0,
|
||||||
|
"grammar": "",
|
||||||
|
"id": 0,
|
||||||
|
"ignore_eos": false,
|
||||||
|
"logit_bias": [],
|
||||||
|
"min_p": 0.05000000074505806,
|
||||||
|
"mirostat": 0,
|
||||||
|
"mirostat_eta": 0.10000000149011612,
|
||||||
|
"mirostat_tau": 5.0,
|
||||||
|
"model": "llama-2-7b-32k-instruct.Q2_K.gguf",
|
||||||
|
"n_ctx": 2048,
|
||||||
|
"n_keep": 0,
|
||||||
|
"n_predict": 100000,
|
||||||
|
"n_probs": 0,
|
||||||
|
"next_token": {
|
||||||
|
"has_next_token": true,
|
||||||
|
"n_remain": -1,
|
||||||
|
"num_tokens_predicted": 0,
|
||||||
|
"stopped_eos": false,
|
||||||
|
"stopped_limit": false,
|
||||||
|
"stopped_word": false,
|
||||||
|
"stopping_word": ""
|
||||||
|
},
|
||||||
|
"penalize_nl": true,
|
||||||
|
"penalty_prompt_tokens": [],
|
||||||
|
"presence_penalty": 0.0,
|
||||||
|
"prompt": "Say hello to llama.cpp",
|
||||||
|
"repeat_last_n": 64,
|
||||||
|
"repeat_penalty": 1.100000023841858,
|
||||||
|
"samplers": [
|
||||||
|
"top_k",
|
||||||
|
"tfs_z",
|
||||||
|
"typical_p",
|
||||||
|
"top_p",
|
||||||
|
"min_p",
|
||||||
|
"temperature"
|
||||||
|
],
|
||||||
|
"seed": 42,
|
||||||
|
"state": 1,
|
||||||
|
"stop": [
|
||||||
|
"\n"
|
||||||
|
],
|
||||||
|
"stream": false,
|
||||||
|
"task_id": 0,
|
||||||
|
"temperature": 0.0,
|
||||||
|
"tfs_z": 1.0,
|
||||||
|
"top_k": 40,
|
||||||
|
"top_p": 0.949999988079071,
|
||||||
|
"typical_p": 1.0,
|
||||||
|
"use_penalty_prompt_tokens": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
## More examples
|
## More examples
|
||||||
|
|
||||||
### Change system prompt on runtime
|
### Change system prompt on runtime
|
||||||
|
@ -41,6 +41,7 @@ struct server_params
|
|||||||
int32_t port = 8080;
|
int32_t port = 8080;
|
||||||
int32_t read_timeout = 600;
|
int32_t read_timeout = 600;
|
||||||
int32_t write_timeout = 600;
|
int32_t write_timeout = 600;
|
||||||
|
bool slots_endpoint = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
bool server_verbose = false;
|
bool server_verbose = false;
|
||||||
@ -1926,6 +1927,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||||||
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
|
||||||
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
|
||||||
printf(" --log-disable disables logging to a file.\n");
|
printf(" --log-disable disables logging to a file.\n");
|
||||||
|
printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
|
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
|
||||||
printf(" --override-kv KEY=TYPE:VALUE\n");
|
printf(" --override-kv KEY=TYPE:VALUE\n");
|
||||||
@ -2374,6 +2376,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||||||
log_set_target(stdout);
|
log_set_target(stdout);
|
||||||
LOG_INFO("logging to file is disabled.", {});
|
LOG_INFO("logging to file is disabled.", {});
|
||||||
}
|
}
|
||||||
|
else if (arg == "--slots-endpoint-disable")
|
||||||
|
{
|
||||||
|
sparams.slots_endpoint = false;
|
||||||
|
}
|
||||||
else if (arg == "--chat-template")
|
else if (arg == "--chat-template")
|
||||||
{
|
{
|
||||||
if (++i >= argc)
|
if (++i >= argc)
|
||||||
@ -2619,6 +2625,32 @@ int main(int argc, char **argv)
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if (sparams.slots_endpoint) {
|
||||||
|
svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
|
||||||
|
json slots;
|
||||||
|
for (llama_client_slot & slot : llama.slots) {
|
||||||
|
json slot_data = llama.get_formated_generation(slot);
|
||||||
|
slot_data["id"] = slot.id;
|
||||||
|
slot_data["task_id"] = slot.task_id;
|
||||||
|
slot_data["state"] = slot.state;
|
||||||
|
slot_data["prompt"] = slot.prompt;
|
||||||
|
slot_data["next_token"] = {
|
||||||
|
{"has_next_token", slot.has_next_token},
|
||||||
|
{"n_remain", slot.n_remaining},
|
||||||
|
{"num_tokens_predicted", slot.n_decoded},
|
||||||
|
{"stopped_eos", slot.stopped_eos},
|
||||||
|
{"stopped_word", slot.stopped_word},
|
||||||
|
{"stopped_limit", slot.stopped_limit},
|
||||||
|
{"stopping_word", slot.stopping_word},
|
||||||
|
};
|
||||||
|
|
||||||
|
slots.push_back(slot_data);
|
||||||
|
}
|
||||||
|
res.set_content(slots.dump(), "application/json");
|
||||||
|
res.status = 200; // HTTP OK
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
svr.set_logger(log_server_request);
|
svr.set_logger(log_server_request);
|
||||||
|
|
||||||
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
|
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
|
||||||
|
Loading…
Reference in New Issue
Block a user