mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 19:21:46 +00:00
Server: normalize naming (#5779)
* server: normalize naming * fix spacing
This commit is contained in:
parent
d5ab29757e
commit
052051d8ae
@ -33,8 +33,7 @@
|
|||||||
|
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
struct server_params
|
struct server_params {
|
||||||
{
|
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
std::vector<std::string> api_keys;
|
std::vector<std::string> api_keys;
|
||||||
std::string public_path = "examples/server/public";
|
std::string public_path = "examples/server/public";
|
||||||
@ -49,103 +48,50 @@ struct server_params
|
|||||||
bool server_verbose = false;
|
bool server_verbose = false;
|
||||||
bool server_log_json = true;
|
bool server_log_json = true;
|
||||||
|
|
||||||
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
|
enum stop_type {
|
||||||
{
|
|
||||||
size_t i;
|
|
||||||
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
|
|
||||||
enum stop_type
|
|
||||||
{
|
|
||||||
STOP_FULL,
|
STOP_FULL,
|
||||||
STOP_PARTIAL,
|
STOP_PARTIAL,
|
||||||
};
|
};
|
||||||
|
|
||||||
static bool ends_with(const std::string &str, const std::string &suffix)
|
// TODO: can become bool if we can't find use of more states
|
||||||
{
|
enum slot_state {
|
||||||
return str.size() >= suffix.size() &&
|
IDLE,
|
||||||
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
PROCESSING,
|
||||||
}
|
};
|
||||||
|
|
||||||
static size_t find_partial_stop_string(const std::string &stop,
|
enum slot_command {
|
||||||
const std::string &text)
|
NONE,
|
||||||
{
|
LOAD_PROMPT,
|
||||||
if (!text.empty() && !stop.empty())
|
RELEASE,
|
||||||
{
|
};
|
||||||
const char text_last_char = text.back();
|
|
||||||
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
|
|
||||||
{
|
|
||||||
if (stop[char_index] == text_last_char)
|
|
||||||
{
|
|
||||||
const std::string current_partial = stop.substr(0, char_index + 1);
|
|
||||||
if (ends_with(text, current_partial))
|
|
||||||
{
|
|
||||||
return text.size() - char_index - 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return std::string::npos;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: reuse llama_detokenize
|
struct slot_params {
|
||||||
template <class Iter>
|
bool stream = true;
|
||||||
static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
|
bool cache_prompt = false; // remember the prompt to avoid reprocessing the whole prompt
|
||||||
{
|
|
||||||
std::string ret;
|
|
||||||
for (; begin != end; ++begin)
|
|
||||||
{
|
|
||||||
ret += llama_token_to_piece(ctx, *begin);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
// format incomplete utf-8 multibyte character for output
|
uint32_t seed = -1; // RNG seed
|
||||||
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
{
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
|
|
||||||
// if the size is 1 and first bit is 1, meaning it's a partial character
|
|
||||||
// (size > 1 meaning it's already a known token)
|
|
||||||
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
|
||||||
{
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << std::hex << (out[0] & 0xff);
|
|
||||||
std::string res(ss.str());
|
|
||||||
out = "byte: \\x" + res;
|
|
||||||
}
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
// convert a vector of completion_token_output to json
|
std::vector<std::string> antiprompt;
|
||||||
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
|
|
||||||
{
|
|
||||||
json out = json::array();
|
|
||||||
for (const auto &prob : probs)
|
|
||||||
{
|
|
||||||
json probs_for_token = json::array();
|
|
||||||
for (const auto &p : prob.probs)
|
|
||||||
{
|
|
||||||
std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
|
|
||||||
probs_for_token.push_back(json
|
|
||||||
{
|
|
||||||
{"tok_str", tok_str},
|
|
||||||
{"prob", p.prob},
|
|
||||||
});
|
|
||||||
}
|
|
||||||
std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
|
|
||||||
out.push_back(json{
|
|
||||||
{"content", tok_str},
|
|
||||||
{"probs", probs_for_token},
|
|
||||||
});
|
|
||||||
}
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct llama_client_slot
|
json input_prefix;
|
||||||
{
|
json input_suffix;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct slot_image {
|
||||||
|
int32_t id;
|
||||||
|
|
||||||
|
bool request_encode_image = false;
|
||||||
|
float * image_embedding = nullptr;
|
||||||
|
int32_t image_tokens = 0;
|
||||||
|
|
||||||
|
clip_image_u8 * img_data;
|
||||||
|
|
||||||
|
std::string prefix_prompt; // before this image
|
||||||
|
};
|
||||||
|
|
||||||
|
struct server_slot {
|
||||||
int id;
|
int id;
|
||||||
int task_id = -1;
|
int task_id = -1;
|
||||||
|
|
||||||
@ -165,8 +111,8 @@ struct llama_client_slot
|
|||||||
int32_t i_batch = -1;
|
int32_t i_batch = -1;
|
||||||
int32_t n_predict = -1;
|
int32_t n_predict = -1;
|
||||||
|
|
||||||
int32_t num_prompt_tokens = 0;
|
int32_t n_prompt_tokens = 0;
|
||||||
int32_t num_prompt_tokens_processed = 0;
|
int32_t n_prompt_tokens_processed = 0;
|
||||||
|
|
||||||
json prompt;
|
json prompt;
|
||||||
std::string generated_text;
|
std::string generated_text;
|
||||||
@ -201,8 +147,8 @@ struct llama_client_slot
|
|||||||
std::vector<slot_image> images;
|
std::vector<slot_image> images;
|
||||||
|
|
||||||
// stats
|
// stats
|
||||||
size_t sent_count = 0;
|
size_t n_sent_text = 0; // number of sent text characters
|
||||||
size_t sent_token_probs_index = 0;
|
size_t n_sent_token_probs = 0;
|
||||||
|
|
||||||
int64_t t_start_process_prompt;
|
int64_t t_start_process_prompt;
|
||||||
int64_t t_start_genereration;
|
int64_t t_start_genereration;
|
||||||
@ -214,7 +160,7 @@ struct llama_client_slot
|
|||||||
int multitask_id = -1;
|
int multitask_id = -1;
|
||||||
|
|
||||||
void reset() {
|
void reset() {
|
||||||
num_prompt_tokens = 0;
|
n_prompt_tokens = 0;
|
||||||
generated_text = "";
|
generated_text = "";
|
||||||
truncated = false;
|
truncated = false;
|
||||||
stopped_eos = false;
|
stopped_eos = false;
|
||||||
@ -222,16 +168,15 @@ struct llama_client_slot
|
|||||||
stopped_limit = false;
|
stopped_limit = false;
|
||||||
stopping_word = "";
|
stopping_word = "";
|
||||||
n_past = 0;
|
n_past = 0;
|
||||||
sent_count = 0;
|
n_sent_text = 0;
|
||||||
sent_token_probs_index = 0;
|
n_sent_token_probs = 0;
|
||||||
infill = false;
|
infill = false;
|
||||||
ga_i = 0;
|
ga_i = 0;
|
||||||
n_past_se = 0;
|
n_past_se = 0;
|
||||||
|
|
||||||
generated_token_probs.clear();
|
generated_token_probs.clear();
|
||||||
|
|
||||||
for (slot_image & img : images)
|
for (slot_image & img : images) {
|
||||||
{
|
|
||||||
free(img.image_embedding);
|
free(img.image_embedding);
|
||||||
if (img.img_data) {
|
if (img.img_data) {
|
||||||
clip_image_u8_free(img.img_data);
|
clip_image_u8_free(img.img_data);
|
||||||
@ -243,19 +188,15 @@ struct llama_client_slot
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool has_budget(gpt_params &global_params) {
|
bool has_budget(gpt_params &global_params) {
|
||||||
if (params.n_predict == -1 && global_params.n_predict == -1)
|
if (params.n_predict == -1 && global_params.n_predict == -1) {
|
||||||
{
|
|
||||||
return true; // limitless
|
return true; // limitless
|
||||||
}
|
}
|
||||||
|
|
||||||
n_remaining = -1;
|
n_remaining = -1;
|
||||||
|
|
||||||
if (params.n_predict != -1)
|
if (params.n_predict != -1) {
|
||||||
{
|
|
||||||
n_remaining = params.n_predict - n_decoded;
|
n_remaining = params.n_predict - n_decoded;
|
||||||
}
|
} else if (global_params.n_predict != -1) {
|
||||||
else if (global_params.n_predict != -1)
|
|
||||||
{
|
|
||||||
n_remaining = global_params.n_predict - n_decoded;
|
n_remaining = global_params.n_predict - n_decoded;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -271,8 +212,7 @@ struct llama_client_slot
|
|||||||
}
|
}
|
||||||
|
|
||||||
void add_token_string(const completion_token_output &token) {
|
void add_token_string(const completion_token_output &token) {
|
||||||
if (command == RELEASE)
|
if (command == RELEASE) {
|
||||||
{
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
cache_tokens.push_back(token.tok);
|
cache_tokens.push_back(token.tok);
|
||||||
@ -290,10 +230,10 @@ struct llama_client_slot
|
|||||||
json get_formated_timings() {
|
json get_formated_timings() {
|
||||||
return json
|
return json
|
||||||
{
|
{
|
||||||
{"prompt_n", num_prompt_tokens_processed},
|
{"prompt_n", n_prompt_tokens_processed},
|
||||||
{"prompt_ms", t_prompt_processing},
|
{"prompt_ms", t_prompt_processing},
|
||||||
{"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed},
|
{"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed},
|
||||||
{"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed},
|
{"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed},
|
||||||
|
|
||||||
{"predicted_n", n_decoded},
|
{"predicted_n", n_decoded},
|
||||||
{"predicted_ms", t_token_generation},
|
{"predicted_ms", t_token_generation},
|
||||||
@ -304,18 +244,18 @@ struct llama_client_slot
|
|||||||
|
|
||||||
void print_timings() const {
|
void print_timings() const {
|
||||||
char buffer[512];
|
char buffer[512];
|
||||||
double t_token = t_prompt_processing / num_prompt_tokens_processed;
|
double t_token = t_prompt_processing / n_prompt_tokens_processed;
|
||||||
double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed;
|
double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed;
|
||||||
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
|
||||||
t_prompt_processing, num_prompt_tokens_processed,
|
t_prompt_processing, n_prompt_tokens_processed,
|
||||||
t_token, n_tokens_second);
|
t_token, n_tokens_second);
|
||||||
LOG_INFO(buffer, {
|
LOG_INFO(buffer, {
|
||||||
{"slot_id", id},
|
{"slot_id", id},
|
||||||
{"task_id", task_id},
|
{"task_id", task_id},
|
||||||
{"t_prompt_processing", t_prompt_processing},
|
{"t_prompt_processing", t_prompt_processing},
|
||||||
{"num_prompt_tokens_processed", num_prompt_tokens_processed},
|
{"n_prompt_tokens_processed", n_prompt_tokens_processed},
|
||||||
{"t_token", t_token},
|
{"t_token", t_token},
|
||||||
{"n_tokens_second", n_tokens_second},
|
{"n_tokens_second", n_tokens_second},
|
||||||
});
|
});
|
||||||
|
|
||||||
t_token = t_token_generation / n_decoded;
|
t_token = t_token_generation / n_decoded;
|
||||||
@ -343,7 +283,7 @@ struct llama_client_slot
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_metrics {
|
struct server_metrics {
|
||||||
uint64_t n_prompt_tokens_processed_total = 0;
|
uint64_t n_prompt_tokens_processed_total = 0;
|
||||||
uint64_t n_tokens_predicted_total = 0;
|
uint64_t n_tokens_predicted_total = 0;
|
||||||
|
|
||||||
@ -354,18 +294,16 @@ struct llama_metrics {
|
|||||||
uint64_t t_tokens_generation = 0;
|
uint64_t t_tokens_generation = 0;
|
||||||
|
|
||||||
|
|
||||||
void on_prompt_eval(const llama_client_slot &slot) {
|
void on_prompt_eval(const server_slot &slot) {
|
||||||
n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed;
|
n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed;
|
||||||
|
n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
|
||||||
n_prompt_tokens_processed += slot.num_prompt_tokens_processed;
|
t_prompt_processing += slot.t_prompt_processing;
|
||||||
t_prompt_processing += slot.t_prompt_processing;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void on_prediction(const llama_client_slot &slot) {
|
void on_prediction(const server_slot &slot) {
|
||||||
n_tokens_predicted_total += slot.n_decoded;
|
n_tokens_predicted_total += slot.n_decoded;
|
||||||
|
n_tokens_predicted += slot.n_decoded;
|
||||||
n_tokens_predicted += slot.n_decoded;
|
t_tokens_generation += slot.t_token_generation;
|
||||||
t_tokens_generation += slot.t_token_generation;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void reset_bucket() {
|
void reset_bucket() {
|
||||||
@ -404,13 +342,13 @@ struct llama_server_context
|
|||||||
std::string name_assistant;
|
std::string name_assistant;
|
||||||
|
|
||||||
// slots / clients
|
// slots / clients
|
||||||
std::vector<llama_client_slot> slots;
|
std::vector<server_slot> slots;
|
||||||
json default_generation_settings_for_props;
|
json default_generation_settings_for_props;
|
||||||
|
|
||||||
llama_server_queue queue_tasks;
|
llama_server_queue queue_tasks;
|
||||||
llama_server_response queue_results;
|
llama_server_response queue_results;
|
||||||
|
|
||||||
llama_metrics metrics;
|
server_metrics metrics;
|
||||||
|
|
||||||
~llama_server_context()
|
~llama_server_context()
|
||||||
{
|
{
|
||||||
@ -487,7 +425,7 @@ struct llama_server_context
|
|||||||
LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
|
LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
|
||||||
for (int i = 0; i < params.n_parallel; i++)
|
for (int i = 0; i < params.n_parallel; i++)
|
||||||
{
|
{
|
||||||
llama_client_slot slot;
|
server_slot slot;
|
||||||
|
|
||||||
slot.id = i;
|
slot.id = i;
|
||||||
slot.n_ctx = n_ctx_slot;
|
slot.n_ctx = n_ctx_slot;
|
||||||
@ -579,11 +517,11 @@ struct llama_server_context
|
|||||||
return prompt_tokens;
|
return prompt_tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_client_slot* get_slot(int id) {
|
server_slot* get_slot(int id) {
|
||||||
int64_t t_last = ggml_time_us();
|
int64_t t_last = ggml_time_us();
|
||||||
llama_client_slot *last_used = nullptr;
|
server_slot *last_used = nullptr;
|
||||||
|
|
||||||
for (llama_client_slot & slot : slots)
|
for (server_slot & slot : slots)
|
||||||
{
|
{
|
||||||
if (slot.id == id && slot.available())
|
if (slot.id == id && slot.available())
|
||||||
{
|
{
|
||||||
@ -600,7 +538,7 @@ struct llama_server_context
|
|||||||
return last_used;
|
return last_used;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
|
bool launch_slot_with_data(server_slot* &slot, json data) {
|
||||||
slot_params default_params;
|
slot_params default_params;
|
||||||
llama_sampling_params default_sparams;
|
llama_sampling_params default_sparams;
|
||||||
|
|
||||||
@ -888,7 +826,7 @@ struct llama_server_context
|
|||||||
clean_kv_cache = false;
|
clean_kv_cache = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void update_system_prompt() {
|
void system_prompt_update() {
|
||||||
kv_cache_clear();
|
kv_cache_clear();
|
||||||
system_tokens.clear();
|
system_tokens.clear();
|
||||||
|
|
||||||
@ -933,9 +871,9 @@ struct llama_server_context
|
|||||||
system_need_update = false;
|
system_need_update = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void notify_system_prompt_changed() {
|
void system_prompt_notify() {
|
||||||
// release all slots
|
// release all slots
|
||||||
for (llama_client_slot &slot : slots)
|
for (server_slot &slot : slots)
|
||||||
{
|
{
|
||||||
slot.release();
|
slot.release();
|
||||||
}
|
}
|
||||||
@ -943,17 +881,17 @@ struct llama_server_context
|
|||||||
system_need_update = true;
|
system_need_update = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void process_system_prompt_data(const json &sys_props) {
|
void system_prompt_process(const json &sys_props) {
|
||||||
system_prompt = sys_props.value("prompt", "");
|
system_prompt = sys_props.value("prompt", "");
|
||||||
name_user = sys_props.value("anti_prompt", "");
|
name_user = sys_props.value("anti_prompt", "");
|
||||||
name_assistant = sys_props.value("assistant_name", "");
|
name_assistant = sys_props.value("assistant_name", "");
|
||||||
|
|
||||||
|
|
||||||
notify_system_prompt_changed();
|
system_prompt_notify();
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
|
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
|
||||||
const stop_type type, llama_client_slot &slot)
|
const stop_type type, server_slot &slot)
|
||||||
{
|
{
|
||||||
size_t stop_pos = std::string::npos;
|
size_t stop_pos = std::string::npos;
|
||||||
|
|
||||||
@ -975,8 +913,8 @@ struct llama_server_context
|
|||||||
{
|
{
|
||||||
if (type == STOP_FULL)
|
if (type == STOP_FULL)
|
||||||
{
|
{
|
||||||
slot.stopped_word = true;
|
slot.stopped_word = true;
|
||||||
slot.stopping_word = word;
|
slot.stopping_word = word;
|
||||||
slot.has_next_token = false;
|
slot.has_next_token = false;
|
||||||
}
|
}
|
||||||
stop_pos = pos;
|
stop_pos = pos;
|
||||||
@ -986,7 +924,7 @@ struct llama_server_context
|
|||||||
return stop_pos;
|
return stop_pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool process_token(completion_token_output &result, llama_client_slot &slot) {
|
bool process_token(completion_token_output &result, server_slot &slot) {
|
||||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||||
const std::string token_str = llama_token_to_piece(ctx, result.tok);
|
const std::string token_str = llama_token_to_piece(ctx, result.tok);
|
||||||
slot.sampled = result.tok;
|
slot.sampled = result.tok;
|
||||||
@ -1032,7 +970,7 @@ struct llama_server_context
|
|||||||
|
|
||||||
if (!incomplete)
|
if (!incomplete)
|
||||||
{
|
{
|
||||||
size_t pos = std::min(slot.sent_count, slot.generated_text.size());
|
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
|
||||||
const std::string str_test = slot.generated_text.substr(pos);
|
const std::string str_test = slot.generated_text.substr(pos);
|
||||||
bool is_stop_full = false;
|
bool is_stop_full = false;
|
||||||
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
|
size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot);
|
||||||
@ -1042,7 +980,7 @@ struct llama_server_context
|
|||||||
slot.generated_text.erase(
|
slot.generated_text.erase(
|
||||||
slot.generated_text.begin() + pos + stop_pos,
|
slot.generated_text.begin() + pos + stop_pos,
|
||||||
slot.generated_text.end());
|
slot.generated_text.end());
|
||||||
pos = std::min(slot.sent_count, slot.generated_text.size());
|
pos = std::min(slot.n_sent_text, slot.generated_text.size());
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -1055,7 +993,7 @@ struct llama_server_context
|
|||||||
{
|
{
|
||||||
// do not send the stop word in the response
|
// do not send the stop word in the response
|
||||||
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
|
result.text_to_send = slot.generated_text.substr(pos, std::string::npos);
|
||||||
slot.sent_count += result.text_to_send.size();
|
slot.n_sent_text += result.text_to_send.size();
|
||||||
// add the token to slot queue and cache
|
// add the token to slot queue and cache
|
||||||
}
|
}
|
||||||
slot.add_token_string(result);
|
slot.add_token_string(result);
|
||||||
@ -1099,7 +1037,7 @@ struct llama_server_context
|
|||||||
return slot.has_next_token; // continue
|
return slot.has_next_token; // continue
|
||||||
}
|
}
|
||||||
|
|
||||||
bool process_images(llama_client_slot &slot) const
|
bool process_images(server_slot &slot) const
|
||||||
{
|
{
|
||||||
for (slot_image &img : slot.images)
|
for (slot_image &img : slot.images)
|
||||||
{
|
{
|
||||||
@ -1132,7 +1070,7 @@ struct llama_server_context
|
|||||||
queue_results.send(res);
|
queue_results.send(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
json get_formated_generation(llama_client_slot &slot)
|
json get_formated_generation(server_slot &slot)
|
||||||
{
|
{
|
||||||
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
|
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
|
||||||
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
|
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
|
||||||
@ -1179,7 +1117,7 @@ struct llama_server_context
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
void send_partial_response(llama_client_slot &slot, completion_token_output tkn)
|
void send_partial_response(server_slot &slot, completion_token_output tkn)
|
||||||
{
|
{
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = slot.task_id;
|
res.id = slot.task_id;
|
||||||
@ -1199,13 +1137,13 @@ struct llama_server_context
|
|||||||
{
|
{
|
||||||
std::vector<completion_token_output> probs_output = {};
|
std::vector<completion_token_output> probs_output = {};
|
||||||
const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
|
const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
|
||||||
size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
|
size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
|
||||||
size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
|
size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
|
||||||
if (probs_pos < probs_stop_pos)
|
if (probs_pos < probs_stop_pos)
|
||||||
{
|
{
|
||||||
probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
|
probs_output = std::vector<completion_token_output>(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos);
|
||||||
}
|
}
|
||||||
slot.sent_token_probs_index = probs_stop_pos;
|
slot.n_sent_token_probs = probs_stop_pos;
|
||||||
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
|
res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1218,7 +1156,7 @@ struct llama_server_context
|
|||||||
queue_results.send(res);
|
queue_results.send(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
void send_final_response(llama_client_slot &slot)
|
void send_final_response(server_slot &slot)
|
||||||
{
|
{
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = slot.task_id;
|
res.id = slot.task_id;
|
||||||
@ -1233,7 +1171,7 @@ struct llama_server_context
|
|||||||
{"stop", true},
|
{"stop", true},
|
||||||
{"model", params.model_alias},
|
{"model", params.model_alias},
|
||||||
{"tokens_predicted", slot.n_decoded},
|
{"tokens_predicted", slot.n_decoded},
|
||||||
{"tokens_evaluated", slot.num_prompt_tokens},
|
{"tokens_evaluated", slot.n_prompt_tokens},
|
||||||
{"generation_settings", get_formated_generation(slot)},
|
{"generation_settings", get_formated_generation(slot)},
|
||||||
{"prompt", slot.prompt},
|
{"prompt", slot.prompt},
|
||||||
{"truncated", slot.truncated},
|
{"truncated", slot.truncated},
|
||||||
@ -1271,7 +1209,7 @@ struct llama_server_context
|
|||||||
queue_results.send(res);
|
queue_results.send(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
void send_embedding(llama_client_slot &slot)
|
void send_embedding(server_slot &slot)
|
||||||
{
|
{
|
||||||
task_result res;
|
task_result res;
|
||||||
res.id = slot.task_id;
|
res.id = slot.task_id;
|
||||||
@ -1282,9 +1220,7 @@ struct llama_server_context
|
|||||||
const int n_embd = llama_n_embd(model);
|
const int n_embd = llama_n_embd(model);
|
||||||
if (!params.embedding)
|
if (!params.embedding)
|
||||||
{
|
{
|
||||||
LOG_WARNING("embedding disabled", {
|
LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}});
|
||||||
{"params.embedding", params.embedding},
|
|
||||||
});
|
|
||||||
res.result_json = json
|
res.result_json = json
|
||||||
{
|
{
|
||||||
{"embedding", std::vector<float>(n_embd, 0.0f)},
|
{"embedding", std::vector<float>(n_embd, 0.0f)},
|
||||||
@ -1296,7 +1232,7 @@ struct llama_server_context
|
|||||||
std::vector<float> embedding(data, data + n_embd);
|
std::vector<float> embedding(data, data + n_embd);
|
||||||
res.result_json = json
|
res.result_json = json
|
||||||
{
|
{
|
||||||
{"embedding", embedding },
|
{"embedding", embedding},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
queue_results.send(res);
|
queue_results.send(res);
|
||||||
@ -1345,7 +1281,7 @@ struct llama_server_context
|
|||||||
}
|
}
|
||||||
|
|
||||||
// for multiple images processing
|
// for multiple images processing
|
||||||
bool ingest_images(llama_client_slot &slot, int n_batch)
|
bool ingest_images(server_slot &slot, int n_batch)
|
||||||
{
|
{
|
||||||
int image_idx = 0;
|
int image_idx = 0;
|
||||||
|
|
||||||
@ -1384,7 +1320,17 @@ struct llama_server_context
|
|||||||
}
|
}
|
||||||
|
|
||||||
const int n_embd = llama_n_embd(model);
|
const int n_embd = llama_n_embd(model);
|
||||||
llama_batch batch_img = { n_eval, nullptr, (img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, };
|
llama_batch batch_img = {
|
||||||
|
n_eval,
|
||||||
|
nullptr,
|
||||||
|
(img.image_embedding + i * n_embd),
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
slot.n_past,
|
||||||
|
1, 0
|
||||||
|
};
|
||||||
if (llama_decode(ctx, batch_img))
|
if (llama_decode(ctx, batch_img))
|
||||||
{
|
{
|
||||||
LOG_TEE("%s : failed to eval image\n", __func__);
|
LOG_TEE("%s : failed to eval image\n", __func__);
|
||||||
@ -1454,7 +1400,7 @@ struct llama_server_context
|
|||||||
switch (task.type)
|
switch (task.type)
|
||||||
{
|
{
|
||||||
case TASK_TYPE_COMPLETION: {
|
case TASK_TYPE_COMPLETION: {
|
||||||
llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
|
server_slot *slot = get_slot(json_value(task.data, "slot_id", -1));
|
||||||
if (slot == nullptr)
|
if (slot == nullptr)
|
||||||
{
|
{
|
||||||
// if no slot is available, we defer this task for processing later
|
// if no slot is available, we defer this task for processing later
|
||||||
@ -1469,10 +1415,10 @@ struct llama_server_context
|
|||||||
send_error(task, "system prompt can only be updated when all slots are idle");
|
send_error(task, "system prompt can only be updated when all slots are idle");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
process_system_prompt_data(task.data["system_prompt"]);
|
system_prompt_process(task.data["system_prompt"]);
|
||||||
|
|
||||||
// reset cache_tokens for all slots
|
// reset cache_tokens for all slots
|
||||||
for (llama_client_slot &slot : slots)
|
for (server_slot &slot : slots)
|
||||||
{
|
{
|
||||||
slot.cache_tokens.clear();
|
slot.cache_tokens.clear();
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
@ -1512,20 +1458,20 @@ struct llama_server_context
|
|||||||
int n_idle_slots = 0;
|
int n_idle_slots = 0;
|
||||||
int n_processing_slots = 0;
|
int n_processing_slots = 0;
|
||||||
|
|
||||||
for (llama_client_slot &slot: slots) {
|
for (server_slot &slot: slots) {
|
||||||
json slot_data = get_formated_generation(slot);
|
json slot_data = get_formated_generation(slot);
|
||||||
slot_data["id"] = slot.id;
|
slot_data["id"] = slot.id;
|
||||||
slot_data["task_id"] = slot.task_id;
|
slot_data["task_id"] = slot.task_id;
|
||||||
slot_data["state"] = slot.state;
|
slot_data["state"] = slot.state;
|
||||||
slot_data["prompt"] = slot.prompt;
|
slot_data["prompt"] = slot.prompt;
|
||||||
slot_data["next_token"] = {
|
slot_data["next_token"] = {
|
||||||
{"has_next_token", slot.has_next_token},
|
{"has_next_token", slot.has_next_token},
|
||||||
{"n_remain", slot.n_remaining},
|
{"n_remain", slot.n_remaining},
|
||||||
{"num_tokens_predicted", slot.n_decoded},
|
{"num_tokens_predicted", slot.n_decoded},
|
||||||
{"stopped_eos", slot.stopped_eos},
|
{"stopped_eos", slot.stopped_eos},
|
||||||
{"stopped_word", slot.stopped_word},
|
{"stopped_word", slot.stopped_word},
|
||||||
{"stopped_limit", slot.stopped_limit},
|
{"stopped_limit", slot.stopped_limit},
|
||||||
{"stopping_word", slot.stopping_word},
|
{"stopping_word", slot.stopping_word},
|
||||||
};
|
};
|
||||||
if (slot_data["state"] == IDLE) {
|
if (slot_data["state"] == IDLE) {
|
||||||
n_idle_slots++;
|
n_idle_slots++;
|
||||||
@ -1563,10 +1509,10 @@ struct llama_server_context
|
|||||||
{ "n_tokens_predicted", metrics.n_tokens_predicted},
|
{ "n_tokens_predicted", metrics.n_tokens_predicted},
|
||||||
{ "t_tokens_generation", metrics.t_tokens_generation},
|
{ "t_tokens_generation", metrics.t_tokens_generation},
|
||||||
|
|
||||||
{ "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
|
{ "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)},
|
||||||
{ "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
|
{ "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)},
|
||||||
|
|
||||||
{ "slots", slots_data },
|
{ "slots", slots_data },
|
||||||
};
|
};
|
||||||
metrics.reset_bucket();
|
metrics.reset_bucket();
|
||||||
queue_results.send(res);
|
queue_results.send(res);
|
||||||
@ -1597,7 +1543,7 @@ struct llama_server_context
|
|||||||
if (system_need_update)
|
if (system_need_update)
|
||||||
{
|
{
|
||||||
LOG_INFO("updating system prompt", {});
|
LOG_INFO("updating system prompt", {});
|
||||||
update_system_prompt();
|
system_prompt_update();
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch_clear(batch);
|
llama_batch_clear(batch);
|
||||||
@ -1618,7 +1564,7 @@ struct llama_server_context
|
|||||||
task.target_id = -1;
|
task.target_id = -1;
|
||||||
queue_tasks.post(task);
|
queue_tasks.post(task);
|
||||||
|
|
||||||
for (llama_client_slot &slot : slots)
|
for (server_slot &slot : slots)
|
||||||
{
|
{
|
||||||
if (slot.ga_n == 1)
|
if (slot.ga_n == 1)
|
||||||
{
|
{
|
||||||
@ -1754,45 +1700,50 @@ struct llama_server_context
|
|||||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there is no system prompt
|
prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there is no system prompt
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.num_prompt_tokens = prompt_tokens.size();
|
slot.n_prompt_tokens = prompt_tokens.size();
|
||||||
|
|
||||||
if (slot.params.n_keep < 0)
|
if (slot.params.n_keep < 0)
|
||||||
{
|
{
|
||||||
slot.params.n_keep = slot.num_prompt_tokens;
|
slot.params.n_keep = slot.n_prompt_tokens;
|
||||||
}
|
}
|
||||||
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
||||||
|
|
||||||
// if input prompt is too big, truncate it
|
// if input prompt is too big, truncate it
|
||||||
if (slot.num_prompt_tokens >= slot.n_ctx)
|
if (slot.n_prompt_tokens >= slot.n_ctx)
|
||||||
{
|
{
|
||||||
const int n_left = slot.n_ctx - slot.params.n_keep;
|
const int n_left = slot.n_ctx - slot.params.n_keep;
|
||||||
const int n_block_size = n_left / 2;
|
const int n_block_size = n_left / 2;
|
||||||
const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
|
const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
|
||||||
|
|
||||||
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep);
|
std::vector<llama_token> new_tokens(
|
||||||
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
|
prompt_tokens.begin(),
|
||||||
|
prompt_tokens.begin() + slot.params.n_keep);
|
||||||
|
new_tokens.insert(
|
||||||
|
new_tokens.end(),
|
||||||
|
prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size,
|
||||||
|
prompt_tokens.end());
|
||||||
|
|
||||||
LOG_VERBOSE("input truncated", {
|
LOG_VERBOSE("input truncated", {
|
||||||
{"n_ctx", slot.n_ctx},
|
{"n_ctx", slot.n_ctx},
|
||||||
{"n_keep", slot.params.n_keep},
|
{"n_keep", slot.params.n_keep},
|
||||||
{"n_left", n_left},
|
{"n_left", n_left},
|
||||||
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
||||||
});
|
});
|
||||||
slot.truncated = true;
|
slot.truncated = true;
|
||||||
prompt_tokens = new_tokens;
|
prompt_tokens = new_tokens;
|
||||||
|
|
||||||
slot.num_prompt_tokens = prompt_tokens.size();
|
slot.n_prompt_tokens = prompt_tokens.size();
|
||||||
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
|
GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!slot.params.cache_prompt)
|
if (!slot.params.cache_prompt)
|
||||||
{
|
{
|
||||||
llama_sampling_reset(slot.ctx_sampling);
|
llama_sampling_reset(slot.ctx_sampling);
|
||||||
|
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
slot.n_past_se = 0;
|
slot.n_past_se = 0;
|
||||||
slot.ga_i = 0;
|
slot.ga_i = 0;
|
||||||
slot.num_prompt_tokens_processed = slot.num_prompt_tokens;
|
slot.n_prompt_tokens_processed = slot.n_prompt_tokens;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -1811,7 +1762,7 @@ struct llama_server_context
|
|||||||
slot.n_past -= 1;
|
slot.n_past -= 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
|
slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past;
|
||||||
|
|
||||||
if (slot.ga_n != 1)
|
if (slot.ga_n != 1)
|
||||||
{
|
{
|
||||||
@ -1836,13 +1787,13 @@ struct llama_server_context
|
|||||||
{ "slot_id", slot.id },
|
{ "slot_id", slot.id },
|
||||||
{ "task_id", slot.task_id },
|
{ "task_id", slot.task_id },
|
||||||
{ "n_past", slot.n_past },
|
{ "n_past", slot.n_past },
|
||||||
{ "num_prompt_tokens_processed", slot.num_prompt_tokens_processed }
|
{ "n_prompt_tokens_processed", slot.n_prompt_tokens_processed }
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.cache_tokens = prompt_tokens;
|
slot.cache_tokens = prompt_tokens;
|
||||||
|
|
||||||
if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0)
|
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
|
||||||
{
|
{
|
||||||
// we have to evaluate at least 1 token to generate logits.
|
// we have to evaluate at least 1 token to generate logits.
|
||||||
LOG_INFO("we have to evaluate at least 1 token to generate logits", {
|
LOG_INFO("we have to evaluate at least 1 token to generate logits", {
|
||||||
@ -1898,8 +1849,8 @@ struct llama_server_context
|
|||||||
if (has_images && !ingest_images(slot, n_batch))
|
if (has_images && !ingest_images(slot, n_batch))
|
||||||
{
|
{
|
||||||
LOG_ERROR("failed processing images", {
|
LOG_ERROR("failed processing images", {
|
||||||
"slot_id", slot.id,
|
{"slot_id", slot.id},
|
||||||
"task_id", slot.task_id,
|
{"task_id", slot.task_id},
|
||||||
});
|
});
|
||||||
// FIXME @phymbert: to be properly tested
|
// FIXME @phymbert: to be properly tested
|
||||||
// early returning without changing the slot state will block the slot for ever
|
// early returning without changing the slot state will block the slot for ever
|
||||||
@ -2049,10 +2000,6 @@ struct llama_server_context
|
|||||||
LOG_VERBOSE("slots updated", {});
|
LOG_VERBOSE("slots updated", {});
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void run_on_all_tasks_finished() {
|
|
||||||
update_slots();
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||||
@ -2561,7 +2508,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||||||
std::istreambuf_iterator<char>(),
|
std::istreambuf_iterator<char>(),
|
||||||
std::back_inserter(systm_content)
|
std::back_inserter(systm_content)
|
||||||
);
|
);
|
||||||
llama.process_system_prompt_data(json::parse(systm_content));
|
llama.system_prompt_process(json::parse(systm_content));
|
||||||
}
|
}
|
||||||
else if (arg == "-ctk" || arg == "--cache-type-k") {
|
else if (arg == "-ctk" || arg == "--cache-type-k") {
|
||||||
params.cache_type_k = argv[++i];
|
params.cache_type_k = argv[++i];
|
||||||
@ -2692,7 +2639,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||||||
|
|
||||||
/* llama.cpp completion api semantics */
|
/* llama.cpp completion api semantics */
|
||||||
static json format_partial_response(
|
static json format_partial_response(
|
||||||
llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
|
llama_server_context &llama, server_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
|
||||||
) {
|
) {
|
||||||
json res = json
|
json res = json
|
||||||
{
|
{
|
||||||
@ -2748,14 +2695,7 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
struct token_translator
|
static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, server_slot *slot)
|
||||||
{
|
|
||||||
llama_context * ctx;
|
|
||||||
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
|
|
||||||
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
|
|
||||||
};
|
|
||||||
|
|
||||||
static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot)
|
|
||||||
{
|
{
|
||||||
auto & gtps = slot->generated_token_probs;
|
auto & gtps = slot->generated_token_probs;
|
||||||
auto translator = token_translator{llama.ctx};
|
auto translator = token_translator{llama.ctx};
|
||||||
@ -3526,8 +3466,8 @@ int main(int argc, char **argv)
|
|||||||
&llama_server_context::process_single_task, &llama, std::placeholders::_1));
|
&llama_server_context::process_single_task, &llama, std::placeholders::_1));
|
||||||
llama.queue_tasks.on_finish_multitask(std::bind(
|
llama.queue_tasks.on_finish_multitask(std::bind(
|
||||||
&llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
|
&llama_server_context::on_finish_multitask, &llama, std::placeholders::_1));
|
||||||
llama.queue_tasks.on_all_tasks_finished(std::bind(
|
llama.queue_tasks.on_run_slots(std::bind(
|
||||||
&llama_server_context::run_on_all_tasks_finished, &llama));
|
&llama_server_context::update_slots, &llama));
|
||||||
llama.queue_results.on_multitask_update(std::bind(
|
llama.queue_results.on_multitask_update(std::bind(
|
||||||
&llama_server_queue::update_multitask,
|
&llama_server_queue::update_multitask,
|
||||||
&llama.queue_tasks,
|
&llama.queue_tasks,
|
||||||
|
@ -37,10 +37,6 @@ extern bool server_log_json;
|
|||||||
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
|
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||||
|
|
||||||
//
|
|
||||||
// parallel
|
|
||||||
//
|
|
||||||
|
|
||||||
enum server_state {
|
enum server_state {
|
||||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
||||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
SERVER_STATE_READY, // Server is ready and model is loaded
|
||||||
@ -78,51 +74,8 @@ struct task_multi {
|
|||||||
std::vector<task_result> results{};
|
std::vector<task_result> results{};
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: can become bool if we can't find use of more states
|
|
||||||
enum slot_state
|
|
||||||
{
|
|
||||||
IDLE,
|
|
||||||
PROCESSING,
|
|
||||||
};
|
|
||||||
|
|
||||||
enum slot_command
|
|
||||||
{
|
|
||||||
NONE,
|
|
||||||
LOAD_PROMPT,
|
|
||||||
RELEASE,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct slot_params
|
|
||||||
{
|
|
||||||
bool stream = true;
|
|
||||||
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
|
||||||
|
|
||||||
uint32_t seed = -1; // RNG seed
|
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
|
||||||
|
|
||||||
std::vector<std::string> antiprompt;
|
|
||||||
|
|
||||||
json input_prefix;
|
|
||||||
json input_suffix;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct slot_image
|
|
||||||
{
|
|
||||||
int32_t id;
|
|
||||||
|
|
||||||
bool request_encode_image = false;
|
|
||||||
float * image_embedding = nullptr;
|
|
||||||
int32_t image_tokens = 0;
|
|
||||||
|
|
||||||
clip_image_u8 * img_data;
|
|
||||||
|
|
||||||
std::string prefix_prompt; // before of this image
|
|
||||||
};
|
|
||||||
|
|
||||||
// completion token output with probabilities
|
// completion token output with probabilities
|
||||||
struct completion_token_output
|
struct completion_token_output {
|
||||||
{
|
|
||||||
struct token_prob
|
struct token_prob
|
||||||
{
|
{
|
||||||
llama_token tok;
|
llama_token tok;
|
||||||
@ -134,8 +87,13 @@ struct completion_token_output
|
|||||||
std::string text_to_send;
|
std::string text_to_send;
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra)
|
struct token_translator {
|
||||||
{
|
llama_context * ctx;
|
||||||
|
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
|
||||||
|
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
|
||||||
std::stringstream ss_tid;
|
std::stringstream ss_tid;
|
||||||
ss_tid << std::this_thread::get_id();
|
ss_tid << std::this_thread::get_id();
|
||||||
json log = nlohmann::ordered_json{
|
json log = nlohmann::ordered_json{
|
||||||
@ -183,8 +141,7 @@ static inline void server_log(const char *level, const char *function, int line,
|
|||||||
//
|
//
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
static T json_value(const json &body, const std::string &key, const T &default_value)
|
static T json_value(const json &body, const std::string &key, const T &default_value) {
|
||||||
{
|
|
||||||
// Fallback null to default value
|
// Fallback null to default value
|
||||||
return body.contains(key) && !body.at(key).is_null()
|
return body.contains(key) && !body.at(key).is_null()
|
||||||
? body.value(key, default_value)
|
? body.value(key, default_value)
|
||||||
@ -200,8 +157,7 @@ inline bool verify_custom_template(const std::string & tmpl) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Format given chat. If tmpl is empty, we take the template from model metadata
|
// Format given chat. If tmpl is empty, we take the template from model metadata
|
||||||
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages)
|
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
|
||||||
{
|
|
||||||
size_t alloc_size = 0;
|
size_t alloc_size = 0;
|
||||||
// vector holding all allocated string to be passed to llama_chat_apply_template
|
// vector holding all allocated string to be passed to llama_chat_apply_template
|
||||||
std::vector<std::string> str(messages.size() * 2);
|
std::vector<std::string> str(messages.size() * 2);
|
||||||
@ -250,7 +206,7 @@ struct llama_server_queue {
|
|||||||
// callback functions
|
// callback functions
|
||||||
std::function<void(task_server&)> callback_new_task;
|
std::function<void(task_server&)> callback_new_task;
|
||||||
std::function<void(task_multi&)> callback_finish_multitask;
|
std::function<void(task_multi&)> callback_finish_multitask;
|
||||||
std::function<void(void)> callback_all_task_finished;
|
std::function<void(void)> callback_run_slots;
|
||||||
|
|
||||||
// Add a new task to the end of the queue
|
// Add a new task to the end of the queue
|
||||||
int post(task_server task) {
|
int post(task_server task) {
|
||||||
@ -283,14 +239,14 @@ struct llama_server_queue {
|
|||||||
callback_new_task = callback;
|
callback_new_task = callback;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Register function to process a multitask
|
// Register function to process a multitask when it is finished
|
||||||
void on_finish_multitask(std::function<void(task_multi&)> callback) {
|
void on_finish_multitask(std::function<void(task_multi&)> callback) {
|
||||||
callback_finish_multitask = callback;
|
callback_finish_multitask = callback;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Register the function to be called when the batch of tasks is finished
|
// Register the function to be called when all slots data is ready to be processed
|
||||||
void on_all_tasks_finished(std::function<void(void)> callback) {
|
void on_run_slots(std::function<void(void)> callback) {
|
||||||
callback_all_task_finished = callback;
|
callback_run_slots = callback;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Call when the state of one slot is changed
|
// Call when the state of one slot is changed
|
||||||
@ -312,7 +268,13 @@ struct llama_server_queue {
|
|||||||
condition_tasks.notify_all();
|
condition_tasks.notify_all();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Start the main loop.
|
/**
|
||||||
|
* Main loop consists of these steps:
|
||||||
|
* - Wait until a new task arrives
|
||||||
|
* - Process the task (i.e. maybe copy data into slot)
|
||||||
|
* - Check if multitask is finished
|
||||||
|
* - Run all slots
|
||||||
|
*/
|
||||||
void start_loop() {
|
void start_loop() {
|
||||||
running = true;
|
running = true;
|
||||||
while (true) {
|
while (true) {
|
||||||
@ -331,8 +293,8 @@ struct llama_server_queue {
|
|||||||
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
|
LOG_VERBOSE("callback_new_task", {{"task_id", task.id}});
|
||||||
callback_new_task(task);
|
callback_new_task(task);
|
||||||
}
|
}
|
||||||
LOG_VERBOSE("callback_all_task_finished", {});
|
LOG_VERBOSE("update_multitasks", {});
|
||||||
// process and update all the multitasks
|
// check if we have any finished multitasks
|
||||||
auto queue_iterator = queue_multitasks.begin();
|
auto queue_iterator = queue_multitasks.begin();
|
||||||
while (queue_iterator != queue_multitasks.end())
|
while (queue_iterator != queue_multitasks.end())
|
||||||
{
|
{
|
||||||
@ -349,8 +311,9 @@ struct llama_server_queue {
|
|||||||
++queue_iterator;
|
++queue_iterator;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// all tasks in the current loop is finished
|
// all tasks in the current loop is processed, slots data is now ready
|
||||||
callback_all_task_finished();
|
LOG_VERBOSE("callback_run_slots", {});
|
||||||
|
callback_run_slots();
|
||||||
}
|
}
|
||||||
LOG_VERBOSE("wait for new task", {});
|
LOG_VERBOSE("wait for new task", {});
|
||||||
// wait for new task
|
// wait for new task
|
||||||
@ -408,12 +371,14 @@ struct llama_server_response {
|
|||||||
std::mutex mutex_results;
|
std::mutex mutex_results;
|
||||||
std::condition_variable condition_results;
|
std::condition_variable condition_results;
|
||||||
|
|
||||||
|
// add the task_id to the list of tasks waiting for response
|
||||||
void add_waiting_task_id(int task_id) {
|
void add_waiting_task_id(int task_id) {
|
||||||
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
|
LOG_VERBOSE("waiting for task id", {{"task_id", task_id}});
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
std::unique_lock<std::mutex> lock(mutex_results);
|
||||||
waiting_task_ids.insert(task_id);
|
waiting_task_ids.insert(task_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// when the request is finished, we can remove task associated with it
|
||||||
void remove_waiting_task_id(int task_id) {
|
void remove_waiting_task_id(int task_id) {
|
||||||
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
|
LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}});
|
||||||
std::unique_lock<std::mutex> lock(mutex_results);
|
std::unique_lock<std::mutex> lock(mutex_results);
|
||||||
@ -574,3 +539,96 @@ static std::string gen_chatcmplid()
|
|||||||
chatcmplid << "chatcmpl-" << random_string();
|
chatcmplid << "chatcmpl-" << random_string();
|
||||||
return chatcmplid.str();
|
return chatcmplid.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// other common utils
|
||||||
|
//
|
||||||
|
|
||||||
|
static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
|
||||||
|
{
|
||||||
|
size_t i;
|
||||||
|
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ends_with(const std::string &str, const std::string &suffix)
|
||||||
|
{
|
||||||
|
return str.size() >= suffix.size() &&
|
||||||
|
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t find_partial_stop_string(const std::string &stop,
|
||||||
|
const std::string &text)
|
||||||
|
{
|
||||||
|
if (!text.empty() && !stop.empty())
|
||||||
|
{
|
||||||
|
const char text_last_char = text.back();
|
||||||
|
for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--)
|
||||||
|
{
|
||||||
|
if (stop[char_index] == text_last_char)
|
||||||
|
{
|
||||||
|
const std::string current_partial = stop.substr(0, char_index + 1);
|
||||||
|
if (ends_with(text, current_partial))
|
||||||
|
{
|
||||||
|
return text.size() - char_index - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return std::string::npos;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: reuse llama_detokenize
|
||||||
|
template <class Iter>
|
||||||
|
static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
|
||||||
|
{
|
||||||
|
std::string ret;
|
||||||
|
for (; begin != end; ++begin)
|
||||||
|
{
|
||||||
|
ret += llama_token_to_piece(ctx, *begin);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
// format incomplete utf-8 multibyte character for output
|
||||||
|
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
||||||
|
{
|
||||||
|
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
|
||||||
|
// if the size is 1 and first bit is 1, meaning it's a partial character
|
||||||
|
// (size > 1 meaning it's already a known token)
|
||||||
|
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
||||||
|
{
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << std::hex << (out[0] & 0xff);
|
||||||
|
std::string res(ss.str());
|
||||||
|
out = "byte: \\x" + res;
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// convert a vector of completion_token_output to json
|
||||||
|
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> &probs)
|
||||||
|
{
|
||||||
|
json out = json::array();
|
||||||
|
for (const auto &prob : probs)
|
||||||
|
{
|
||||||
|
json probs_for_token = json::array();
|
||||||
|
for (const auto &p : prob.probs)
|
||||||
|
{
|
||||||
|
std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok);
|
||||||
|
probs_for_token.push_back(json
|
||||||
|
{
|
||||||
|
{"tok_str", tok_str},
|
||||||
|
{"prob", p.prob},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok);
|
||||||
|
out.push_back(json{
|
||||||
|
{"content", tok_str},
|
||||||
|
{"probs", probs_for_token},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user