mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-07 17:21:46 +00:00
server : remove hack for extra parallel slot
Some checks failed
flake8 Lint / Lint (push) Has been cancelled
Some checks failed
flake8 Lint / Lint (push) Has been cancelled
ggml-ci
This commit is contained in:
parent
b8deef0ec0
commit
c3beb9b9dc
@ -378,7 +378,7 @@ struct server_queue {
|
|||||||
std::condition_variable condition_tasks;
|
std::condition_variable condition_tasks;
|
||||||
|
|
||||||
// callback functions
|
// callback functions
|
||||||
std::function<void(server_task&)> callback_new_task;
|
std::function<void(server_task)> callback_new_task;
|
||||||
std::function<void(void)> callback_update_slots;
|
std::function<void(void)> callback_update_slots;
|
||||||
|
|
||||||
// Add a new task to the end of the queue
|
// Add a new task to the end of the queue
|
||||||
@ -431,7 +431,7 @@ struct server_queue {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Register function to process a new task
|
// Register function to process a new task
|
||||||
void on_new_task(std::function<void(server_task &)> callback) {
|
void on_new_task(std::function<void(server_task)> callback) {
|
||||||
callback_new_task = std::move(callback);
|
callback_new_task = std::move(callback);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -481,7 +481,7 @@ struct server_queue {
|
|||||||
lock.unlock();
|
lock.unlock();
|
||||||
|
|
||||||
QUE_DBG("processing task, id = %d\n", task.id);
|
QUE_DBG("processing task, id = %d\n", task.id);
|
||||||
callback_new_task(task);
|
callback_new_task(std::move(task));
|
||||||
}
|
}
|
||||||
|
|
||||||
// all tasks in the current loop are processed, slots data is now ready
|
// all tasks in the current loop are processed, slots data is now ready
|
||||||
@ -644,17 +644,12 @@ struct server_context {
|
|||||||
bool load_model(const common_params & params_) {
|
bool load_model(const common_params & params_) {
|
||||||
params = params_;
|
params = params_;
|
||||||
|
|
||||||
// reserve one extra sequence (seq_id == 0) for extra features
|
|
||||||
params.n_parallel += 1;
|
|
||||||
|
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
model = llama_init.model;
|
model = llama_init.model;
|
||||||
ctx = llama_init.context;
|
ctx = llama_init.context;
|
||||||
loras = llama_init.lora_adapters;
|
loras = llama_init.lora_adapters;
|
||||||
|
|
||||||
params.n_parallel -= 1; // but be sneaky about it
|
|
||||||
|
|
||||||
if (model == nullptr) {
|
if (model == nullptr) {
|
||||||
SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
|
SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
|
||||||
return false;
|
return false;
|
||||||
@ -1297,7 +1292,7 @@ struct server_context {
|
|||||||
std::vector<float> embd_res(n_embd, 0.0f);
|
std::vector<float> embd_res(n_embd, 0.0f);
|
||||||
|
|
||||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
|
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1337,7 +1332,7 @@ struct server_context {
|
|||||||
res.stop = true;
|
res.stop = true;
|
||||||
|
|
||||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
|
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1510,7 +1505,7 @@ struct server_context {
|
|||||||
// Functions to process the task
|
// Functions to process the task
|
||||||
//
|
//
|
||||||
|
|
||||||
void process_single_task(const server_task & task) {
|
void process_single_task(server_task task) {
|
||||||
switch (task.type) {
|
switch (task.type) {
|
||||||
case SERVER_TASK_TYPE_INFERENCE:
|
case SERVER_TASK_TYPE_INFERENCE:
|
||||||
{
|
{
|
||||||
@ -1646,7 +1641,7 @@ struct server_context {
|
|||||||
std::string filename = task.data.at("filename");
|
std::string filename = task.data.at("filename");
|
||||||
std::string filepath = task.data.at("filepath");
|
std::string filepath = task.data.at("filepath");
|
||||||
|
|
||||||
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
|
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
|
||||||
|
|
||||||
const int64_t t_end = ggml_time_us();
|
const int64_t t_end = ggml_time_us();
|
||||||
const double t_save_ms = (t_end - t_start) / 1000.0;
|
const double t_save_ms = (t_end - t_start) / 1000.0;
|
||||||
@ -1688,7 +1683,7 @@ struct server_context {
|
|||||||
|
|
||||||
slot->cache_tokens.resize(slot->n_ctx);
|
slot->cache_tokens.resize(slot->n_ctx);
|
||||||
size_t token_count = 0;
|
size_t token_count = 0;
|
||||||
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
|
size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
|
||||||
if (nread == 0) {
|
if (nread == 0) {
|
||||||
slot->cache_tokens.resize(0);
|
slot->cache_tokens.resize(0);
|
||||||
send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
|
send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
|
||||||
@ -1731,7 +1726,7 @@ struct server_context {
|
|||||||
|
|
||||||
// Erase token cache
|
// Erase token cache
|
||||||
const size_t n_erased = slot->cache_tokens.size();
|
const size_t n_erased = slot->cache_tokens.size();
|
||||||
llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
|
llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
|
||||||
slot->cache_tokens.clear();
|
slot->cache_tokens.clear();
|
||||||
|
|
||||||
server_task_result result;
|
server_task_result result;
|
||||||
@ -1808,8 +1803,8 @@ struct server_context {
|
|||||||
|
|
||||||
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
|
SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
|
||||||
|
|
||||||
llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
|
llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard);
|
||||||
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);
|
llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past, -n_discard);
|
||||||
|
|
||||||
if (slot.params.cache_prompt) {
|
if (slot.params.cache_prompt) {
|
||||||
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
|
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
|
||||||
@ -1836,7 +1831,7 @@ struct server_context {
|
|||||||
|
|
||||||
slot.i_batch = batch.n_tokens;
|
slot.i_batch = batch.n_tokens;
|
||||||
|
|
||||||
common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
|
common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
|
||||||
|
|
||||||
slot.n_past += 1;
|
slot.n_past += 1;
|
||||||
|
|
||||||
@ -1983,8 +1978,8 @@ struct server_context {
|
|||||||
|
|
||||||
const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
|
const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
|
||||||
|
|
||||||
llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
|
llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
|
||||||
llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1, kv_shift);
|
llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
|
||||||
|
|
||||||
for (size_t i = 0; i < n_match; i++) {
|
for (size_t i = 0; i < n_match; i++) {
|
||||||
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
|
slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
|
||||||
@ -2033,9 +2028,9 @@ struct server_context {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// keep only the common part
|
// keep only the common part
|
||||||
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
|
if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
|
||||||
// could not partially delete (likely using a non-Transformer model)
|
// could not partially delete (likely using a non-Transformer model)
|
||||||
llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
|
llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
|
||||||
|
|
||||||
// there is no common part left
|
// there is no common part left
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
@ -2048,7 +2043,7 @@ struct server_context {
|
|||||||
|
|
||||||
// add prompt tokens for processing in the current batch
|
// add prompt tokens for processing in the current batch
|
||||||
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
|
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
|
||||||
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
|
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false);
|
||||||
|
|
||||||
if (slot.params.cache_prompt) {
|
if (slot.params.cache_prompt) {
|
||||||
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
|
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
|
||||||
|
Loading…
Reference in New Issue
Block a user