mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-12 11:40:17 +00:00)
server : coherent log output for KV cache full (#6637)
commit 24ee66ed0d
parent 91c736015b
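
This change replaces the remaining printf-style LOG_TEE calls on the decode and KV-cache-full paths with the structured LOG_ERROR / LOG_WARNING macros used elsewhere in the server, so these events carry key/value fields instead of free-form text. Below is a minimal sketch of that logging pattern, using a made-up log_event() helper rather than the server's actual macros; names and the JSON shape are illustrative assumptions only.

    // Sketch only: a structured logger that takes a message plus key/value pairs
    // and emits one line per event, so consumers can parse fields instead of
    // scraping printf-formatted text. Not the server's utils.hpp implementation.
    #include <cstdio>
    #include <initializer_list>
    #include <string>
    #include <utility>

    using log_field = std::pair<std::string, std::string>;

    static void log_event(const char * level, const std::string & msg,
                          std::initializer_list<log_field> fields) {
        std::string line = std::string("{\"level\":\"") + level + "\",\"msg\":\"" + msg + "\"";
        for (const auto & kv : fields) {
            line += ",\"" + kv.first + "\":\"" + kv.second + "\"";
        }
        line += "}";
        std::fprintf(stderr, "%s\n", line.c_str());
    }

    int main() {
        int n_batch = 2048, ret = 1;  // made-up values for illustration
        log_event("ERR",  "llama_decode() failed", {});
        log_event("WARN", "failed to find free space in the KV cache", {
            {"n_batch", std::to_string(n_batch)},
            {"ret",     std::to_string(ret)},
        });
        return 0;
    }
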
@@ -1083,7 +1083,7 @@ struct server_context {
             };
 
             if (llama_decode(ctx, batch_view) != 0) {
-                LOG_TEE("%s: llama_decode() failed\n", __func__);
+                LOG_ERROR("llama_decode() failed", {});
                 return;
             }
         }
@@ -1281,7 +1281,11 @@ struct server_context {
     }
 
     void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) {
-        LOG_TEE("task %i - error: %s\n", id_task, error.c_str());
+        LOG_ERROR("task error", {
+            {"id_multi", id_multi},
+            {"id_task", id_task},
+            {"error", error},
+        });
 
         server_task_result res;
         res.id = id_task;
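
The braced list passed as the second argument in the new calls is a set of named fields attached to the message. As a rough illustration of how such an initializer list becomes a single parsable log entry, here is a standalone sketch using nlohmann::json (which the server example also relies on); it is not the actual send_error/server_log path, and the field values are made up.

    // Standalone sketch: an initializer list of {"key", value} pairs becomes a
    // JSON object that can be dumped as one line. Values below are hypothetical.
    #include <iostream>
    #include <string>
    #include <nlohmann/json.hpp>

    using json = nlohmann::json;

    int main() {
        int         id_task  = 7;
        int         id_multi = -1;
        std::string error    = "slot unavailable";  // hypothetical error text

        json extra = {
            {"id_multi", id_multi},
            {"id_task",  id_task},
            {"error",    error},
        };

        // One parsable line per event instead of free-form printf text.
        std::cout << extra.dump() << std::endl;  // {"error":"slot unavailable","id_multi":-1,"id_task":7}
        return 0;
    }
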
@@ -2186,7 +2190,11 @@ struct server_context {
             if (ret != 0) {
                 if (n_batch == 1 || ret < 0) {
                     // if you get here, it means the KV cache is full - try increasing it via the context size
-                    LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret);
+                    LOG_ERROR("failed to decode the batch: KV cache is full - try increasing it via the context size", {
+                        {"i", i},
+                        {"n_batch", ret},
+                        {"ret", ret},
+                    });
                     for (auto & slot : slots) {
                         slot.state = SLOT_STATE_PROCESSING;
                         slot.command = SLOT_COMMAND_NONE;
@@ -2196,12 +2204,16 @@ struct server_context {
                     break; // break loop of n_batch
                 }
 
-                LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);
-
                 // retry with half the batch size to try to find a free slot in the KV cache
                 n_batch /= 2;
                 i -= n_batch;
 
+                LOG_WARNING("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation", {
+                    {"i", i},
+                    {"n_batch", n_batch},
+                    {"ret", ret},
+                });
+
                 continue; // continue loop of n_batch
             }
 
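
The last two hunks sit in the batch-decode loop: when the decode call reports that no free KV cache slot was found, the batch size is halved and the loop index rewound so the same tokens are retried in smaller chunks, and the server only gives up once n_batch is already 1 or the return code signals a hard error. The following is a standalone sketch of that retry pattern, with a made-up try_decode() standing in for llama_decode(); it is not the server's code.

    // Sketch of the halve-and-retry loop, assuming try_decode() with
    // llama_decode-like semantics: 0 = ok, 1 = no free KV cache slot for this
    // batch, negative = hard error.
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    static int try_decode(const std::vector<int> & tokens, int first, int count) {
        (void) tokens;
        (void) first;
        // Pretend the KV cache can only fit batches of 8 tokens at a time.
        return count > 8 ? 1 : 0;
    }

    int main() {
        std::vector<int> tokens(100, 0);
        int n_batch = 32;

        for (int i = 0; i < (int) tokens.size(); i += n_batch) {
            const int n_tokens = std::min(n_batch, (int) tokens.size() - i);
            const int ret      = try_decode(tokens, i, n_tokens);

            if (ret != 0) {
                if (n_batch == 1 || ret < 0) {
                    // cannot shrink further, or a hard error: give up
                    std::fprintf(stderr, "decode failed for good: i = %d, ret = %d\n", i, ret);
                    return 1;
                }

                // retry with half the batch size to try to find a free slot in the KV cache;
                // rewinding i by the new n_batch makes the loop increment land on the same position
                n_batch /= 2;
                i -= n_batch;

                std::fprintf(stderr, "no free KV cache space, retrying with n_batch = %d\n", n_batch);
                continue;
            }

            std::printf("decoded tokens [%d, %d)\n", i, i + n_tokens);
        }
        return 0;
    }
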