mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-09-22 21:16:20 +00:00
Merge 4af076b494
into a5b57b08ce
This commit is contained in:
commit
7fca5a04f3
@ -691,7 +691,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|||||||
[](gpt_params & params) {
|
[](gpt_params & params) {
|
||||||
params.ctx_shift = false;
|
params.ctx_shift = false;
|
||||||
}
|
}
|
||||||
).set_examples({LLAMA_EXAMPLE_MAIN}));
|
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"--chunks"}, "N",
|
{"--chunks"}, "N",
|
||||||
format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
|
||||||
|
@ -37,6 +37,7 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||||
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||||
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||||
|
| `--no-context-shift` | stop generation when context window is full (default: context shift is enabled)<br/> |
|
||||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||||
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
|
||||||
|
@ -1476,11 +1476,10 @@ struct server_context {
|
|||||||
std::vector<server_task_result> results(id_tasks.size());
|
std::vector<server_task_result> results(id_tasks.size());
|
||||||
for (size_t i = 0; i < id_tasks.size(); i++) {
|
for (size_t i = 0; i < id_tasks.size(); i++) {
|
||||||
server_task_result result = queue_results.recv(id_tasks);
|
server_task_result result = queue_results.recv(id_tasks);
|
||||||
|
|
||||||
if (result.error) {
|
if (result.error) {
|
||||||
error_handler(result.data);
|
error_handler(result.data);
|
||||||
cancel_tasks(id_tasks);
|
cancel_tasks(id_tasks);
|
||||||
break;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t idx = result.data["index"];
|
size_t idx = result.data["index"];
|
||||||
@ -1827,6 +1826,13 @@ struct server_context {
|
|||||||
for (server_slot & slot : slots) {
|
for (server_slot & slot : slots) {
|
||||||
if (slot.ga_n == 1) {
|
if (slot.ga_n == 1) {
|
||||||
if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
|
if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
|
||||||
|
if (!params.ctx_shift){
|
||||||
|
slot.release();
|
||||||
|
slot.print_timings();
|
||||||
|
send_final_response(slot);
|
||||||
|
metrics.on_prediction(slot);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
// Shift context
|
// Shift context
|
||||||
const int n_keep = slot.params.n_keep + add_bos_token;
|
const int n_keep = slot.params.n_keep + add_bos_token;
|
||||||
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
|
const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
|
||||||
@ -1952,6 +1958,12 @@ struct server_context {
|
|||||||
send_final_response(slot);
|
send_final_response(slot);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
// context shift is disabled and prompt is too large - discard it
|
||||||
|
if (!params.ctx_shift && (slot.n_prompt_tokens > slot.n_ctx) ){
|
||||||
|
slot.release();
|
||||||
|
send_error(slot, "Input is too large to process. Either enable context shift or increase the context length.", ERROR_TYPE_INVALID_REQUEST);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
|
if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_EMBEDDING) {
|
||||||
// this prompt is too large to process - discard it
|
// this prompt is too large to process - discard it
|
||||||
|
Loading…
Reference in New Issue
Block a user