mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-13 04:00:16 +00:00
common : add command-line arg to disable KV cache offloading
This commit is contained in:
parent
c80b8a2bff
commit
e262947d43
@ -498,6 +498,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||||||
params.infill = true;
|
params.infill = true;
|
||||||
} else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
|
} else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
|
||||||
params.dump_kv_cache = true;
|
params.dump_kv_cache = true;
|
||||||
|
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
||||||
|
params.no_kv_offload = true;
|
||||||
} else if (arg == "--multiline-input") {
|
} else if (arg == "--multiline-input") {
|
||||||
params.multiline_input = true;
|
params.multiline_input = true;
|
||||||
} else if (arg == "--simple-io") {
|
} else if (arg == "--simple-io") {
|
||||||
@ -840,6 +842,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||||||
printf(" --verbose-prompt print prompt before generation\n");
|
printf(" --verbose-prompt print prompt before generation\n");
|
||||||
printf(" -dkvc, --dump-kv-cache\n");
|
printf(" -dkvc, --dump-kv-cache\n");
|
||||||
printf(" verbose print of the KV cache\n");
|
printf(" verbose print of the KV cache\n");
|
||||||
|
printf(" -nkvo, --no-kv-offload\n");
|
||||||
|
printf(" disable KV offload\n");
|
||||||
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
||||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||||
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
|
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
|
||||||
@ -924,6 +928,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
|||||||
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
||||||
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
||||||
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
|
||||||
|
cparams.offload_kqv = !params.no_kv_offload;
|
||||||
|
|
||||||
return cparams;
|
return cparams;
|
||||||
}
|
}
|
||||||
|
@ -123,6 +123,7 @@ struct gpt_params {
|
|||||||
bool verbose_prompt = false; // print prompt tokens before generation
|
bool verbose_prompt = false; // print prompt tokens before generation
|
||||||
bool infill = false; // use infill mode
|
bool infill = false; // use infill mode
|
||||||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||||
|
bool no_kv_offload = false; // disable KV offloading
|
||||||
|
|
||||||
// multimodal models (see examples/llava)
|
// multimodal models (see examples/llava)
|
||||||
std::string mmproj = ""; // path to multimodal projector
|
std::string mmproj = ""; // path to multimodal projector
|
||||||
|
96
llama.cpp
96
llama.cpp
@ -1245,8 +1245,7 @@ struct llama_cparams {
|
|||||||
float yarn_beta_slow;
|
float yarn_beta_slow;
|
||||||
|
|
||||||
bool mul_mat_q;
|
bool mul_mat_q;
|
||||||
bool offload_k;
|
bool offload_kqv;
|
||||||
bool offload_v;
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -1526,8 +1525,7 @@ static bool llama_kv_cache_init(
|
|||||||
ggml_type wtype,
|
ggml_type wtype,
|
||||||
uint32_t n_ctx,
|
uint32_t n_ctx,
|
||||||
int n_gpu_layers,
|
int n_gpu_layers,
|
||||||
bool offload_k,
|
bool offload) {
|
||||||
bool offload_v) {
|
|
||||||
const uint32_t n_embd = hparams.n_embd_gqa();
|
const uint32_t n_embd = hparams.n_embd_gqa();
|
||||||
const uint32_t n_layer = hparams.n_layer;
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
|
||||||
@ -1574,11 +1572,9 @@ static bool llama_kv_cache_init(
|
|||||||
cache.v_l.push_back(v);
|
cache.v_l.push_back(v);
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
if (i >= i_gpu_start) {
|
if (i >= i_gpu_start) {
|
||||||
if (offload_k) {
|
if (offload) {
|
||||||
ggml_cuda_assign_buffers_no_scratch(k);
|
ggml_cuda_assign_buffers_no_scratch(k);
|
||||||
vram_kv_cache += ggml_nbytes(k);
|
vram_kv_cache += ggml_nbytes(k);
|
||||||
}
|
|
||||||
if (offload_v) {
|
|
||||||
ggml_cuda_assign_buffers_no_scratch(v);
|
ggml_cuda_assign_buffers_no_scratch(v);
|
||||||
vram_kv_cache += ggml_nbytes(v);
|
vram_kv_cache += ggml_nbytes(v);
|
||||||
}
|
}
|
||||||
@ -5101,6 +5097,7 @@ enum llm_offload_func_e {
|
|||||||
OFFLOAD_FUNC_NOP,
|
OFFLOAD_FUNC_NOP,
|
||||||
OFFLOAD_FUNC,
|
OFFLOAD_FUNC,
|
||||||
OFFLOAD_FUNC_FRC, // force offload
|
OFFLOAD_FUNC_FRC, // force offload
|
||||||
|
OFFLOAD_FUNC_KQV,
|
||||||
OFFLOAD_FUNC_NR,
|
OFFLOAD_FUNC_NR,
|
||||||
OFFLOAD_FUNC_EMB,
|
OFFLOAD_FUNC_EMB,
|
||||||
OFFLOAD_FUNC_OUT,
|
OFFLOAD_FUNC_OUT,
|
||||||
@ -5204,38 +5201,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|||||||
{ "attn_norm", OFFLOAD_FUNC },
|
{ "attn_norm", OFFLOAD_FUNC },
|
||||||
{ "attn_norm_2", OFFLOAD_FUNC },
|
{ "attn_norm_2", OFFLOAD_FUNC },
|
||||||
|
|
||||||
{ "wqkv", OFFLOAD_FUNC },
|
{ "wqkv", OFFLOAD_FUNC_KQV },
|
||||||
{ "bqkv", OFFLOAD_FUNC },
|
{ "bqkv", OFFLOAD_FUNC_KQV },
|
||||||
{ "wqkv_clamped", OFFLOAD_FUNC },
|
{ "wqkv_clamped", OFFLOAD_FUNC_KQV },
|
||||||
|
|
||||||
{ "tmpk", OFFLOAD_FUNC },
|
{ "tmpk", OFFLOAD_FUNC_KQV },
|
||||||
{ "tmpq", OFFLOAD_FUNC },
|
{ "tmpq", OFFLOAD_FUNC_KQV },
|
||||||
{ "tmpv", OFFLOAD_FUNC },
|
{ "tmpv", OFFLOAD_FUNC_KQV },
|
||||||
{ "Kcur", OFFLOAD_FUNC },
|
{ "Kcur", OFFLOAD_FUNC_KQV },
|
||||||
{ "Qcur", OFFLOAD_FUNC },
|
{ "Qcur", OFFLOAD_FUNC_KQV },
|
||||||
{ "Vcur", OFFLOAD_FUNC },
|
{ "Vcur", OFFLOAD_FUNC_KQV },
|
||||||
|
|
||||||
{ "krot", OFFLOAD_FUNC },
|
{ "krot", OFFLOAD_FUNC_KQV },
|
||||||
{ "qrot", OFFLOAD_FUNC },
|
{ "qrot", OFFLOAD_FUNC_KQV },
|
||||||
{ "kpass", OFFLOAD_FUNC },
|
{ "kpass", OFFLOAD_FUNC_KQV },
|
||||||
{ "qpass", OFFLOAD_FUNC },
|
{ "qpass", OFFLOAD_FUNC_KQV },
|
||||||
{ "krotated", OFFLOAD_FUNC },
|
{ "krotated", OFFLOAD_FUNC_KQV },
|
||||||
{ "qrotated", OFFLOAD_FUNC },
|
{ "qrotated", OFFLOAD_FUNC_KQV },
|
||||||
|
|
||||||
{ "q", OFFLOAD_FUNC },
|
{ "q", OFFLOAD_FUNC_KQV },
|
||||||
{ "k", OFFLOAD_FUNC },
|
{ "k", OFFLOAD_FUNC_KQV },
|
||||||
{ "kq", OFFLOAD_FUNC },
|
{ "kq", OFFLOAD_FUNC_KQV },
|
||||||
{ "kq_scaled", OFFLOAD_FUNC },
|
{ "kq_scaled", OFFLOAD_FUNC_KQV },
|
||||||
{ "kq_scaled_alibi", OFFLOAD_FUNC },
|
{ "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
|
||||||
{ "kq_masked", OFFLOAD_FUNC },
|
{ "kq_masked", OFFLOAD_FUNC_KQV },
|
||||||
{ "kq_soft_max", OFFLOAD_FUNC },
|
{ "kq_soft_max", OFFLOAD_FUNC_KQV },
|
||||||
{ "kq_soft_max_ext", OFFLOAD_FUNC },
|
{ "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
|
||||||
{ "v", OFFLOAD_FUNC },
|
{ "v", OFFLOAD_FUNC_KQV },
|
||||||
{ "kqv", OFFLOAD_FUNC },
|
{ "kqv", OFFLOAD_FUNC_KQV },
|
||||||
{ "kqv_merged", OFFLOAD_FUNC },
|
{ "kqv_merged", OFFLOAD_FUNC_KQV },
|
||||||
{ "kqv_merged_cont", OFFLOAD_FUNC },
|
{ "kqv_merged_cont", OFFLOAD_FUNC_KQV },
|
||||||
{ "kqv_wo", OFFLOAD_FUNC },
|
{ "kqv_wo", OFFLOAD_FUNC_KQV },
|
||||||
{ "kqv_out", OFFLOAD_FUNC },
|
{ "kqv_out", OFFLOAD_FUNC_KQV },
|
||||||
|
|
||||||
{ "ffn_inp", OFFLOAD_FUNC },
|
{ "ffn_inp", OFFLOAD_FUNC },
|
||||||
{ "ffn_norm", OFFLOAD_FUNC },
|
{ "ffn_norm", OFFLOAD_FUNC },
|
||||||
@ -5429,11 +5426,13 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
{ OFFLOAD_FUNC, "GPU (CUDA)" },
|
{ OFFLOAD_FUNC, "GPU (CUDA)" },
|
||||||
{ OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
|
{ OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
|
||||||
|
{ OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
|
||||||
{ OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
|
{ OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
|
||||||
{ OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
|
{ OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
|
||||||
#else
|
#else
|
||||||
{ OFFLOAD_FUNC, "CPU" },
|
{ OFFLOAD_FUNC, "CPU" },
|
||||||
{ OFFLOAD_FUNC_FRC, "CPU" },
|
{ OFFLOAD_FUNC_FRC, "CPU" },
|
||||||
|
{ OFFLOAD_FUNC_KQV, "CPU" },
|
||||||
{ OFFLOAD_FUNC_NR, "CPU" },
|
{ OFFLOAD_FUNC_NR, "CPU" },
|
||||||
{ OFFLOAD_FUNC_EMB, "CPU" },
|
{ OFFLOAD_FUNC_EMB, "CPU" },
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
@ -5458,7 +5457,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
switch (func_e) {
|
switch (func_e) {
|
||||||
case OFFLOAD_FUNC_NOP:
|
case OFFLOAD_FUNC_NOP:
|
||||||
case OFFLOAD_FUNC_OUT:
|
case OFFLOAD_FUNC_OUT:
|
||||||
case OFFLOAD_FUNC_FRC:
|
|
||||||
break;
|
break;
|
||||||
case OFFLOAD_FUNC:
|
case OFFLOAD_FUNC:
|
||||||
if (n_gpu_layers < n_layer) {
|
if (n_gpu_layers < n_layer) {
|
||||||
@ -5467,6 +5465,21 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case OFFLOAD_FUNC_FRC:
|
||||||
|
if (!lctx.cparams.offload_kqv) {
|
||||||
|
func_e = OFFLOAD_FUNC_NOP;
|
||||||
|
} break;
|
||||||
|
case OFFLOAD_FUNC_KQV:
|
||||||
|
if (!lctx.cparams.offload_kqv) {
|
||||||
|
func_e = OFFLOAD_FUNC_NOP;
|
||||||
|
} else {
|
||||||
|
if (n_gpu_layers < n_layer) {
|
||||||
|
if (il < i_gpu_start) {
|
||||||
|
func_e = OFFLOAD_FUNC_NOP;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
case OFFLOAD_FUNC_NR:
|
case OFFLOAD_FUNC_NR:
|
||||||
if (n_gpu_layers <= n_layer + 0) {
|
if (n_gpu_layers <= n_layer + 0) {
|
||||||
func_e = OFFLOAD_FUNC_NOP;
|
func_e = OFFLOAD_FUNC_NOP;
|
||||||
@ -5493,6 +5506,7 @@ static struct ggml_cgraph * llama_build_graph(
|
|||||||
case OFFLOAD_FUNC_NOP:
|
case OFFLOAD_FUNC_NOP:
|
||||||
case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
|
case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
|
||||||
case OFFLOAD_FUNC:
|
case OFFLOAD_FUNC:
|
||||||
|
case OFFLOAD_FUNC_KQV:
|
||||||
case OFFLOAD_FUNC_FRC:
|
case OFFLOAD_FUNC_FRC:
|
||||||
case OFFLOAD_FUNC_NR:
|
case OFFLOAD_FUNC_NR:
|
||||||
case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
|
case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
|
||||||
@ -8567,8 +8581,7 @@ struct llama_context_params llama_context_default_params() {
|
|||||||
/*.f16_kv =*/ true,
|
/*.f16_kv =*/ true,
|
||||||
/*.logits_all =*/ false,
|
/*.logits_all =*/ false,
|
||||||
/*.embedding =*/ false,
|
/*.embedding =*/ false,
|
||||||
/*.offload_k =*/ true,
|
/*.offload_kqv =*/ true,
|
||||||
/*.offload_q =*/ true,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
@ -8685,8 +8698,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
||||||
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
||||||
cparams.mul_mat_q = params.mul_mat_q;
|
cparams.mul_mat_q = params.mul_mat_q;
|
||||||
cparams.offload_k = params.offload_k;
|
cparams.offload_kqv = params.offload_kqv;
|
||||||
cparams.offload_v = params.offload_v;
|
|
||||||
|
|
||||||
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
|
||||||
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
|
||||||
@ -8724,7 +8736,7 @@ struct llama_context * llama_new_context_with_model(
|
|||||||
|
|
||||||
// reserve memory for context buffers
|
// reserve memory for context buffers
|
||||||
if (!hparams.vocab_only) {
|
if (!hparams.vocab_only) {
|
||||||
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_k, cparams.offload_v)) {
|
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
|
||||||
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
11
llama.h
11
llama.h
@ -192,12 +192,11 @@ extern "C" {
|
|||||||
uint32_t yarn_orig_ctx; // YaRN original context size
|
uint32_t yarn_orig_ctx; // YaRN original context size
|
||||||
|
|
||||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||||
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
||||||
bool f16_kv; // use fp16 for KV cache, fp32 otherwise
|
bool f16_kv; // use fp16 for KV cache, fp32 otherwise
|
||||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||||
bool embedding; // embedding mode only
|
bool embedding; // embedding mode only
|
||||||
bool offload_k;
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||||
bool offload_v;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// model quantization parameters
|
// model quantization parameters
|
||||||
|
Loading…
Reference in New Issue
Block a user