llama : add comments about experimental flags (#7544)

This commit is contained in:
Georgi Gerganov 2024-05-27 09:24:13 +03:00 committed by GitHub
parent d6ef0e77dd
commit eaf6e03174
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -265,6 +265,8 @@ extern "C" {
bool check_tensors; // validate model tensor data bool check_tensors; // validate model tensor data
}; };
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
// https://github.com/ggerganov/llama.cpp/pull/7544
struct llama_context_params { struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context, 0 = from model uint32_t n_ctx; // text context, 0 = from model
@ -291,14 +293,14 @@ extern "C" {
ggml_backend_sched_eval_callback cb_eval; ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data; void * cb_eval_user_data;
enum ggml_type type_k; // data type for K cache enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
enum ggml_type type_v; // data type for V cache enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embeddings; // if true, extract embeddings (together with logits) bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
// Abort callback // Abort callback
// if it returns true, execution of llama_decode() will be aborted // if it returns true, execution of llama_decode() will be aborted