rename n_ctx to kv_size

Pierrick HYMBERT 2024-02-18 20:59:26 +01:00 committed by Georgi Gerganov
parent ef96e8b1f7
commit 606873401c
GPG Key ID: 449E073F9DC10735
48 changed files with 403 additions and 393 deletions

View File

@ -186,7 +186,7 @@ llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_ctx = 512
llm_load_print_meta: kv_size = 512
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 40
@ -214,7 +214,7 @@ llama_new_context_with_model: compute buffer total size = 75.41 MB
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
generate: kv_size = 512, n_batch = 512, n_predict = 400, n_keep = 0
Building a website can be done in 10 simple steps:

View File

@ -258,11 +258,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
sparams.top_k = std::stoi(argv[i]);
} else if (arg == "-c" || arg == "--ctx-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.kv_size = std::stoi(argv[i]);
fprintf(stderr, "warning: -c,--ctx-size option is deprecated, use --kv-size instead");
} else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
params.kv_size = std::stoi(argv[i]);
} else if (arg == "--grp-attn-n" || arg == "-gan") {
if (++i >= argc) {
invalid_param = true;
@ -962,7 +970,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -bf FNAME, --binary-file FNAME\n");
printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
printf(" (default: %s)\n", sampler_type_names.c_str());
@ -972,7 +980,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = kv_size)\n", sparams.penalty_last_n);
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
@ -1269,7 +1277,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto cparams = llama_context_default_params();
cparams.n_ctx = params.n_ctx;
cparams.kv_size = params.kv_size;
cparams.n_batch = params.n_batch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -1658,7 +1666,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "kv_size: %d # default: 512\n", params.kv_size);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);

View File

@ -50,7 +50,7 @@ struct gpt_params {
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t kv_size = 512; // KV Cache size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 8; // number of tokens to draft during speculative decoding

View File

@ -7,11 +7,11 @@ USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
KV_SIZE="${KV_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--kv_size "$KV_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647

View File

@ -10,7 +10,7 @@ cd ..
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \

View File

@ -532,16 +532,16 @@ static struct ggml_tensor * forward(
// Vcur shape [n_embd, N, 1, 1]
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));
// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.v shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// kv_self.v shape [n_embd * kv_size * n_layer, 1]
// k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
// v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@ -560,7 +560,7 @@ static struct ggml_tensor * forward(
Qcur,
0, 2, 1, 3);
// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// K shape [n_embd/n_head, n_past + N, n_head, 1]
struct ggml_tensor * K =
ggml_permute(ctx0,
@ -780,16 +780,16 @@ static struct ggml_tensor * forward_batch(
assert_shape_3d(Vcur, N, n_embd, n_batch);
// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
// kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
// kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
// kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
// k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il]
// v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]
/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@ -817,7 +817,7 @@ static struct ggml_tensor * forward_batch(
0, 2, 1, 3);
assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);
// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
// kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
// K shape [n_embd/n_head, n_past + N, n_head, n_batch]
struct ggml_tensor * K =
ggml_permute(ctx0,
@ -855,7 +855,7 @@ static struct ggml_tensor * forward_batch(
assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);
// split cached V into n_head heads
// kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
// kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
// V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
struct ggml_tensor * V =
ggml_view_4d(ctx0, vc,
@ -1082,16 +1082,16 @@ static struct ggml_tensor * forward_lora(
cur)),
n_embd, N)));
// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.v shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// kv_self.v shape [n_embd * kv_size * n_layer, 1]
// k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
// v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@ -1110,7 +1110,7 @@ static struct ggml_tensor * forward_lora(
Qcur,
0, 2, 1, 3);
// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// K shape [n_embd/n_head, n_past + N, n_head, 1]
struct ggml_tensor * K =
ggml_permute(ctx0,
@ -1470,7 +1470,7 @@ int main(int argc, char ** argv) {
/*
struct llama_model_lora model_lora;
// model.hparams.n_vocab = 6;
// model.hparams.n_ctx = 64;
// model.hparams.kv_size = 64;
// model.hparams.n_embd = 128;
// model.hparams.n_mult = 2;
// model.hparams.n_head = 8;
@ -1478,7 +1478,7 @@ int main(int argc, char ** argv) {
// model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head;
model_lora.hparams.n_vocab = 16;
model_lora.hparams.n_ctx = 32;
model_lora.hparams.kv_size = 32;
model_lora.hparams.n_embd = 256;
model_lora.hparams.n_mult = 2;
model_lora.hparams.n_head = 16;

View File

@ -104,7 +104,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.kv_size = n_kv_max;
ctx_params.n_batch = 512;
ctx_params.mul_mat_q = mmq;

View File

@ -38,7 +38,7 @@ let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_par
var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req
context_params.kv_size = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8
@ -53,12 +53,12 @@ defer {
llama_free(context)
}
let n_ctx = llama_n_ctx(context)
let kv_size = llama_kv_size(context)
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
print("\nn_len = \(n_len), kv_size = \(kv_size), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
if n_kv_req > n_ctx {
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
if n_kv_req > kv_size {
print("error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", n_kv_req)
exit(1)
}

View File

@ -7,7 +7,7 @@ The example demonstrates batched generation from a given prompt
...
main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
main: n_len = 32, kv_size = 2048, n_parallel = 4, n_kv_req = 113
Hello my name is

View File

@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.kv_size = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -90,14 +90,14 @@ int main(int argc, char ** argv) {
return 1;
}
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, kv_size, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
if (n_kv_req > kv_size) {
LOG_TEE("%s: error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase kv_size\n", __func__);
return 1;
}

View File

@ -139,8 +139,8 @@ int main(int argc, char ** argv)
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
const size_t max_kv_size = llama_kv_size(ctx);
const size_t max_tokens_list_size = max_kv_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{

View File

@ -128,20 +128,20 @@ int main(int argc, char ** argv) {
// TODO: perform the bench for all types or for a user specified type
const ggml_type qtype = GGML_TYPE_Q4_1;
size_t ctx_size = 0;
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += 1024*1024*16;
size_t kv_size = 0;
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizez);
kv_size += ggml_row_size(qtype, sizex * sizey);
kv_size += ggml_row_size(qtype, sizex * sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); // BLAS
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); // BLAS
kv_size += 1024 * 1024 * 16;
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
printf("Allocating Memory of size %zi bytes, %zi MB\n", kv_size, (kv_size / 1024 / 1024));
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_size =*/ kv_size,
/*.mem_buffer =*/ NULL,
/* no_alloc =*/ 0
};

View File

@ -15,7 +15,7 @@ rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"

View File

@ -15,8 +15,8 @@ N_THREAD="${N_THREAD:-8}"
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
# For example, override the context size by doing: ./chatLLaMa --kv_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)

View File

@ -27,9 +27,9 @@ SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
CTX_SIZE=2048
CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
KV_SIZE=2048
KV_ROTATE_POINT=$((KV_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --kv_size "$KV_SIZE" --repeat_last_n 256 "$@")
# An unbuffered `tail -c+N`
skip_bytes() {
@ -84,7 +84,7 @@ n_tokens=0
while read -e line; do
# Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
n_predict=$((KV_SIZE - n_tokens - ${#line} / 2 - 32))
# Swap prompts when we're about to run out of context
if ((n_predict <= 0)); then
@ -97,11 +97,11 @@ while read -e line; do
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
n_tokens=0
n_predict=$((CTX_SIZE / 2))
n_predict=$((KV_SIZE / 2))
fi
echo " ${line}" >>"$CUR_PROMPT_FILE"
if ((n_tokens > CTX_ROTATE_POINT)); then
if ((n_tokens > KV_ROTATE_POINT)); then
echo " ${line}" >>"$NEXT_PROMPT_FILE"
fi
@ -139,7 +139,7 @@ while read -e line; do
n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
if ((n_tokens > CTX_ROTATE_POINT)); then
if ((n_tokens > KV_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
fi

View File

@ -15,8 +15,8 @@ N_THREAD="${N_THREAD:-8}"
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
# For example, override the context size by doing: ./chatLLaMa --kv_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)

View File

@ -226,7 +226,7 @@ struct llama_vocab {
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t kv_size = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4;
@ -326,7 +326,7 @@ struct train_params {
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: kv_size: %u\n", __func__, params->kv_size);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_mult: %u\n", __func__, params->n_mult);
printf("%s: n_head: %u\n", __func__, params->n_head);
@ -732,7 +732,7 @@ static void save_as_llama_model(
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.kv_size);
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
@ -937,7 +937,7 @@ int main(int argc, char ** argv) {
struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.kv_size = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;

View File

@ -88,11 +88,11 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
if (n_ctx > n_ctx_train) {
if (kv_size > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, kv_size);
}
// print system information
@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
// max batch size
const uint64_t n_batch = params.n_batch;
GGML_ASSERT(params.n_batch == params.n_ctx);
GGML_ASSERT(params.n_batch == params.kv_size);
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;

View File

@ -16,7 +16,7 @@
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512;
uint32_t kv_size = 512;
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_head = 32;
@ -190,7 +190,7 @@ static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab : %u\n", __func__, params->n_vocab);
printf("%s: n_ctx : %u\n", __func__, params->n_ctx);
printf("%s: kv_size : %u\n", __func__, params->kv_size);
printf("%s: n_embd : %u\n", __func__, params->n_embd);
printf("%s: n_ff : %u\n", __func__, params->n_ff);
printf("%s: n_head : %u\n", __func__, params->n_head);
@ -250,7 +250,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
};
GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
GGUF_GET_KEY(ctx, hparams->kv_size, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
@ -268,7 +268,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
}
}
static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) {
static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t kv_size) {
auto & hparams = model->hparams;
std::vector<char> tn_buf;
@ -298,7 +298,7 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
gguf_free(mctx);
}
hparams.n_vocab = llama_n_vocab(input);
hparams.n_ctx = n_ctx;
hparams.kv_size = kv_size;
// get tensors from llama_model (possibly mmapped)
model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD));
@ -529,7 +529,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int n_past = 0;
const int N = n_tokens;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int kv_size = hparams.kv_size;
const int n_vocab = hparams.n_vocab;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
@ -558,13 +558,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
ggml_set_input(KQ_pos);
// rope has so much parameters that we make a custom function for it
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
auto rope = [ctx, KQ_pos, n_rot, kv_size, rope_freq_base, rope_freq_scale]
(struct ggml_tensor * t) -> struct ggml_tensor * {
// not capturing these, to silence warnings
const int rope_mode = 0;
return ggml_rope_custom(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
t, KQ_pos, n_rot, rope_mode, kv_size, 0,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};
@ -848,7 +848,7 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx);
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.kv_size);
gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd);
gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff);
gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head);
@ -1554,9 +1554,9 @@ int main(int argc, char ** argv) {
bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
if (existed) {
// overwrite last n_ctx with user provided n_ctx
// overwrite last kv_size with user provided kv_size
if (params.common.custom_n_ctx) {
model.hparams.n_ctx = params.common.n_ctx;
model.hparams.kv_size = params.common.n_ctx;
}
const bool opt_param_count_changed = (
@ -1625,7 +1625,7 @@ int main(int argc, char ** argv) {
printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
printf("%s: opt iter %d\n", __func__, opt->iter);
int n_tokens = model.hparams.n_ctx;
int n_tokens = model.hparams.kv_size;
int n_vocab = model.hparams.n_vocab;
int n_batch = params.common.n_batch;

View File

@ -10,6 +10,6 @@ cd ..
./main --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 -n -1 \
--batch_size 8 --kv_size 2048 -n -1 \
--repeat_last_n 64 --repeat_penalty 1.3 \
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

View File

@ -325,7 +325,7 @@ static void process_logits(
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@ -336,17 +336,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (from_chunk > 0) {
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
if (size_t((from_chunk + 2)*kv_size) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
return false;
}
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk * kv_size);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk * kv_size);
}
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
n_ctx);
if (int(tokens.size()) < 2*kv_size) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2 * kv_size,
kv_size);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return false;
}
@ -359,7 +359,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
prob_history.resize(tokens.size());
}
const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk_max = tokens.size() / kv_size;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
@ -373,16 +373,16 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
const int num_batches = (kv_size + n_batch - 1) / n_batch;
std::vector<float> logits;
if (compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
logits.reserve((size_t)kv_size * n_vocab);
}
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int start = i * kv_size;
const int end = start + kv_size;
std::vector<float> logits;
@ -431,11 +431,11 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
if (compute_ppl) {
const int first = n_ctx/2;
const int first = kv_size / 2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
count += kv_size - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
@ -553,7 +553,7 @@ int main(int argc, char ** argv) {
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
params.n_batch = std::min(params.n_batch, params.kv_size);
print_build_info();
@ -593,9 +593,9 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
if (params.kv_size > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
__func__, n_ctx_train, params.kv_size);
}
// print system information

View File

@ -14,7 +14,8 @@ In this section, we cover the most commonly used options for running the `infill
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead.
- `-kv N`, `--kv-size N`: Specify the total size of the KV cache for the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
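
For reference, a quick usage sketch (not part of the commit) of the renamed flag; the model path and the `--in-prefix`/`--in-suffix` values below are placeholders for illustration, assuming a post-rename build:

# illustrative only: run the infill example with the new KV cache flag
# (passing -c 2048 still works but now prints a deprecation warning)
./infill -m models/7B/ggml-model.gguf -kv 2048 -n 64 \
    --in-prefix "def fib(n):" --in-suffix "return a"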
## Input Prompts

View File

@ -135,9 +135,9 @@ int main(int argc, char ** argv) {
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
if (params.kv_size != 0 && params.kv_size < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
params.kv_size = 8;
}
if (params.instruct) {
printf("\n************\n");
@ -225,12 +225,12 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
const int kv_size = llama_kv_size(ctx);
LOG("kv_size: %d\n", kv_size);
if (n_ctx > n_ctx_train) {
if (kv_size > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, kv_size);
}
// print system information
@ -291,8 +291,8 @@ int main(int argc, char ** argv) {
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
if ((int) embd_inp.size() > kv_size - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), kv_size - 4);
return 1;
}
@ -366,7 +366,7 @@ int main(int argc, char ** argv) {
}
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("generate: kv_size = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", kv_size, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
LOG_TEE("\n##### Infill mode #####\n\n");
@ -416,9 +416,9 @@ int main(int argc, char ** argv) {
while (n_remain != 0 || params.interactive) {
// predict
if (!embd.empty()) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// Note: kv_size - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
int max_embd_size = kv_size - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int) embd.size() > max_embd_size) {
@ -434,8 +434,8 @@ int main(int argc, char ** argv) {
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
// - take half of the last (kv_size - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > kv_size) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
@ -444,8 +444,8 @@ int main(int argc, char ** argv) {
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
LOG("context full, swapping: n_past = %d, n_left = %d, kv_size = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, kv_size, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

View File

@ -514,7 +514,7 @@ struct cmd_params_instance {
llama_context_params to_llama_cparams() const {
llama_context_params cparams = llama_context_default_params();
cparams.n_ctx = n_prompt + n_gen;
cparams.kv_size = n_prompt + n_gen;
cparams.n_batch = n_batch;
cparams.type_k = type_k;
cparams.type_v = type_v;

View File

@ -68,8 +68,8 @@ actor LlamaContext {
print("Using \(n_threads) threads")
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
ctx_params.seed = 1234
ctx_params.kv_size = 2048
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)
@ -112,13 +112,13 @@ actor LlamaContext {
tokens_list = tokenize(text: text, add_bos: true)
temporary_invalid_cchars = []
let n_ctx = llama_n_ctx(context)
let kv_size = llama_kv_size(context)
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
print("\n n_len = \(n_len), kv_size = \(kv_size), n_kv_req = \(n_kv_req)")
if n_kv_req > n_ctx {
print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
if n_kv_req > kv_size {
print("error: n_kv_req > kv_size, the required KV cache size is not big enough")
}
for id in tokens_list {

View File

@ -9,7 +9,7 @@ cd ..
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \

View File

@ -9,7 +9,7 @@ cd ..
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \

View File

@ -230,7 +230,7 @@ static struct llava_context * llava_init(gpt_params * params) {
}
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
ctx_params.kv_size = params->kv_size < 2048 ? 2048 : params->kv_size; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

View File

@ -103,15 +103,15 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
const size_t num_images = num_patches_width * num_patches_height + 1;
// TODO: size calculation is not calculated - it's only tens of MB
size_t ctx_size = 0;
size_t kv_size = 0;
{
ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
kv_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
kv_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
}
struct ggml_init_params params {
/*.mem_size =*/ ctx_size,
/*.mem_size =*/ kv_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API
};

View File

@ -73,8 +73,8 @@ int main(int argc, char ** argv) {
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
all = inp;
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
const int max_kv_size = llama_kv_size(ctx);
const int max_tokens_list_size = max_kv_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
@ -117,7 +117,7 @@ int main(int argc, char ** argv) {
// seq_id == 0 : the current input token
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
// seq_id [W + 1, W + G] : verification n-grams
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
llama_batch batch = llama_batch_init(params.kv_size, 0, W + G + 1);
// target model sampling context
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

View File

@ -47,8 +47,8 @@ int main(int argc, char ** argv){
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
const int max_kv_size = llama_kv_size(ctx);
const int max_tokens_list_size = max_kv_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
@ -86,7 +86,7 @@ int main(int argc, char ** argv){
std::vector<llama_token> draft;
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
llama_batch batch_tgt = llama_batch_init(params.kv_size, 0, 1);
// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);

View File

@ -70,7 +70,8 @@ In this section, we cover the most commonly used options for running the `main`
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead.
- `-kv N`, `--kv-size N`: Set the size of the KV cache for the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
## Input Prompts
@ -134,15 +135,15 @@ By understanding and utilizing these interaction options, you can create engagin
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
### Context Size
### KV Context Size
The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
The `--kv-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
- `-kv N, --kv-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
### Extended Context Size
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--kv-size` to 32768 (32k) and `--rope-scale` to 8.
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
@ -152,7 +153,7 @@ The `--keep` option allows users to retain the original prompt when the model ru
- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
By utilizing context management options like `--kv-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
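
As a hedged illustration (not part of the commit), the options above might be combined as follows after the rename; the model path, prompt file, and token counts are placeholders:

# illustrative only: 32k extended context on a RoPE-scaled fine-tune,
# keeping the entire initial prompt when the context is shifted
./main -m models/7B/ggml-model.gguf -f prompts/chat-with-bob.txt \
    --kv-size 32768 --rope-scale 8 --keep -1 -n 512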
## Generation Flags
@ -181,12 +182,12 @@ Example usage: `--temp 0.5`
### Repeat Penalty
- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = kv-size).
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`kv-size`).
Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
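
A hedged example (not part of the commit) of combining the penalty flags described above; the model path and prompt are placeholders:

# illustrative only: stronger repetition penalty over the last 256 tokens,
# without penalizing newline tokens (useful for chat or code output)
./main -m models/7B/ggml-model.gguf -p "Write a haiku about llamas." \
    --repeat-penalty 1.15 --repeat-last-n 256 --no-penalize-nl -n 128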

View File

@ -157,9 +157,9 @@ int main(int argc, char ** argv) {
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
if (params.kv_size != 0 && params.kv_size < 8) {
LOG_TEE("%s: warning: minimum KV size is 8, using minimum size.\n", __func__);
params.kv_size = 8;
}
if (params.rope_freq_base != 0.0) {
@ -208,12 +208,12 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
const int kv_size = llama_kv_size(ctx);
LOG("kv_size: %d\n", kv_size);
if (n_ctx > n_ctx_train) {
if (kv_size > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, kv_size);
}
// print system information
@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
} else {
// The file exists and is not empty
session_tokens.resize(n_ctx);
session_tokens.resize(kv_size);
size_t n_token_count_out = 0;
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
@ -289,8 +289,8 @@ int main(int argc, char ** argv) {
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
if ((int) embd_inp.size() > kv_size - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), kv_size - 4);
return 1;
}
@ -450,7 +450,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("generate: kv_size = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", kv_size, params.n_batch, params.n_predict, params.n_keep);
// group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@ -463,7 +463,7 @@ int main(int argc, char ** argv) {
GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
//GGML_ASSERT(kv_size >= n_ctx_train * ga_n && "kv_size must be at least n_ctx_train * grp_attn_n"); // NOLINT
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
}
LOG_TEE("\n\n");
@ -514,9 +514,9 @@ int main(int argc, char ** argv) {
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (!embd.empty()) {
// Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
// Note: (kv_size - 4) here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
int max_embd_size = kv_size - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int) embd.size() > max_embd_size) {
@ -533,8 +533,8 @@ int main(int argc, char ** argv) {
// infinite text generation via context shifting
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
// - take half of the last (kv_size - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > kv_size) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
@ -543,8 +543,8 @@ int main(int argc, char ** argv) {
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
LOG("context full, swapping: n_past = %d, n_left = %d, kv_size = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, kv_size, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
@ -666,7 +666,7 @@ int main(int argc, char ** argv) {
LOG("n_past = %d\n", n_past);
// Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) {
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, kv_size);
}
}

View File

@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "\n\n");
fflush(stderr);
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
std::vector<client> clients(n_clients);
for (size_t i = 0; i < clients.size(); ++i) {
@ -169,7 +169,7 @@ int main(int argc, char ** argv) {
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
llama_batch batch = llama_batch_init(n_ctx, 0, 1);
llama_batch batch = llama_batch_init(kv_size, 0, 1);
int32_t n_total_prompt = 0;
int32_t n_total_gen = 0;

View File

@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = seed;
ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
ctx_params.kv_size = llama_n_ctx_train(model)*n_grp + n_keep;
ctx_params.n_batch = 512;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -121,12 +121,12 @@ int main(int argc, char ** argv) {
// total length of the sequences including the prompt
const int n_len = n_tokens_all + n_predict;
const int n_ctx = llama_n_ctx(ctx) - n_keep;
const int n_kv_req = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx) - n_keep;
const int n_kv_req = llama_kv_size(ctx);
const int n_batch = ctx_params.n_batch;
const int n_batch_grp = ctx_params.n_batch/n_grp;
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, kv_size, n_kv_req, n_grp, n_batch);
// print the prompt token-by-token
@ -140,7 +140,7 @@ int main(int argc, char ** argv) {
int n_past = 0;
// fill the KV cache
for (int i = 0; i < n_ctx; i += n_batch) {
for (int i = 0; i < kv_size; i += n_batch) {
if (i > 0 && n_grp > 1) {
// if SelfExtend is enabled, we compress the position from the last batch by a factor of n_grp
const int ib = i/n_batch - 1;
@ -174,13 +174,13 @@ int main(int argc, char ** argv) {
}
}
for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
for (int i = kv_size; i < n_tokens_all; i += n_batch) {
const int n_discard = n_batch;
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, kv_size, -n_discard);
n_past -= n_discard;
@ -203,13 +203,13 @@ int main(int argc, char ** argv) {
}
{
const int n_discard = n_past - n_ctx + n_predict;
const int n_discard = n_past - kv_size + n_predict;
if (n_discard > 0) {
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, kv_size, -n_discard);
n_past -= n_discard;
}

View File

@ -320,11 +320,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
if (int(tokens.size()) < 2*kv_size) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n", __func__, 2 * kv_size,
kv_size);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
@ -340,13 +340,13 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
return {tokens, -1, logit_history, prob_history};
}
const int calc_chunk = n_ctx;
const int calc_chunk = kv_size;
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
if (int(tokens.size()) <= calc_chunk) {
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), n_ctx, params.ppl_stride);
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n", __func__,
tokens.size(), kv_size, params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
@ -414,8 +414,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.kv_size - params.ppl_stride + start, params.kv_size + start);
for (int j = kv_size - params.ppl_stride - 1; j < kv_size - 1; ++j) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
@ -453,7 +453,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
@ -464,7 +464,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
}
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
logits_stream.write(reinterpret_cast<const char *>(&kv_size), sizeof(kv_size));
}
auto tim1 = std::chrono::high_resolution_clock::now();
@ -475,9 +475,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
if (int(tokens.size()) < 2*kv_size) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n", __func__, 2 * kv_size,
kv_size);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
@ -488,7 +488,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::vector<float> prob_history;
prob_history.resize(tokens.size());
const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk_max = tokens.size() / kv_size;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
@ -498,11 +498,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
double nll = 0.0;
double nll2 = 0.0;
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
const int num_batches = (kv_size + n_batch - 1) / n_batch;
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
logits.reserve((size_t)kv_size * n_vocab);
}
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
@ -513,14 +513,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
if (!params.logits_file.empty()) {
logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
logits_stream.write((const char *)tokens.data(), n_chunk * kv_size * sizeof(tokens[0]));
const int nv = 2*((n_vocab + 1)/2) + 4;
log_probs.resize(n_ctx * nv);
log_probs.resize(kv_size * nv);
}
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int start = i * kv_size;
const int end = start + kv_size;
const auto t_start = std::chrono::high_resolution_clock::now();
@ -566,7 +566,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
// We get the logits for all the tokens in the context window (params.n_ctx)
// We get the logits for all the tokens in the context window (params.kv_size)
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
// calculate the perplexity over the last half of the window (so the model always has
// some context to predict the token).
@ -578,16 +578,16 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
const int first = n_ctx/2;
const int first = kv_size/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
if (!params.logits_file.empty()) {
process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
workers, log_probs, nll, nll2);
} else {
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
}
count += n_ctx - first - 1;
count += kv_size - first - 1;
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
@ -596,7 +596,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
printf("%8d %.4lf %4lf %4lf\n", i*kv_size, std::exp(nll / count), av, av2);
}
fflush(stdout);
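As a concrete example of the windowing above: with kv_size = 512, first = kv_size/2 = 256, so each chunk scores kv_size - first - 1 = 255 next-token predictions, every scored token is conditioned on at least half a window of preceding context, and count grows by 255 per chunk.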
@ -805,16 +805,16 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
double acc = 0.0f;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
const int max_seq = 4*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
llama_batch batch = llama_batch_init(kv_size, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<float> batch_logits(n_vocab*kv_size);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
@ -832,7 +832,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// each task has 4 unique sequence ids - one for each ending
// the common prefix is shared among the 4 sequences to save tokens
// we extract logits only from the last common token and from all ending tokens of each sequence
while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
while (n_cur + (int) hs_data[i1].required_tokens <= kv_size) {
auto & hs_cur = hs_data[i1];
const int s0 = 4*(i1 - i0);
@ -1082,16 +1082,16 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 128;
const int max_seq = 2*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
llama_batch batch = llama_batch_init(kv_size, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<float> batch_logits(n_vocab*kv_size);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
@ -1108,7 +1108,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
llama_batch_clear(batch);
while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
while (n_cur + (int) data[i1].required_tokens <= kv_size) {
const int s0 = 2*(i1 - i0);
if (s0 + 2 > max_seq) {
break;
@ -1434,16 +1434,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
printf("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
const int max_seq = 4*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
llama_batch batch = llama_batch_init(kv_size, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<float> batch_logits(n_vocab*kv_size);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
@ -1467,7 +1467,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// the common prefix is shared among the 4 sequences to save tokens
// we extract logits only from the last common token and from all ending tokens of each sequence
int s0 = 0;
while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
while (n_cur + (int) tasks[i1].required_tokens <= kv_size) {
auto& cur_task = tasks[i1];
int num_answers = cur_task.seq_tokens.size();
@ -1620,11 +1620,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
}
}
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
uint32_t kv_size;
in.read((char *)&kv_size, sizeof(kv_size));
if (kv_size > llama_kv_size(ctx)) {
fprintf(stderr, "%s: %s has been computed with %u, while the current KV Cache size is %d. Increase it with -kv and retry\n",
__func__, params.logits_file.c_str(), kv_size, params.kv_size);
}
int n_vocab, n_chunk;
@ -1638,22 +1638,22 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(n_ctx * n_chunk);
std::vector<llama_token> tokens(kv_size * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return;
}
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int num_batches = (kv_size + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<uint16_t> log_probs_uint16(size_t(kv_size - 1 - kv_size/2) * nv);
std::vector<float> kld_values(size_t(kv_size - 1 - kv_size /2)*n_chunk);
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve(n_ctx * n_vocab);
logits.reserve(kv_size * n_vocab);
}
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@ -1672,8 +1672,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
auto kld_ptr = kld_values.data();
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int start = i * kv_size;
const int end = start + kv_size;
const auto t_start = std::chrono::high_resolution_clock::now();
@ -1726,11 +1726,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n");
}
const int first = n_ctx/2;
const int first = kv_size/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
workers, log_probs_uint16, kld, kld_ptr);
kld_ptr += n_ctx - 1 - first;
kld_ptr += kv_size - 1 - first;
auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
@ -1788,12 +1788,12 @@ int main(int argc, char ** argv) {
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
params.n_batch = std::min(params.n_batch, params.kv_size);
if (params.ppl_stride > 0) {
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
params.n_ctx, params.n_ctx + params.ppl_stride/2);
params.n_ctx += params.ppl_stride/2;
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting KV size from %d to %d\n",
params.kv_size, params.kv_size + params.ppl_stride / 2);
params.kv_size += params.ppl_stride/2;
}
print_build_info();
@ -1823,9 +1823,9 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
if (params.kv_size > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
__func__, n_ctx_train, params.kv_size);
}
// print system information

View File

@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
}
auto cparams = llama_context_default_params();
cparams.n_ctx = 256;
cparams.kv_size = 256;
cparams.seed = 1;
ctx = llama_new_context_with_model(model, cparams);

View File

@ -12,7 +12,7 @@ PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}
N_THREAD="${N_THREAD:-12}"
# Note: you can also override the generation options by specifying them on the command line:
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
GEN_OPTIONS="${GEN_OPTIONS:---kv_size 4096 --batch-size 1024}"
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS

View File

@ -174,7 +174,7 @@ node index.js
`repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).
`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = kv-size).
`penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).
@ -239,7 +239,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
- `generation_settings`: The provided options above excluding `prompt` but including `kv_size`, `model`
- `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
@ -249,7 +249,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the KV size (`kv_size`)
- **POST** `/tokenize`: Tokenize a given text.
@ -404,7 +404,7 @@ Notice that each `probs` is an array of length `n_probs`.
"mirostat_eta": 0.10000000149011612,
"mirostat_tau": 5.0,
"model": "llama-2-7b-32k-instruct.Q2_K.gguf",
"n_ctx": 2048,
"kv_size": 2048,
"n_keep": 0,
"n_predict": 100000,
"n_probs": 0,

View File

@ -155,7 +155,7 @@ struct llama_client_slot
int64_t t_last_used = -1;
// generation props
int32_t n_ctx = 0; // context size per slot
int32_t kv_size = 0; // KV size per slot
int32_t n_past = 0;
int32_t n_decoded = 0;
int32_t n_remaining = -1;
@ -325,7 +325,7 @@ struct llama_server_context
bool all_slots_are_idle = false;
bool add_bos_token = true;
int32_t n_ctx; // total context for all clients / slots
int32_t kv_size; // total KV Cache for all clients / slots
// system prompt
bool system_need_update = false;
@ -369,8 +369,8 @@ struct llama_server_context
return false;
}
if (params.n_ctx < 2048) { // request larger context for the image embedding
params.n_ctx = 2048;
if (params.kv_size < 2048) { // request larger context for the image embedding
params.kv_size = 2048;
}
}
@ -392,7 +392,7 @@ struct llama_server_context
}
}
n_ctx = llama_n_ctx(ctx);
kv_size = llama_kv_size(ctx);
add_bos_token = llama_should_add_bos_token(model);
@ -403,7 +403,7 @@ struct llama_server_context
// create slots
all_slots_are_idle = true;
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
const int32_t kv_size_slot = kv_size / params.n_parallel;
LOG_TEE("Available slots:\n");
for (int i = 0; i < params.n_parallel; i++)
@ -411,10 +411,10 @@ struct llama_server_context
llama_client_slot slot;
slot.id = i;
slot.n_ctx = n_ctx_slot;
slot.kv_size = kv_size_slot;
slot.n_predict = params.n_predict;
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
LOG_TEE(" -> Slot %i - max KV Size: %i\n", slot.id, kv_size_slot);
const int ga_n = params.grp_attn_n;
const int ga_w = params.grp_attn_w;
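For example, starting the server with a total KV size of 4096 and 4 parallel slots gives each slot a 1024-token budget (kv_size_slot = kv_size / n_parallel), so a single client can never consume the whole cache.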
@ -423,7 +423,7 @@ struct llama_server_context
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
//GGML_ASSERT(kv_size >= n_ctx_train * ga_n && "kv_size must be at least n_ctx_train * ga_n"); // NOLINT
LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
}
@ -439,7 +439,7 @@ struct llama_server_context
default_generation_settings_for_props = get_formated_generation(slots.front());
default_generation_settings_for_props["seed"] = -1;
batch = llama_batch_init(n_ctx, 0, params.n_parallel);
batch = llama_batch_init(kv_size, 0, params.n_parallel);
}
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@ -1065,7 +1065,7 @@ struct llama_server_context
}
return json {
{"n_ctx", slot.n_ctx},
{"kv_size", slot.kv_size},
{"n_predict", slot.n_predict},
{"model", params.model_alias},
{"seed", slot.params.seed},
@ -1474,7 +1474,7 @@ struct llama_server_context
{
if (slot.ga_n == 1)
{
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.kv_size)
{
// Shift context
const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
@ -1496,7 +1496,7 @@ struct llama_server_context
slot.truncated = true;
LOG_VERBOSE("context shift", {
{ "n_ctx", n_ctx },
{ "kv_size", kv_size },
{ "n_keep", params.n_keep },
{ "n_left", n_left },
});
@ -1598,12 +1598,12 @@ struct llama_server_context
{
slot.params.n_keep = slot.num_prompt_tokens;
}
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
slot.params.n_keep = std::min(slot.kv_size - 4, slot.params.n_keep);
// if input prompt is too big, truncate it
if (slot.num_prompt_tokens >= slot.n_ctx)
if (slot.num_prompt_tokens >= slot.kv_size)
{
const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_left = slot.kv_size - slot.params.n_keep;
const int n_block_size = n_left / 2;
const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
@ -1611,7 +1611,7 @@ struct llama_server_context
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
LOG_VERBOSE("input truncated", {
{"n_ctx", slot.n_ctx},
{"kv_size", slot.kv_size},
{"n_keep", slot.params.n_keep},
{"n_left", n_left},
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
@ -1620,7 +1620,7 @@ struct llama_server_context
prompt_tokens = new_tokens;
slot.num_prompt_tokens = prompt_tokens.size();
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
GGML_ASSERT(slot.num_prompt_tokens < slot.kv_size);
}
if (!slot.params.cache_prompt)
@ -1873,7 +1873,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.n_ctx);
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size);
printf(" --rope-scaling {none,linear,yarn}\n");
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
@ -2043,16 +2043,16 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
server_print_usage(argv[0], default_params, default_sparams);
exit(0);
}
else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
else if (arg == "-c" || arg == "--ctx-size" || arg == "--kv_size")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
LOG_WARNING("-c,--ctx-size,--ctx_size option is deprecated, use --kv-size instead",
{{"--ctx_size", params.n_ctx}});
params.kv_size = std::stoi(argv[i]);
LOG_WARNING("-c,--ctx-size,--kv_size option is deprecated, use --kv-size instead",
{{"--kv_size", params.kv_size}});
}
else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size")
{
@ -2061,7 +2061,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
params.kv_size = std::stoi(argv[i]);
}
else if (arg == "--rope-scaling")
{

View File

@ -7,7 +7,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for g
...
main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32
main: n_len = 32, kv_size = 2048, n_parallel = 1, n_kv_req = 32
Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old

View File

@ -52,7 +52,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = 2048;
ctx_params.kv_size = 2048;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -68,15 +68,15 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_kv_req = %d\n", __func__, n_len, kv_size, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
if (n_kv_req > kv_size) {
LOG_TEE("%s: error: n_kv_req > kv_size, the required KV cache size is not big enough\n", __func__);
LOG_TEE("%s: either reduce n_len or increase kv_size\n", __func__);
return 1;
}

View File

@ -116,7 +116,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
const int max_context_size = llama_n_ctx(ctx_tgt);
const int max_context_size = llama_kv_size(ctx_tgt);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
@ -172,8 +172,8 @@ int main(int argc, char ** argv) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
}
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
llama_batch batch_dft = llama_batch_init(params.kv_size, 0, 1);
llama_batch batch_tgt = llama_batch_init(params.kv_size, 0, n_seq_dft);
const auto t_dec_start = ggml_time_us();

View File

@ -22,7 +22,7 @@
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512;
uint32_t kv_size = 512;
uint32_t n_embd = 4096;
uint32_t n_head = 32;
uint32_t n_layer = 32;
@ -112,7 +112,7 @@ static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: kv_size: %u\n", __func__, params->kv_size);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_head: %u\n", __func__, params->n_head);
printf("%s: n_ff: %u\n", __func__, params->n_ff);
@ -272,7 +272,7 @@ static struct ggml_tensor * llama_build_train_graphs(
const int n_past = 0;
const int N = n_tokens;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int kv_size = hparams.kv_size;
const int n_vocab = hparams.n_vocab;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
@ -295,13 +295,13 @@ static struct ggml_tensor * llama_build_train_graphs(
ggml_set_input(KQ_pos);
// rope has so much parameters that we make a custom function for it
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
auto rope = [ctx, KQ_pos, n_rot, kv_size, rope_freq_base, rope_freq_scale]
(struct ggml_tensor * t) -> struct ggml_tensor * {
// not capturing these, to silence warnings
const int rope_mode = 0;
return ggml_rope_custom(
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
ctx, t, KQ_pos, n_rot, rope_mode, kv_size, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};
@ -487,8 +487,8 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
// n_ctx was not saved in earlier checkpoint file versions, so we make it optional here
GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
// kv_size was not saved in earlier checkpoint file versions, so we make it optional here
GGUF_GET_KEY(fctx, model->hparams.kv_size, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
@ -543,7 +543,7 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
// set hparams
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx );
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.kv_size );
gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd );
gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff );
gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head );
@ -945,7 +945,7 @@ int main(int argc, char ** argv) {
struct my_llama_model model;
model.hparams.n_vocab = llama_n_vocab(lmodel);
model.hparams.n_ctx = params.common.n_ctx;
model.hparams.kv_size = params.common.n_ctx;
model.hparams.n_embd = params.n_embd;
model.hparams.n_head = params.n_head;
model.hparams.n_layer = params.n_layer;
@ -982,9 +982,9 @@ int main(int argc, char ** argv) {
printf("%s: init model\n", __func__);
bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train);
if (existed) {
// overwrite last n_ctx with user provided n_ctx
// overwrite last kv_size with user provided kv_size
if (params.common.custom_n_ctx) {
model.hparams.n_ctx = params.common.n_ctx;
model.hparams.kv_size = params.common.n_ctx;
}
const bool opt_past_changed = opt->params.past != params.common.opt_past;
@ -1031,7 +1031,7 @@ int main(int argc, char ** argv) {
printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
printf("%s: opt iter %d\n", __func__, opt->iter);
int n_tokens = model.hparams.n_ctx;
int n_tokens = model.hparams.kv_size;
int n_vocab = model.hparams.n_vocab;
int n_batch = params.common.n_batch;

llama.cpp
View File

@ -1607,7 +1607,7 @@ struct llama_hparams {
};
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t kv_size; // KV Cache size used during inference
uint32_t n_batch;
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
@ -1923,9 +1923,9 @@ struct llama_context {
struct ggml_tensor * inp_tokens; // I32 [n_batch]
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
struct ggml_tensor * inp_pos; // I32 [n_batch]
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
struct ggml_tensor * inp_cls; // I32 [n_batch]
@ -1943,7 +1943,7 @@ static bool llama_kv_cache_init(
const llama_model & model,
ggml_type ktype,
ggml_type vtype,
uint32_t n_ctx,
uint32_t kv_size,
bool offload) {
const struct llama_hparams & hparams = model.hparams;
@ -1954,11 +1954,11 @@ static bool llama_kv_cache_init(
cache.has_shift = false;
cache.head = 0;
cache.size = n_ctx;
cache.size = kv_size;
cache.used = 0;
cache.cells.clear();
cache.cells.resize(n_ctx);
cache.cells.resize(kv_size);
#ifdef GGML_USE_CLBLAST
offload = false;
@ -1997,8 +1997,8 @@ static bool llama_kv_cache_init(
for (int i = 0; i < (int) n_layer; i++) {
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*kv_size);
ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*kv_size);
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
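For a rough sense of scale, each layer holds one K tensor of n_embd_k_gqa*kv_size elements and one V tensor of n_embd_v_gqa*kv_size elements, so for an assumed 7B-class LLaMA shape (32 layers, n_embd_k_gqa = n_embd_v_gqa = 4096) with an f16 cache and kv_size = 512 the cache takes about 2 * 32 * 4096 * 512 * 2 bytes ≈ 256 MiB, growing linearly with kv_size.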
@ -2029,19 +2029,19 @@ static bool llama_kv_cache_init(
static bool llama_kv_cache_find_slot(
struct llama_kv_cache & cache,
const struct llama_batch & batch) {
const uint32_t n_ctx = cache.size;
const uint32_t kv_size = cache.size;
const uint32_t n_tokens = batch.n_tokens;
if (n_tokens > n_ctx) {
LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
if (n_tokens > kv_size) {
LLAMA_LOG_ERROR("%s: n_tokens=%d > kv_size=%d\n", __func__, n_tokens, kv_size);
return false;
}
uint32_t n_tested = 0;
while (true) {
if (cache.head + n_tokens > n_ctx) {
n_tested += n_ctx - cache.head;
if (cache.head + n_tokens > kv_size) {
n_tested += kv_size - cache.head;
cache.head = 0;
continue;
}
@ -2060,7 +2060,7 @@ static bool llama_kv_cache_find_slot(
break;
}
if (n_tested >= n_ctx) {
if (n_tested >= kv_size) {
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
return false;
}
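Worked example of the search above: with kv_size = 512, head = 400 and a 200-token batch, head + n_tokens = 600 > kv_size, so n_tested is bumped by 112 and head wraps to 0; the scan then looks for a free run of 200 cells from the start of the cache, and if n_tested ever reaches kv_size there is no suitable slot and the function returns false.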
@ -3692,11 +3692,11 @@ static bool llm_load_tensors(
}
// create one context per buffer type
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
size_t kv_size = ggml_tensor_overhead() * ml.n_tensors;
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
for (auto & it : buft_layer_count) {
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_size =*/ kv_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
@ -3708,7 +3708,7 @@ static bool llm_load_tensors(
model.ctxs.push_back(ctx);
}
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
LLAMA_LOG_INFO("%s: ggml KV size = %7.2f MiB\n", __func__, model.ctxs.size()*kv_size/1024.0/1024.0);
// create tensors for the weights
{
@ -4584,7 +4584,7 @@ static void llm_build_k_shift(
struct ggml_cgraph * graph,
struct ggml_tensor * K_shift,
llm_rope_type type,
int64_t n_ctx,
int64_t kv_size,
float freq_base,
float freq_scale,
const llm_build_cb & cb) {
@ -4612,7 +4612,7 @@ static void llm_build_k_shift(
// we rotate only the first n_rot dimensions
ggml_rope_custom_inplace(ctx,
ggml_view_3d(ctx, kv.k_l[il],
n_embd_head_k, n_head_kv, n_ctx,
n_embd_head_k, n_head_kv, kv_size,
ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
0),
@ -4630,7 +4630,7 @@ static void llm_build_kv_store(
struct ggml_cgraph * graph,
struct ggml_tensor * k_cur,
struct ggml_tensor * v_cur,
int64_t n_ctx,
int64_t kv_size,
int32_t n_tokens,
int32_t kv_head,
const llm_build_cb & cb,
@ -4648,7 +4648,7 @@ static void llm_build_kv_store(
cb(k_cache_view, "k_cache_view", il);
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
( n_ctx)*ggml_element_size(kv.v_l[il]),
( kv_size)*ggml_element_size(kv.v_l[il]),
(kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il);
@ -4792,7 +4792,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int64_t n_ctx,
int64_t kv_size,
int32_t n_tokens,
int32_t n_kv,
float kq_scale,
@ -4851,8 +4851,8 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * v =
ggml_view_3d(ctx, kv.v_l[il],
n_kv, n_embd_head_v, n_head_kv,
ggml_element_size(kv.v_l[il])*n_ctx,
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
ggml_element_size(kv.v_l[il])*kv_size,
ggml_element_size(kv.v_l[il])*kv_size *n_embd_head_v,
0);
cb(v, "v", il);
@ -4892,7 +4892,7 @@ static struct ggml_tensor * llm_build_kv(
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int64_t n_ctx,
int64_t kv_size,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
@ -4906,11 +4906,11 @@ static struct ggml_tensor * llm_build_kv(
ggml_build_forward_expand(graph, k_cur);
ggml_build_forward_expand(graph, v_cur);
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, kv_size, n_tokens, kv_head, cb, il);
struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
q_cur, kq_mask, kq_pos, kv_size, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il);
return cur;
@ -4926,7 +4926,7 @@ struct llm_build_context {
const int64_t n_embd;
const int64_t n_layer;
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t kv_size; // user-specified KV Cache size (can be different from n_ctx_train)
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head_k;
@ -4946,7 +4946,7 @@ struct llm_build_context {
const float norm_rms_eps;
const int32_t n_tokens;
const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_size)
const int32_t kv_head; // index of where we store new KV data in the cache
const int32_t n_orig_ctx;
@ -4973,7 +4973,7 @@ struct llm_build_context {
kv_self (lctx.kv_self),
n_embd (hparams.n_embd),
n_layer (hparams.n_layer),
n_ctx (cparams.n_ctx),
kv_size (cparams.kv_size),
n_head (hparams.n_head),
n_head_kv (hparams.n_head_kv),
n_embd_head_k (hparams.n_embd_head_k),
@ -4991,14 +4991,14 @@ struct llm_build_context {
norm_eps (hparams.f_norm_eps),
norm_rms_eps (hparams.f_norm_rms_eps),
n_tokens (batch.n_tokens),
n_kv (worst_case ? n_ctx : kv_self.n),
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
n_kv (worst_case ? kv_size : kv_self.n),
kv_head (worst_case ? kv_size - n_tokens : kv_self.head),
n_orig_ctx (cparams.n_yarn_orig_ctx),
do_rope_shift (worst_case || kv_self.has_shift),
pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
cb (cb),
buf_compute_meta (lctx.buf_compute_meta) {
// all initializations should be done in init()
// all initializations should be done in init()
}
void init() {
@ -5041,7 +5041,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5093,7 +5093,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5229,7 +5229,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5277,7 +5277,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5347,7 +5347,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5401,7 +5401,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5500,7 +5500,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5565,7 +5565,7 @@ struct llm_build_context {
cb(KQ_mask, "KQ_mask", -1);
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5705,7 +5705,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Q, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5798,7 +5798,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5899,7 +5899,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
} else {
// compute Q and K and RoPE them
@ -5930,7 +5930,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6043,7 +6043,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6140,7 +6140,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6209,7 +6209,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6262,7 +6262,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6332,7 +6332,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6377,7 +6377,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6446,7 +6446,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6498,7 +6498,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6567,7 +6567,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6625,7 +6625,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f, cb, il);
cb(cur, "kqv_out", il);
}
@ -6689,7 +6689,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6728,7 +6728,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * sa_out = cur;
@ -6827,7 +6827,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6894,7 +6894,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6936,7 +6936,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -7002,7 +7002,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -7054,7 +7054,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -7121,7 +7121,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -7172,8 +7172,8 @@ struct llm_build_context {
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -7253,7 +7253,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -7304,8 +7304,8 @@ struct llm_build_context {
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -7549,13 +7549,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
}
if (kv_self.has_shift) {
const int64_t n_ctx = cparams.n_ctx;
const int64_t kv_size = cparams.kv_size;
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
for (int i = 0; i < n_ctx; ++i) {
for (int i = 0; i < kv_size; ++i) {
data[i] = lctx.kv_self.cells[i].delta;
}
}
@ -7694,7 +7694,7 @@ static int llama_decode_internal(
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
kv_self.n = std::min((int32_t) cparams.kv_size, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
@ -11148,7 +11148,7 @@ struct llama_model_params llama_model_default_params() {
struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.seed =*/ LLAMA_DEFAULT_SEED,
/*.n_ctx =*/ 512,
/*.kv_size =*/ 512,
/*.n_batch =*/ 512,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
@ -11328,7 +11328,7 @@ struct llama_context * llama_new_context_with_model(
cparams.offload_kqv = params.offload_kqv;
cparams.do_pooling = params.do_pooling;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.kv_size = params.kv_size == 0 ? hparams.n_ctx_train : params.kv_size;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
@ -11356,7 +11356,7 @@ struct llama_context * llama_new_context_with_model(
params.seed = time(NULL);
}
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
LLAMA_LOG_INFO("%s: kv_size = %u\n", __func__, cparams.kv_size);
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
@ -11447,7 +11447,7 @@ struct llama_context * llama_new_context_with_model(
ctx->backends.push_back(ctx->backend_cpu);
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
cparams.n_ctx, cparams.offload_kqv)) {
cparams.kv_size, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);
return nullptr;
@ -11490,9 +11490,9 @@ struct llama_context * llama_new_context_with_model(
ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.kv_size, cparams.n_batch);
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.kv_size);
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.kv_size);
ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
@ -11531,8 +11531,8 @@ struct llama_context * llama_new_context_with_model(
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
// build worst-case graph
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
int n_past = cparams.n_ctx - n_tokens;
int n_tokens = (int)std::min(cparams.kv_size, cparams.n_batch);
int n_past = cparams.kv_size - n_tokens;
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
@ -11565,7 +11565,7 @@ struct llama_context * llama_new_context_with_model(
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
// TODO: needs fix after #3228
GGML_ASSERT(false && "not implemented");
//const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
//const std::vector<llama_token> tmp(ctx->model.hparams.kv_size, llama_token_bos(ctx));
//while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
llama_backend_free();
exit(1);
@ -11583,8 +11583,8 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
return &ctx->model;
}
uint32_t llama_n_ctx(const struct llama_context * ctx) {
return ctx->cparams.n_ctx;
uint32_t llama_kv_size(const struct llama_context * ctx) {
return ctx->cparams.kv_size;
}
uint32_t llama_n_batch(const struct llama_context * ctx) {
@ -11982,7 +11982,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
const auto n_layer = hparams.n_layer;
const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
const auto n_ctx = cparams.n_ctx;
const auto n_kv_req = cparams.kv_size;
const size_t kv_buf_size = kv_self.total_size();
const uint32_t kv_head = kv_self.head;
@ -12006,7 +12006,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
// v is not contiguous, copy row by row
tmp_buf.resize(elt_size*kv_head);
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_kv_req, tmp_buf.size());
data_ctx->write(tmp_buf.data(), tmp_buf.size());
}
}
@ -12093,7 +12093,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
const int n_layer = hparams.n_layer;
const int n_embd_k_gqa = hparams.n_embd_k_gqa();
const int n_embd_v_gqa = hparams.n_embd_v_gqa();
const int n_ctx = cparams.n_ctx;
const int n_kv_req = cparams.kv_size;
size_t kv_buf_size;
uint32_t kv_head;
@ -12118,7 +12118,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
// v is not contiguous, copy row by row
size_t v_row_size = elt_size*kv_head;
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_kv_req, v_row_size);
inp += v_row_size;
}
}

View File

@ -217,7 +217,7 @@ extern "C" {
struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context, 0 = from model
uint32_t kv_size; // KV Cache size
uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
@ -347,7 +347,7 @@ extern "C" {
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
LLAMA_API uint32_t llama_kv_size (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
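A minimal C++ usage sketch for the renamed parameter and accessor, not part of this diff and assuming an already loaded llama_model:

#include "llama.h"
#include <cstdio>

// Create a context whose KV cache holds kv_size tokens and read the value
// back through the renamed accessor.
static llama_context * make_context(llama_model * model, uint32_t kv_size) {
    llama_context_params cparams = llama_context_default_params();
    cparams.kv_size = kv_size;                        // previously cparams.n_ctx
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx != NULL) {
        printf("kv_size = %u\n", llama_kv_size(ctx)); // previously llama_n_ctx(ctx)
    }
    return ctx;
}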

View File

@ -8,7 +8,7 @@ import sys
import yaml
CLI_ARGS_MAIN_PERPLEXITY = [
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "kv-size", "escape",
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
@ -27,7 +27,7 @@ CLI_ARGS_LLAMA_BENCH = [
]
CLI_ARGS_SERVER = [
"alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
"alias", "batch-size", "kv-size", "embedding", "host", "memory-f32", "lora", "lora-base",
"low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
"numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
"threads", "verbose"

View File

@ -1121,21 +1121,21 @@ struct test_rope : public test_case {
const std::array<int64_t, 4> ne;
int n_dims;
int mode;
int n_ctx;
int kv_size;
std::string vars() override {
return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx);
return VARS_TO_STR5(type, ne, n_dims, mode, kv_size);
}
test_rope(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 1},
int n_dims = 10, int mode = 0, int n_ctx = 512)
: type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {}
: type(type), ne(ne), n_dims(n_dims), mode(mode), kv_size(n_ctx) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, kv_size);
return out;
}
@ -1145,7 +1145,7 @@ struct test_rope : public test_case {
// pos
std::vector<int> data(ne[2]);
for (int i = 0; i < ne[2]; i++) {
data[i] = rand() % n_ctx;
data[i] = rand() % kv_size;
}
ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
} else {
@ -1545,7 +1545,7 @@ struct llama_hparams {
int32_t n_tokens;
// llm_build_context
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= kv_size
static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads