mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-27 03:44:35 +00:00)

rename n_ctx to kv_size
parent ef96e8b1f7
commit 606873401c
@@ -186,7 +186,7 @@ llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_ctx = 512
llm_load_print_meta: kv_size = 512
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 40

@@ -214,7 +214,7 @@ llama_new_context_with_model: compute buffer total size = 75.41 MB

system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
generate: kv_size = 512, n_batch = 512, n_predict = 400, n_keep = 0

Building a website can be done in 10 simple steps:
@@ -258,11 +258,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
sparams.top_k = std::stoi(argv[i]);
} else if (arg == "-c" || arg == "--ctx-size") {
if (++i >= argc)
{
invalid_param = true;
break;
}
params.kv_size = std::stoi(argv[i]);
fprintf(stderr, "warning: -c,--ctx-size option is deprecated, use --kv-size instead");
} else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
params.kv_size = std::stoi(argv[i]);
} else if (arg == "--grp-attn-n" || arg == "-gan") {
if (++i >= argc) {
invalid_param = true;
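For illustration, a minimal sketch of how the two spellings behave after this change (the model path here is hypothetical):

```sh
# old spelling: still accepted, but now prints a deprecation warning
./main -m models/7B/ggml-model.bin -c 2048 -p "Hello"

# new spelling: sets the KV cache size directly
./main -m models/7B/ggml-model.bin --kv-size 2048 -p "Hello"
```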
@@ -962,7 +970,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -bf FNAME, --binary-file FNAME\n");
printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
printf(" (default: %s)\n", sampler_type_names.c_str());

@@ -972,7 +980,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = kv_size)\n", sparams.penalty_last_n);
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);

@@ -1269,7 +1277,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto cparams = llama_context_default_params();

cparams.n_ctx = params.n_ctx;
cparams.kv_size = params.kv_size;
cparams.n_batch = params.n_batch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

@@ -1658,7 +1666,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "kv_size: %d # default: 512\n", params.kv_size);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
@@ -50,7 +50,7 @@ struct gpt_params {
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t kv_size = 512; // KV Cache size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 8; // number of tokens to draft during speculative decoding
@@ -7,11 +7,11 @@ USER_NAME="${USER_NAME:-Anon}"

# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
KV_SIZE="${KV_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}"

GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--kv_size "$KV_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647

@@ -10,7 +10,7 @@ cd ..
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
@@ -532,16 +532,16 @@ static struct ggml_tensor * forward(
// Vcur shape [n_embd, N, 1, 1]
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.v shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// kv_self.v shape [n_embd * kv_size * n_layer, 1]
// k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
// v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));

@@ -560,7 +560,7 @@ static struct ggml_tensor * forward(
Qcur,
0, 2, 1, 3);

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// K shape [n_embd/n_head, n_past + N, n_head, 1]
struct ggml_tensor * K =
ggml_permute(ctx0,

@@ -780,16 +780,16 @@ static struct ggml_tensor * forward_batch(

assert_shape_3d(Vcur, N, n_embd, n_batch);

// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
// kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
// kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
// kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
// k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il]
// v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]

/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));

@@ -817,7 +817,7 @@ static struct ggml_tensor * forward_batch(
0, 2, 1, 3);
assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);

// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
// kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
// K shape [n_embd/n_head, n_past + N, n_head, n_batch]
struct ggml_tensor * K =
ggml_permute(ctx0,

@@ -855,7 +855,7 @@ static struct ggml_tensor * forward_batch(
assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);

// split cached V into n_head heads
// kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
// kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
// V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
struct ggml_tensor * V =
ggml_view_4d(ctx0, vc,
@@ -1082,16 +1082,16 @@ static struct ggml_tensor * forward_lora(
cur)),
n_embd, N)));

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.v shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// kv_self.v shape [n_embd * kv_size * n_layer, 1]
// k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
// v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]

/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));

@@ -1110,7 +1110,7 @@ static struct ggml_tensor * forward_lora(
Qcur,
0, 2, 1, 3);

// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// K shape [n_embd/n_head, n_past + N, n_head, 1]
struct ggml_tensor * K =
ggml_permute(ctx0,

@@ -1470,7 +1470,7 @@ int main(int argc, char ** argv) {
/*
struct llama_model_lora model_lora;
// model.hparams.n_vocab = 6;
// model.hparams.n_ctx = 64;
// model.hparams.kv_size = 64;
// model.hparams.n_embd = 128;
// model.hparams.n_mult = 2;
// model.hparams.n_head = 8;

@@ -1478,7 +1478,7 @@ int main(int argc, char ** argv) {
// model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head;

model_lora.hparams.n_vocab = 16;
model_lora.hparams.n_ctx = 32;
model_lora.hparams.kv_size = 32;
model_lora.hparams.n_embd = 256;
model_lora.hparams.n_mult = 2;
model_lora.hparams.n_head = 16;
@@ -104,7 +104,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();

ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.kv_size = n_kv_max;
ctx_params.n_batch = 512;
ctx_params.mul_mat_q = mmq;
@@ -38,7 +38,7 @@ let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_par

var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req
context_params.kv_size = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8

@@ -53,12 +53,12 @@ defer {
llama_free(context)
}

let n_ctx = llama_n_ctx(context)
let kv_size = llama_kv_size(context)

print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
print("\nn_len = \(n_len), kv_size = \(kv_size), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")

if n_kv_req > n_ctx {
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
if n_kv_req > kv_size {
print("error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", n_kv_req)
exit(1)
}
@@ -7,7 +7,7 @@ The example demonstrates batched generation from a given prompt

...

main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
main: n_len = 32, kv_size = 2048, n_parallel = 4, n_kv_req = 113

Hello my name is
@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();

ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.kv_size = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

@@ -90,14 +90,14 @@ int main(int argc, char ** argv) {
return 1;
}

const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);

LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, kv_size, ctx_params.n_batch, n_parallel, n_kv_req);

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
if (n_kv_req > kv_size) {
LOG_TEE("%s: error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase kv_size\n", __func__);
return 1;
}
@@ -139,8 +139,8 @@ int main(int argc, char ** argv)

std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
const size_t max_kv_size = llama_kv_size(ctx);
const size_t max_tokens_list_size = max_kv_size - 4 ;

if (tokens_list.size() > max_tokens_list_size)
{
@@ -128,20 +128,20 @@ int main(int argc, char ** argv) {
// TODO: perform the bench for all types or for a user specified type
const ggml_type qtype = GGML_TYPE_Q4_1;

size_t ctx_size = 0;
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += 1024*1024*16;
size_t kv_size = 0;
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizez);
kv_size += ggml_row_size(qtype, sizex * sizey);
kv_size += ggml_row_size(qtype, sizex * sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); // BLAS
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); // BLAS
kv_size += 1024 * 1024 * 16;

printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
printf("Allocating Memory of size %zi bytes, %zi MB\n", kv_size, (kv_size / 1024 / 1024));

struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_size =*/ kv_size,
/*.mem_buffer =*/ NULL,
/* no_alloc =*/ 0
};
@@ -15,7 +15,7 @@ rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"

rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
@@ -15,8 +15,8 @@ N_THREAD="${N_THREAD:-8}"
N_PREDICTS="${N_PREDICTS:-2048}"

# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
# For example, override the context size by doing: ./chatLLaMa --kv_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"

DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
@@ -27,9 +27,9 @@ SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"

CTX_SIZE=2048
CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
KV_SIZE=2048
KV_ROTATE_POINT=$((KV_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --kv_size "$KV_SIZE" --repeat_last_n 256 "$@")

# An unbuffered `tail -c+N`
skip_bytes() {

@@ -84,7 +84,7 @@ n_tokens=0

while read -e line; do
# Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
n_predict=$((KV_SIZE - n_tokens - ${#line} / 2 - 32))

# Swap prompts when we're about to run out of context
if ((n_predict <= 0)); then

@@ -97,11 +97,11 @@ while read -e line; do
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"

n_tokens=0
n_predict=$((CTX_SIZE / 2))
n_predict=$((KV_SIZE / 2))
fi

echo " ${line}" >>"$CUR_PROMPT_FILE"
if ((n_tokens > CTX_ROTATE_POINT)); then
if ((n_tokens > KV_ROTATE_POINT)); then
echo " ${line}" >>"$NEXT_PROMPT_FILE"
fi

@@ -139,7 +139,7 @@ while read -e line; do

n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))

if ((n_tokens > CTX_ROTATE_POINT)); then
if ((n_tokens > KV_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
fi
@@ -15,8 +15,8 @@ N_THREAD="${N_THREAD:-8}"
N_PREDICTS="${N_PREDICTS:-2048}"

# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
# For example, override the context size by doing: ./chatLLaMa --kv_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"

DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
@@ -226,7 +226,7 @@ struct llama_vocab {

struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t kv_size = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4;

@@ -326,7 +326,7 @@ struct train_params {

static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: kv_size: %u\n", __func__, params->kv_size);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_mult: %u\n", __func__, params->n_mult);
printf("%s: n_head: %u\n", __func__, params->n_head);

@@ -732,7 +732,7 @@ static void save_as_llama_model(
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);

gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.kv_size);
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);

@@ -937,7 +937,7 @@ int main(int argc, char ** argv) {

struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.kv_size = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;
@@ -88,11 +88,11 @@ int main(int argc, char ** argv) {
}

const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);

if (n_ctx > n_ctx_train) {
if (kv_size > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, kv_size);
}

// print system information

@@ -106,7 +106,7 @@ int main(int argc, char ** argv) {

// max batch size
const uint64_t n_batch = params.n_batch;
GGML_ASSERT(params.n_batch == params.n_ctx);
GGML_ASSERT(params.n_batch == params.kv_size);

// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
@@ -16,7 +16,7 @@

struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512;
uint32_t kv_size = 512;
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_head = 32;

@@ -190,7 +190,7 @@ static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";

static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab : %u\n", __func__, params->n_vocab);
printf("%s: n_ctx : %u\n", __func__, params->n_ctx);
printf("%s: kv_size : %u\n", __func__, params->kv_size);
printf("%s: n_embd : %u\n", __func__, params->n_embd);
printf("%s: n_ff : %u\n", __func__, params->n_ff);
printf("%s: n_head : %u\n", __func__, params->n_head);

@@ -250,7 +250,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
};

GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
GGUF_GET_KEY(ctx, hparams->kv_size, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));

@@ -268,7 +268,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
}
}

static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) {
static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t kv_size) {
auto & hparams = model->hparams;

std::vector<char> tn_buf;

@@ -298,7 +298,7 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
gguf_free(mctx);
}
hparams.n_vocab = llama_n_vocab(input);
hparams.n_ctx = n_ctx;
hparams.kv_size = kv_size;

// get tensors from llama_model (possibly mmapped)
model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD));

@@ -529,7 +529,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int n_past = 0;
const int N = n_tokens;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int kv_size = hparams.kv_size;
const int n_vocab = hparams.n_vocab;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;

@@ -558,13 +558,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
ggml_set_input(KQ_pos);

// rope has so much parameters that we make a custom function for it
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
auto rope = [ctx, KQ_pos, n_rot, kv_size, rope_freq_base, rope_freq_scale]
(struct ggml_tensor * t) -> struct ggml_tensor * {
// not capturing these, to silcence warnings
const int rope_mode = 0;

return ggml_rope_custom(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
t, KQ_pos, n_rot, rope_mode, kv_size, 0,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};

@@ -848,7 +848,7 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);

gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx);
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.kv_size);
gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd);
gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff);
gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head);

@@ -1554,9 +1554,9 @@ int main(int argc, char ** argv) {
bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);

if (existed) {
// overwrite last n_ctx with user provided n_ctx
// overwrite last kv_size with user provided kv_size
if (params.common.custom_n_ctx) {
model.hparams.n_ctx = params.common.n_ctx;
model.hparams.kv_size = params.common.n_ctx;
}

const bool opt_param_count_changed = (

@@ -1625,7 +1625,7 @@ int main(int argc, char ** argv) {
printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
printf("%s: opt iter %d\n", __func__, opt->iter);

int n_tokens = model.hparams.n_ctx;
int n_tokens = model.hparams.kv_size;
int n_vocab = model.hparams.n_vocab;
int n_batch = params.common.n_batch;
@@ -10,6 +10,6 @@ cd ..
./main --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 -n -1 \
--batch_size 8 --kv_size 2048 -n -1 \
--repeat_last_n 64 --repeat_penalty 1.3 \
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
@@ -325,7 +325,7 @@ static void process_logits(
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {

const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);

auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

@@ -336,17 +336,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

if (from_chunk > 0) {
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
if (size_t((from_chunk + 2)*kv_size) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
return false;
}
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk * kv_size);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk * kv_size);
}

if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
n_ctx);
if (int(tokens.size()) < 2*kv_size) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2 * kv_size,
kv_size);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return false;
}

@@ -359,7 +359,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
prob_history.resize(tokens.size());
}

const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk_max = tokens.size() / kv_size;

const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));

@@ -373,16 +373,16 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool

std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

const int num_batches = (n_ctx + n_batch - 1) / n_batch;
const int num_batches = (kv_size + n_batch - 1) / n_batch;

std::vector<float> logits;
if (compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
logits.reserve((size_t)kv_size * n_vocab);
}

for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int start = i * kv_size;
const int end = start + kv_size;

std::vector<float> logits;

@@ -431,11 +431,11 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}

if (compute_ppl) {
const int first = n_ctx/2;
const int first = kv_size / 2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
count += kv_size - first - 1;

printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);

@@ -553,7 +553,7 @@ int main(int argc, char ** argv) {
}

params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
params.n_batch = std::min(params.n_batch, params.kv_size);

print_build_info();

@@ -593,9 +593,9 @@ int main(int argc, char ** argv) {
}

const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
if (params.kv_size > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
__func__, n_ctx_train, params.kv_size);
}

// print system information
@@ -14,7 +14,8 @@ In this section, we cover the most commonly used options for running the `infill`

- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead.
- `-kv N`, `--kv-size N`: Specify the total size of the KV cache for the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
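As a usage sketch (the model path and fill-in strings below are placeholders), the renamed option is passed like any other flag:

```sh
./infill -m models/7B/ggml-model.bin --kv-size 2048 \
    --in-prefix 'def add(a, b):' --in-suffix '    return result'
```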
## Input Prompts
@@ -135,9 +135,9 @@ int main(int argc, char ** argv) {
return 0;
}

if (params.n_ctx != 0 && params.n_ctx < 8) {
if (params.kv_size != 0 && params.kv_size < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
params.kv_size = 8;
}
if (params.instruct) {
printf("\n************\n");
@@ -225,12 +225,12 @@ int main(int argc, char ** argv) {
}

const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
const int kv_size = llama_kv_size(ctx);
LOG("kv_size: %d\n", kv_size);

if (n_ctx > n_ctx_train) {
if (kv_size > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, kv_size);
}

// print system information

@@ -291,8 +291,8 @@ int main(int argc, char ** argv) {
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}

if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
if ((int) embd_inp.size() > kv_size - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), kv_size - 4);
return 1;
}

@@ -366,7 +366,7 @@ int main(int argc, char ** argv) {
}
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("generate: kv_size = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", kv_size, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");

LOG_TEE("\n##### Infill mode #####\n\n");

@@ -416,9 +416,9 @@ int main(int argc, char ** argv) {
while (n_remain != 0 || params.interactive) {
// predict
if (!embd.empty()) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// Note: kv_size - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
int max_embd_size = kv_size - 4;

// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int) embd.size() > max_embd_size) {

@@ -434,8 +434,8 @@ int main(int argc, char ** argv) {
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
// - take half of the last (kv_size - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > kv_size) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;

@@ -444,8 +444,8 @@ int main(int argc, char ** argv) {
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;

LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
LOG("context full, swapping: n_past = %d, n_left = %d, kv_size = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, kv_size, params.n_keep, n_discard);

llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
@@ -514,7 +514,7 @@ struct cmd_params_instance {
llama_context_params to_llama_cparams() const {
llama_context_params cparams = llama_context_default_params();

cparams.n_ctx = n_prompt + n_gen;
cparams.kv_size = n_prompt + n_gen;
cparams.n_batch = n_batch;
cparams.type_k = type_k;
cparams.type_v = type_v;
@@ -68,8 +68,8 @@ actor LlamaContext {
print("Using \(n_threads) threads")

var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
ctx_params.seed = 1234
ctx_params.kv_size = 2048
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)

@@ -112,13 +112,13 @@ actor LlamaContext {
tokens_list = tokenize(text: text, add_bos: true)
temporary_invalid_cchars = []

let n_ctx = llama_n_ctx(context)
let kv_size = llama_kv_size(context)
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)

print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
print("\n n_len = \(n_len), kv_size = \(kv_size), n_kv_req = \(n_kv_req)")

if n_kv_req > n_ctx {
print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
if n_kv_req > kv_size {
print("error: n_kv_req > kv_size, the required KV cache size is not big enough")
}

for id in tokens_list {
@@ -9,7 +9,7 @@ cd ..

./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \

@@ -9,7 +9,7 @@ cd ..

./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
@@ -230,7 +230,7 @@ static struct llava_context * llava_init(gpt_params * params) {
}

llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
ctx_params.kv_size = params->kv_size < 2048 ? 2048 : params->kv_size; // we need a longer context size to process image embeddings

llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
@@ -103,15 +103,15 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
const size_t num_images = num_patches_width * num_patches_height + 1;

// TODO: size calculation is not calculated - it's only tens of MB
size_t ctx_size = 0;
size_t kv_size = 0;

{
ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
kv_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
kv_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
}

struct ggml_init_params params {
/*.mem_size =*/ ctx_size,
/*.mem_size =*/ kv_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API
};
@@ -73,8 +73,8 @@ int main(int argc, char ** argv) {
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
all = inp;

const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
const int max_kv_size = llama_kv_size(ctx);
const int max_tokens_list_size = max_kv_size - 4;

if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);

@@ -117,7 +117,7 @@ int main(int argc, char ** argv) {
// seq_id == 0 : the current input token
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
// seq_id [W + 1, W + G] : verification n-grams
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
llama_batch batch = llama_batch_init(params.kv_size, 0, W + G + 1);

// target model sampling context
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
@@ -47,8 +47,8 @@ int main(int argc, char ** argv){
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);

const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
const int max_kv_size = llama_kv_size(ctx);
const int max_tokens_list_size = max_kv_size - 4;

if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);

@@ -86,7 +86,7 @@ int main(int argc, char ** argv){

std::vector<llama_token> draft;

llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
llama_batch batch_tgt = llama_batch_init(params.kv_size, 0, 1);

// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
@@ -70,7 +70,8 @@ In this section, we cover the most commonly used options for running the `main`

- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead.
- `-kv N`, `--kv-size N`: Set the size of the KV cache for the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.

## Input Prompts

@@ -134,15 +135,15 @@ By understanding and utilizing these interaction options, you can create engagin

During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.

### Context Size
### KV Context Size

The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
The `--kv-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.

- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
- `-c N, --kv-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
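A minimal invocation using this option might look like the following (the model path is hypothetical; the prompt is taken from the example output earlier in this change):

```sh
./main -m models/13B/ggml-model.bin --kv-size 2048 -n 400 \
    -p "Building a website can be done in 10 simple steps:"
```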
### Extended Context Size

Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--kv-size` to 32768 (32k) and `--rope-scale` to 8.

- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
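For the 4k-to-32k case described above, that would look roughly like this (the model path and prompt file are hypothetical):

```sh
./main -m models/7B/ggml-model-32k.bin --kv-size 32768 --rope-scale 8 -f long-prompt.txt
```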
@@ -152,7 +153,7 @@ The `--keep` option allows users to retain the original prompt when the model ru

- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.

By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
By utilizing context management options like `--kv-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
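A sketch combining the two options (the model path is hypothetical; the prompt file is the one used by the scripts in this change):

```sh
# keep the entire initial prompt when the KV cache fills up and the context is shifted
./main -m models/7B/ggml-model.bin --kv-size 2048 --keep -1 -f ./prompts/alpaca.txt
```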
## Generation Flags

@@ -181,12 +182,12 @@ Example usage: `--temp 0.5`
### Repeat Penalty

- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = kv-size).
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.

The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.

The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`kv-size`).

Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
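As an example, the chat scripts touched by this change combine these flags roughly as follows (the model path is hypothetical; the penalty values come from those scripts):

```sh
./main -m models/7B/ggml-model.bin --kv-size 2048 \
    --repeat-penalty 1.17647 --repeat-last-n 256 --no-penalize-nl -p "Hello"
```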
@ -157,9 +157,9 @@ int main(int argc, char ** argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.n_ctx != 0 && params.n_ctx < 8) {
|
||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
if (params.kv_size != 0 && params.kv_size < 8) {
|
||||
LOG_TEE("%s: warning: minimum KV size is 8, using minimum size.\n", __func__);
|
||||
params.kv_size = 8;
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 0.0) {
|
||||
@ -208,12 +208,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG("n_ctx: %d\n", n_ctx);
|
||||
const int kv_size = llama_kv_size(ctx);
|
||||
LOG("kv_size: %d\n", kv_size);
|
||||
|
||||
if (n_ctx > n_ctx_train) {
|
||||
if (kv_size > n_ctx_train) {
|
||||
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, n_ctx);
|
||||
__func__, n_ctx_train, kv_size);
|
||||
}
|
||||
|
||||
// print system information
|
||||
@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
|
||||
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
|
||||
} else {
|
||||
// The file exists and is not empty
|
||||
session_tokens.resize(n_ctx);
|
||||
session_tokens.resize(kv_size);
|
||||
size_t n_token_count_out = 0;
|
||||
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||
@ -289,8 +289,8 @@ int main(int argc, char ** argv) {
|
||||
LOG("guidance_offset: %s", log_tostr(guidance_offset));
|
||||
}
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
if ((int) embd_inp.size() > kv_size - 4) {
|
||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), kv_size - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -450,7 +450,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
|
||||
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("generate: kv_size = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", kv_size, params.n_batch, params.n_predict, params.n_keep);
|
||||
|
||||
// group-attention state
|
||||
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
|
||||
@ -463,7 +463,7 @@ int main(int argc, char ** argv) {
|
||||
GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
|
||||
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
|
||||
//GGML_ASSERT(kv_size >= n_ctx_train * ga_n && "kv_size must be at least n_ctx_train * grp_attn_n"); // NOLINT
|
||||
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
|
||||
}
|
||||
LOG_TEE("\n\n");
|
||||
@@ -514,9 +514,9 @@ int main(int argc, char ** argv) {
    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
        if (!embd.empty()) {
            // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
            // Note: (kv_size - 4) here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
            int max_embd_size = n_ctx - 4;
            int max_embd_size = kv_size - 4;

            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int) embd.size() > max_embd_size) {
@@ -533,8 +533,8 @@ int main(int argc, char ** argv) {
            // infinite text generation via context shifting
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
            // - take half of the last (kv_size - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > kv_size) {
                if (params.n_predict == -2) {
                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                    break;
@@ -543,8 +543,8 @@ int main(int argc, char ** argv) {
                const int n_left    = n_past - params.n_keep - 1;
                const int n_discard = n_left/2;

                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);
                LOG("context full, swapping: n_past = %d, n_left = %d, kv_size = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, kv_size, params.n_keep, n_discard);

                llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
                llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
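
For reference, the arithmetic of the hunk above in one place: when `n_past` would overflow the KV size, the first `n_keep` tokens are preserved, half of the remaining cells are dropped, and the survivors are slid back so generation can continue. A minimal sketch, assuming `ctx`, `n_past` and `n_keep` come from the surrounding example:

```cpp
// Sketch only: same shift as performed by the hunk above.
static int shift_context(llama_context * ctx, int n_past, int n_keep) {
    const int n_left    = n_past - n_keep - 1; // tokens past the protected prefix
    const int n_discard = n_left / 2;          // drop half of them

    // remove the discarded cells ...
    llama_kv_cache_seq_rm   (ctx, 0, n_keep + 1, n_keep + n_discard + 1);
    // ... and slide the remaining cells back over the freed range
    llama_kv_cache_seq_shift(ctx, 0, n_keep + 1 + n_discard, n_past, -n_discard);

    return n_past - n_discard; // new n_past after the shift
}
```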
@@ -666,7 +666,7 @@ int main(int argc, char ** argv) {
            LOG("n_past = %d\n", n_past);
            // Display total tokens alongside total time
            if (params.n_print > 0 && n_past % params.n_print == 0) {
                LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
                LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, kv_size);
            }
        }

@@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "\n\n");
    fflush(stderr);

    const int n_ctx = llama_n_ctx(ctx);
    const int kv_size = llama_kv_size(ctx);

    std::vector<client> clients(n_clients);
    for (size_t i = 0; i < clients.size(); ++i) {
@@ -169,7 +169,7 @@ int main(int argc, char ** argv) {

    // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
    // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
    llama_batch batch = llama_batch_init(n_ctx, 0, 1);
    llama_batch batch = llama_batch_init(kv_size, 0, 1);

    int32_t n_total_prompt = 0;
    int32_t n_total_gen    = 0;

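The comment above states the contract: the batch may be allocated with as many slots as the whole KV cache, but decoding always proceeds in slices of at most `params.n_batch` tokens. A hedged sketch of that loop; `decode_slice` is a hypothetical helper standing in for the batch-view construction the real example performs before calling `llama_decode`:

```cpp
#include <algorithm>
#include <cstdint>

// hypothetical helper: decode tokens [i, i + n_cur) of `batch` (placeholder, not a real API)
void decode_slice(llama_context * ctx, const llama_batch & batch, int32_t i, int32_t n_cur);

// Sketch only: walk a (possibly kv_size-sized) batch in n_batch-token slices.
void decode_in_chunks(llama_context * ctx, const llama_batch & batch, int32_t n_batch) {
    for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
        const int32_t n_cur = std::min(n_batch, batch.n_tokens - i);
        decode_slice(ctx, batch, i, n_cur);
    }
}
```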
@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
ctx_params.seed = seed;
|
||||
ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
|
||||
ctx_params.kv_size = llama_n_ctx_train(model)*n_grp + n_keep;
|
||||
ctx_params.n_batch = 512;
|
||||
ctx_params.n_threads = params.n_threads;
|
||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
@ -121,12 +121,12 @@ int main(int argc, char ** argv) {
|
||||
// total length of the sequences including the prompt
|
||||
const int n_len = n_tokens_all + n_predict;
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx) - n_keep;
|
||||
const int n_kv_req = llama_n_ctx(ctx);
|
||||
const int kv_size = llama_kv_size(ctx) - n_keep;
|
||||
const int n_kv_req = llama_kv_size(ctx);
|
||||
const int n_batch = ctx_params.n_batch;
|
||||
const int n_batch_grp = ctx_params.n_batch/n_grp;
|
||||
|
||||
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
|
||||
LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, kv_size, n_kv_req, n_grp, n_batch);
|
||||
|
||||
// print the prompt token-by-token
|
||||
|
||||
@ -140,7 +140,7 @@ int main(int argc, char ** argv) {
|
||||
int n_past = 0;
|
||||
|
||||
// fill the KV cache
|
||||
for (int i = 0; i < n_ctx; i += n_batch) {
|
||||
for (int i = 0; i < kv_size; i += n_batch) {
|
||||
if (i > 0 && n_grp > 1) {
|
||||
// if SelfExtend is enabled, we compress the position from the last batch by a factor of n_grp
|
||||
const int ib = i/n_batch - 1;
|
||||
@ -174,13 +174,13 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
|
||||
for (int i = kv_size; i < n_tokens_all; i += n_batch) {
|
||||
const int n_discard = n_batch;
|
||||
|
||||
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, kv_size, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
|
||||
@ -203,13 +203,13 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
{
|
||||
const int n_discard = n_past - n_ctx + n_predict;
|
||||
const int n_discard = n_past - kv_size + n_predict;
|
||||
|
||||
if (n_discard > 0) {
|
||||
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
|
||||
|
||||
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
|
||||
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, kv_size, -n_discard);
|
||||
|
||||
n_past -= n_discard;
|
||||
}
|
||||
|
@ -320,11 +320,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int kv_size = llama_kv_size(ctx);
|
||||
|
||||
if (int(tokens.size()) < 2*n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
|
||||
n_ctx);
|
||||
if (int(tokens.size()) < 2*kv_size) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n", __func__, 2 * kv_size,
|
||||
kv_size);
|
||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
return {std::move(tokens), 0., {}, {}};
|
||||
}
|
||||
@ -340,13 +340,13 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
|
||||
const int calc_chunk = n_ctx;
|
||||
const int calc_chunk = kv_size;
|
||||
|
||||
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
|
||||
|
||||
if (int(tokens.size()) <= calc_chunk) {
|
||||
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
|
||||
tokens.size(), n_ctx, params.ppl_stride);
|
||||
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n", __func__,
|
||||
tokens.size(), kv_size, params.ppl_stride);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
|
||||
@ -414,8 +414,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
||||
}
|
||||
|
||||
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
|
||||
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
|
||||
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.kv_size - params.ppl_stride + start, params.kv_size + start);
|
||||
for (int j = kv_size - params.ppl_stride - 1; j < kv_size - 1; ++j) {
|
||||
|
||||
// Calculate probability of next token, given the previous ones.
|
||||
const std::vector<float> tok_logits(
|
||||
@ -453,7 +453,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int kv_size = llama_kv_size(ctx);
|
||||
|
||||
std::ofstream logits_stream;
|
||||
if (!params.logits_file.empty()) {
|
||||
@ -464,7 +464,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
}
|
||||
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
|
||||
logits_stream.write("_logits_", 8);
|
||||
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
|
||||
logits_stream.write(reinterpret_cast<const char *>(&kv_size), sizeof(kv_size));
|
||||
}
|
||||
|
||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||
@ -475,9 +475,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
|
||||
if (int(tokens.size()) < 2*n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
|
||||
n_ctx);
|
||||
if (int(tokens.size()) < 2*kv_size) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n", __func__, 2 * kv_size,
|
||||
kv_size);
|
||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
return {std::move(tokens), 0., {}, {}};
|
||||
}
|
||||
@ -488,7 +488,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
std::vector<float> prob_history;
|
||||
prob_history.resize(tokens.size());
|
||||
|
||||
const int n_chunk_max = tokens.size() / n_ctx;
|
||||
const int n_chunk_max = tokens.size() / kv_size;
|
||||
|
||||
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
@ -498,11 +498,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
double nll = 0.0;
|
||||
double nll2 = 0.0;
|
||||
|
||||
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
|
||||
const int num_batches = (kv_size + n_batch - 1) / n_batch;
|
||||
|
||||
std::vector<float> logits;
|
||||
if (num_batches > 1) {
|
||||
logits.reserve((size_t)n_ctx * n_vocab);
|
||||
logits.reserve((size_t)kv_size * n_vocab);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||
@ -513,14 +513,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
if (!params.logits_file.empty()) {
|
||||
logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
|
||||
logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
|
||||
logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
|
||||
logits_stream.write((const char *)tokens.data(), n_chunk * kv_size * sizeof(tokens[0]));
|
||||
const int nv = 2*((n_vocab + 1)/2) + 4;
|
||||
log_probs.resize(n_ctx * nv);
|
||||
log_probs.resize(kv_size * nv);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_chunk; ++i) {
|
||||
const int start = i * n_ctx;
|
||||
const int end = start + n_ctx;
|
||||
const int start = i * kv_size;
|
||||
const int end = start + kv_size;
|
||||
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
@@ -566,7 +566,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

        // We get the logits for all the tokens in the context window (params.n_ctx)
        // We get the logits for all the tokens in the context window (params.kv_size)
        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
        // calculate the perplexity over the last half of the window (so the model always has
        // some context to predict the token).
@@ -578,16 +578,16 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
        const int first = n_ctx/2;
        const int first = kv_size/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
        if (!params.logits_file.empty()) {
            process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
            process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
                    workers, log_probs, nll, nll2);
        } else {
            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
        }
        count += n_ctx - first - 1;
        count += kv_size - first - 1;

        // perplexity is e^(average negative log-likelihood)
        if (params.ppl_output_type == 0) {
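
To make the window arithmetic above concrete: each chunk spans `kv_size` tokens and only the second half of the window is scored, so every scored token has at least `kv_size/2` tokens of context. A small sketch of the bookkeeping, assuming `kv_size = 512` as in the comment:

```cpp
#include <cstdio>

// Sketch only: which positions of a kv_size-token chunk contribute to perplexity.
int main() {
    const int kv_size = 512;         // example value from the comment above
    const int first   = kv_size / 2; // 256: first scored position in the chunk

    // positions [first, kv_size - 1) are scored; the final position has no following target token
    const int scored = kv_size - 1 - first; // 255 scored tokens per chunk, matching `count` above
    printf("scored positions per chunk: %d (positions %d..%d)\n", scored, first, kv_size - 2);
    return 0;
}
```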
@@ -596,7 +596,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
            double av = nll/count;
            double av2 = nll2/count - av*av;
            if (av2 > 0) av2 = sqrt(av2/(count-1));
            printf("%8d  %.4lf  %4lf  %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
            printf("%8d  %.4lf  %4lf  %4lf\n", i*kv_size, std::exp(nll / count), av, av2);
        }
        fflush(stdout);

@ -805,16 +805,16 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
double acc = 0.0f;
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int kv_size = llama_kv_size(ctx);
|
||||
const int n_batch = params.n_batch;
|
||||
|
||||
const int max_tasks_per_batch = 32;
|
||||
const int max_seq = 4*max_tasks_per_batch;
|
||||
|
||||
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
|
||||
llama_batch batch = llama_batch_init(kv_size, 0, max_seq);
|
||||
|
||||
std::vector<float> tok_logits(n_vocab);
|
||||
std::vector<float> batch_logits(n_vocab*n_ctx);
|
||||
std::vector<float> batch_logits(n_vocab*kv_size);
|
||||
|
||||
std::vector<std::pair<size_t, llama_token>> eval_pairs;
|
||||
std::vector<float> eval_results;
|
||||
@ -832,7 +832,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
// each task has 4 unique sequence ids - one for each ending
|
||||
// the common prefix is shared among the 4 sequences to save tokens
|
||||
// we extract logits only from the last common token and from all ending tokens of each sequence
|
||||
while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
|
||||
while (n_cur + (int) hs_data[i1].required_tokens <= kv_size) {
|
||||
auto & hs_cur = hs_data[i1];
|
||||
|
||||
const int s0 = 4*(i1 - i0);
|
||||
@ -1082,16 +1082,16 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int kv_size = llama_kv_size(ctx);
|
||||
const int n_batch = params.n_batch;
|
||||
|
||||
const int max_tasks_per_batch = 128;
|
||||
const int max_seq = 2*max_tasks_per_batch;
|
||||
|
||||
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
|
||||
llama_batch batch = llama_batch_init(kv_size, 0, max_seq);
|
||||
|
||||
std::vector<float> tok_logits(n_vocab);
|
||||
std::vector<float> batch_logits(n_vocab*n_ctx);
|
||||
std::vector<float> batch_logits(n_vocab*kv_size);
|
||||
|
||||
std::vector<std::pair<size_t, llama_token>> eval_pairs;
|
||||
std::vector<float> eval_results;
|
||||
@ -1108,7 +1108,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
llama_batch_clear(batch);
|
||||
|
||||
while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
|
||||
while (n_cur + (int) data[i1].required_tokens <= kv_size) {
|
||||
const int s0 = 2*(i1 - i0);
|
||||
if (s0 + 2 > max_seq) {
|
||||
break;
|
||||
@ -1434,16 +1434,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
printf("\ntask\tacc_norm\n");
|
||||
|
||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int kv_size = llama_kv_size(ctx);
|
||||
const int n_batch = params.n_batch;
|
||||
|
||||
const int max_tasks_per_batch = 32;
|
||||
const int max_seq = 4*max_tasks_per_batch;
|
||||
|
||||
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
|
||||
llama_batch batch = llama_batch_init(kv_size, 0, max_seq);
|
||||
|
||||
std::vector<float> tok_logits(n_vocab);
|
||||
std::vector<float> batch_logits(n_vocab*n_ctx);
|
||||
std::vector<float> batch_logits(n_vocab*kv_size);
|
||||
|
||||
std::vector<std::pair<size_t, llama_token>> eval_pairs;
|
||||
std::vector<float> eval_results;
|
||||
@ -1467,7 +1467,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
|
||||
// the common prefix is shared among the 4 sequences to save tokens
|
||||
// we extract logits only from the last common token and from all ending tokens of each sequence
|
||||
int s0 = 0;
|
||||
while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
|
||||
while (n_cur + (int) tasks[i1].required_tokens <= kv_size) {
|
||||
auto& cur_task = tasks[i1];
|
||||
|
||||
int num_answers = cur_task.seq_tokens.size();
|
||||
@ -1620,11 +1620,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t n_ctx;
|
||||
in.read((char *)&n_ctx, sizeof(n_ctx));
|
||||
if (n_ctx > llama_n_ctx(ctx)) {
|
||||
fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
|
||||
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
|
||||
uint32_t kv_size;
|
||||
in.read((char *)&kv_size, sizeof(kv_size));
|
||||
if (kv_size > llama_kv_size(ctx)) {
|
||||
fprintf(stderr, "%s: %s has been computed with %u, while the current KV Cache size is %d. Increase it with -kv and retry\n",
|
||||
__func__, params.logits_file.c_str(), kv_size, params.kv_size);
|
||||
}
|
||||
|
||||
int n_vocab, n_chunk;
|
||||
@ -1638,22 +1638,22 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokens(n_ctx * n_chunk);
|
||||
std::vector<llama_token> tokens(kv_size * n_chunk);
|
||||
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
|
||||
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
const int n_batch = params.n_batch;
|
||||
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
|
||||
const int num_batches = (kv_size + n_batch - 1)/n_batch;
|
||||
const int nv = 2*((n_vocab + 1)/2) + 4;
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
|
||||
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
|
||||
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
|
||||
std::vector<uint16_t> log_probs_uint16(size_t(kv_size - 1 - kv_size/2) * nv);
|
||||
std::vector<float> kld_values(size_t(kv_size - 1 - kv_size /2)*n_chunk);
|
||||
std::vector<float> logits;
|
||||
if (num_batches > 1) {
|
||||
logits.reserve(n_ctx * n_vocab);
|
||||
logits.reserve(kv_size * n_vocab);
|
||||
}
|
||||
|
||||
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||
@ -1672,8 +1672,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
auto kld_ptr = kld_values.data();
|
||||
|
||||
for (int i = 0; i < n_chunk; ++i) {
|
||||
const int start = i * n_ctx;
|
||||
const int end = start + n_ctx;
|
||||
const int start = i * kv_size;
|
||||
const int end = start + kv_size;
|
||||
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
@ -1726,11 +1726,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
|
||||
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n");
|
||||
}
|
||||
|
||||
const int first = n_ctx/2;
|
||||
const int first = kv_size/2;
|
||||
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
||||
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
|
||||
workers, log_probs_uint16, kld, kld_ptr);
|
||||
kld_ptr += n_ctx - 1 - first;
|
||||
kld_ptr += kv_size - 1 - first;
|
||||
|
||||
auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
|
||||
auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
|
||||
@ -1788,12 +1788,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
params.logits_all = true;
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
params.n_batch = std::min(params.n_batch, params.kv_size);
|
||||
|
||||
if (params.ppl_stride > 0) {
|
||||
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
|
||||
params.n_ctx, params.n_ctx + params.ppl_stride/2);
|
||||
params.n_ctx += params.ppl_stride/2;
|
||||
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting KV size from %d to %d\n",
|
||||
params.kv_size, params.kv_size + params.ppl_stride / 2);
|
||||
params.kv_size += params.ppl_stride/2;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
@ -1823,9 +1823,9 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
if (params.kv_size > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, params.n_ctx);
|
||||
__func__, n_ctx_train, params.kv_size);
|
||||
}
|
||||
|
||||
// print system information
|
||||
|
@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
auto cparams = llama_context_default_params();
|
||||
cparams.n_ctx = 256;
|
||||
cparams.kv_size = 256;
|
||||
cparams.seed = 1;
|
||||
|
||||
ctx = llama_new_context_with_model(model, cparams);
|
||||
|
@ -12,7 +12,7 @@ PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}
|
||||
N_THREAD="${N_THREAD:-12}"
|
||||
|
||||
# Note: you can also override the generation options by specifying them on the command line:
|
||||
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
|
||||
GEN_OPTIONS="${GEN_OPTIONS:---kv_size 4096 --batch-size 1024}"
|
||||
|
||||
|
||||
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
|
||||
|
@@ -174,7 +174,7 @@ node index.js

`repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).

`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = kv-size).

`penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).

@@ -239,7 +239,7 @@ Notice that each `probs` is an array of length `n_probs`.

- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
- `generation_settings`: The provided options above excluding `prompt` but including `kv_size`, `model`
- `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
@@ -249,7 +249,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the KV size (`kv_size`)

- **POST** `/tokenize`: Tokenize a given text.

@@ -404,7 +404,7 @@ Notice that each `probs` is an array of length `n_probs`.
    "mirostat_eta": 0.10000000149011612,
    "mirostat_tau": 5.0,
    "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
    "n_ctx": 2048,
    "kv_size": 2048,
    "n_keep": 0,
    "n_predict": 100000,
    "n_probs": 0,
@ -155,7 +155,7 @@ struct llama_client_slot
|
||||
int64_t t_last_used = -1;
|
||||
|
||||
// generation props
|
||||
int32_t n_ctx = 0; // context size per slot
|
||||
int32_t kv_size = 0; // KV size per slot
|
||||
int32_t n_past = 0;
|
||||
int32_t n_decoded = 0;
|
||||
int32_t n_remaining = -1;
|
||||
@ -325,7 +325,7 @@ struct llama_server_context
|
||||
bool all_slots_are_idle = false;
|
||||
bool add_bos_token = true;
|
||||
|
||||
int32_t n_ctx; // total context for all clients / slots
|
||||
int32_t kv_size; // total KV Cache for all clients / slots
|
||||
|
||||
// system prompt
|
||||
bool system_need_update = false;
|
||||
@ -369,8 +369,8 @@ struct llama_server_context
|
||||
return false;
|
||||
}
|
||||
|
||||
if (params.n_ctx < 2048) { // request larger context for the image embedding
|
||||
params.n_ctx = 2048;
|
||||
if (params.kv_size < 2048) { // request larger context for the image embedding
|
||||
params.kv_size = 2048;
|
||||
}
|
||||
}
|
||||
|
||||
@ -392,7 +392,7 @@ struct llama_server_context
|
||||
}
|
||||
}
|
||||
|
||||
n_ctx = llama_n_ctx(ctx);
|
||||
kv_size = llama_kv_size(ctx);
|
||||
|
||||
add_bos_token = llama_should_add_bos_token(model);
|
||||
|
||||
@ -403,7 +403,7 @@ struct llama_server_context
|
||||
// create slots
|
||||
all_slots_are_idle = true;
|
||||
|
||||
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
|
||||
const int32_t kv_size_slot = kv_size / params.n_parallel;
|
||||
|
||||
LOG_TEE("Available slots:\n");
|
||||
for (int i = 0; i < params.n_parallel; i++)
|
||||
@ -411,10 +411,10 @@ struct llama_server_context
|
||||
llama_client_slot slot;
|
||||
|
||||
slot.id = i;
|
||||
slot.n_ctx = n_ctx_slot;
|
||||
slot.kv_size = kv_size_slot;
|
||||
slot.n_predict = params.n_predict;
|
||||
|
||||
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
|
||||
LOG_TEE(" -> Slot %i - max KV Size: %i\n", slot.id, kv_size_slot);
|
||||
|
||||
const int ga_n = params.grp_attn_n;
|
||||
const int ga_w = params.grp_attn_w;
|
||||
@ -423,7 +423,7 @@ struct llama_server_context
|
||||
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
|
||||
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
|
||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
|
||||
//GGML_ASSERT(kv_size >= n_ctx_train * ga_n && "kv_size must be at least n_ctx_train * ga_n"); // NOLINT
|
||||
LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
|
||||
}
|
||||
|
||||
@ -439,7 +439,7 @@ struct llama_server_context
|
||||
default_generation_settings_for_props = get_formated_generation(slots.front());
|
||||
default_generation_settings_for_props["seed"] = -1;
|
||||
|
||||
batch = llama_batch_init(n_ctx, 0, params.n_parallel);
|
||||
batch = llama_batch_init(kv_size, 0, params.n_parallel);
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
|
||||
@ -1065,7 +1065,7 @@ struct llama_server_context
|
||||
}
|
||||
|
||||
return json {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"kv_size", slot.kv_size},
|
||||
{"n_predict", slot.n_predict},
|
||||
{"model", params.model_alias},
|
||||
{"seed", slot.params.seed},
|
||||
@ -1474,7 +1474,7 @@ struct llama_server_context
|
||||
{
|
||||
if (slot.ga_n == 1)
|
||||
{
|
||||
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
|
||||
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.kv_size)
|
||||
{
|
||||
// Shift context
|
||||
const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
|
||||
@ -1496,7 +1496,7 @@ struct llama_server_context
|
||||
slot.truncated = true;
|
||||
|
||||
LOG_VERBOSE("context shift", {
|
||||
{ "n_ctx", n_ctx },
|
||||
{ "kv_size", kv_size },
|
||||
{ "n_keep", params.n_keep },
|
||||
{ "n_left", n_left },
|
||||
});
|
||||
@ -1598,12 +1598,12 @@ struct llama_server_context
|
||||
{
|
||||
slot.params.n_keep = slot.num_prompt_tokens;
|
||||
}
|
||||
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
|
||||
slot.params.n_keep = std::min(slot.kv_size - 4, slot.params.n_keep);
|
||||
|
||||
// if input prompt is too big, truncate it
|
||||
if (slot.num_prompt_tokens >= slot.n_ctx)
|
||||
if (slot.num_prompt_tokens >= slot.kv_size)
|
||||
{
|
||||
const int n_left = slot.n_ctx - slot.params.n_keep;
|
||||
const int n_left = slot.kv_size - slot.params.n_keep;
|
||||
const int n_block_size = n_left / 2;
|
||||
const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
|
||||
|
||||
@ -1611,7 +1611,7 @@ struct llama_server_context
|
||||
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
|
||||
|
||||
LOG_VERBOSE("input truncated", {
|
||||
{"n_ctx", slot.n_ctx},
|
||||
{"kv_size", slot.kv_size},
|
||||
{"n_keep", slot.params.n_keep},
|
||||
{"n_left", n_left},
|
||||
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
||||
@ -1620,7 +1620,7 @@ struct llama_server_context
|
||||
prompt_tokens = new_tokens;
|
||||
|
||||
slot.num_prompt_tokens = prompt_tokens.size();
|
||||
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
|
||||
GGML_ASSERT(slot.num_prompt_tokens < slot.kv_size);
|
||||
}
|
||||
|
||||
if (!slot.params.cache_prompt)
|
||||
@ -1873,7 +1873,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.n_ctx);
|
||||
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size);
|
||||
printf(" --rope-scaling {none,linear,yarn}\n");
|
||||
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
@ -2043,16 +2043,16 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
server_print_usage(argv[0], default_params, default_sparams);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
|
||||
else if (arg == "-c" || arg == "--ctx-size" || arg == "--kv_size")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
LOG_WARNING("-c,--ctx-size,--ctx_size option is deprecated, use --kv-size instead",
|
||||
{{"--ctx_size", params.n_ctx}});
|
||||
params.kv_size = std::stoi(argv[i]);
|
||||
LOG_WARNING("-c,--ctx-size,--kv_size option is deprecated, use --kv-size instead",
|
||||
{{"--kv_size", params.kv_size}});
|
||||
}
|
||||
else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size")
|
||||
{
|
||||
@ -2061,7 +2061,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
params.kv_size = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--rope-scaling")
|
||||
{
|
||||
|
@@ -7,7 +7,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for g

...

main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32
main: n_len = 32, kv_size = 2048, n_parallel = 1, n_kv_req = 32

Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old

@@ -52,7 +52,7 @@ int main(int argc, char ** argv) {
    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.seed    = 1234;
    ctx_params.n_ctx   = 2048;
    ctx_params.kv_size = 2048;
    ctx_params.n_threads = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

@@ -68,15 +68,15 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

    const int n_ctx    = llama_n_ctx(ctx);
    const int kv_size  = llama_kv_size(ctx);
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());

    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
    LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_kv_req = %d\n", __func__, n_len, kv_size, n_kv_req);

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
        LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
    if (n_kv_req > kv_size) {
        LOG_TEE("%s: error: n_kv_req > kv_size, the required KV cache size is not big enough\n", __func__);
        LOG_TEE("%s: either reduce n_len or increase kv_size\n", __func__);
        return 1;
    }

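A short restatement of the check above: the KV cache has to hold every prompt token plus every token that will be generated, so `n_kv_req` is compared against `kv_size` before decoding starts. A sketch with example numbers, assuming `n_len = 32` as in the README output:

```cpp
#include <cstdio>

// Sketch only: the KV-size requirement checked by the example above.
int main() {
    const int n_prompt = 8;    // example: number of tokens in the tokenized prompt
    const int n_len    = 32;   // total sequence length (prompt + generated), as in the README
    const int kv_size  = 2048; // KV cache size requested via ctx_params.kv_size

    const int n_kv_req = n_prompt + (n_len - n_prompt); // == n_len for a single sequence

    if (n_kv_req > kv_size) {
        printf("error: either reduce n_len or increase kv_size\n");
        return 1;
    }
    printf("n_kv_req = %d fits in kv_size = %d\n", n_kv_req, kv_size);
    return 0;
}
```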
@ -116,7 +116,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
|
||||
|
||||
const int max_context_size = llama_n_ctx(ctx_tgt);
|
||||
const int max_context_size = llama_kv_size(ctx_tgt);
|
||||
const int max_tokens_list_size = max_context_size - 4;
|
||||
|
||||
if ((int) inp.size() > max_tokens_list_size) {
|
||||
@ -172,8 +172,8 @@ int main(int argc, char ** argv) {
|
||||
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
|
||||
}
|
||||
|
||||
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
|
||||
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
|
||||
llama_batch batch_dft = llama_batch_init(params.kv_size, 0, 1);
|
||||
llama_batch batch_tgt = llama_batch_init(params.kv_size, 0, n_seq_dft);
|
||||
|
||||
const auto t_dec_start = ggml_time_us();
|
||||
|
||||
|
@ -22,7 +22,7 @@
|
||||
|
||||
struct my_llama_hparams {
|
||||
uint32_t n_vocab = 32000;
|
||||
uint32_t n_ctx = 512;
|
||||
uint32_t kv_size = 512;
|
||||
uint32_t n_embd = 4096;
|
||||
uint32_t n_head = 32;
|
||||
uint32_t n_layer = 32;
|
||||
@ -112,7 +112,7 @@ static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
|
||||
|
||||
static void print_params(struct my_llama_hparams * params) {
|
||||
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
|
||||
printf("%s: kv_size: %u\n", __func__, params->kv_size);
|
||||
printf("%s: n_embd: %u\n", __func__, params->n_embd);
|
||||
printf("%s: n_head: %u\n", __func__, params->n_head);
|
||||
printf("%s: n_ff: %u\n", __func__, params->n_ff);
|
||||
@ -272,7 +272,7 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||
const int n_past = 0;
|
||||
const int N = n_tokens;
|
||||
const auto & hparams = model->hparams;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int kv_size = hparams.kv_size;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
@ -295,13 +295,13 @@ static struct ggml_tensor * llama_build_train_graphs(
|
||||
ggml_set_input(KQ_pos);
|
||||
|
||||
// rope has so many parameters that we make a custom function for it
|
||||
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
|
||||
auto rope = [ctx, KQ_pos, n_rot, kv_size, rope_freq_base, rope_freq_scale]
|
||||
(struct ggml_tensor * t) -> struct ggml_tensor * {
|
||||
// not capturing these, to silence warnings
|
||||
const int rope_mode = 0;
|
||||
|
||||
return ggml_rope_custom(
|
||||
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
||||
ctx, t, KQ_pos, n_rot, rope_mode, kv_size, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
|
||||
);
|
||||
};
|
||||
|
||||
@ -487,8 +487,8 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
|
||||
GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
|
||||
GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
|
||||
|
||||
// n_ctx was not saved in earlier checkpoint file versions, so we make it optional here
|
||||
GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
|
||||
// kv_size was not saved in earlier checkpoint file versions, so we make it optional here
|
||||
GGUF_GET_KEY(fctx, model->hparams.kv_size, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
|
||||
|
||||
GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
|
||||
GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
|
||||
@ -543,7 +543,7 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
|
||||
gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
|
||||
|
||||
// set hparams
|
||||
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx );
|
||||
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.kv_size );
|
||||
gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd );
|
||||
gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff );
|
||||
gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head );
|
||||
@ -945,7 +945,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
struct my_llama_model model;
|
||||
model.hparams.n_vocab = llama_n_vocab(lmodel);
|
||||
model.hparams.n_ctx = params.common.n_ctx;
|
||||
model.hparams.kv_size = params.common.n_ctx;
|
||||
model.hparams.n_embd = params.n_embd;
|
||||
model.hparams.n_head = params.n_head;
|
||||
model.hparams.n_layer = params.n_layer;
|
||||
@ -982,9 +982,9 @@ int main(int argc, char ** argv) {
|
||||
printf("%s: init model\n", __func__);
|
||||
bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train);
|
||||
if (existed) {
|
||||
// overwrite last n_ctx with user provided n_ctx
|
||||
// overwrite last kv_size with user provided kv_size
|
||||
if (params.common.custom_n_ctx) {
|
||||
model.hparams.n_ctx = params.common.n_ctx;
|
||||
model.hparams.kv_size = params.common.n_ctx;
|
||||
}
|
||||
|
||||
const bool opt_past_changed = opt->params.past != params.common.opt_past;
|
||||
@ -1031,7 +1031,7 @@ int main(int argc, char ** argv) {
|
||||
printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
|
||||
printf("%s: opt iter %d\n", __func__, opt->iter);
|
||||
|
||||
int n_tokens = model.hparams.n_ctx;
|
||||
int n_tokens = model.hparams.kv_size;
|
||||
int n_vocab = model.hparams.n_vocab;
|
||||
int n_batch = params.common.n_batch;
|
||||
|
||||
|
llama.cpp
@ -1607,7 +1607,7 @@ struct llama_hparams {
|
||||
};
|
||||
|
||||
struct llama_cparams {
|
||||
uint32_t n_ctx; // context size used during inference
|
||||
uint32_t kv_size; // KV Cache size used during inference
|
||||
uint32_t n_batch;
|
||||
uint32_t n_threads; // number of threads to use for generation
|
||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||
@ -1923,9 +1923,9 @@ struct llama_context {
|
||||
struct ggml_tensor * inp_tokens; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
||||
struct ggml_tensor * inp_pos; // I32 [n_batch]
|
||||
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
|
||||
struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
|
||||
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
|
||||
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
|
||||
struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
|
||||
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
|
||||
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
|
||||
struct ggml_tensor * inp_cls; // I32 [n_batch]
|
||||
|
||||
@ -1943,7 +1943,7 @@ static bool llama_kv_cache_init(
|
||||
const llama_model & model,
|
||||
ggml_type ktype,
|
||||
ggml_type vtype,
|
||||
uint32_t n_ctx,
|
||||
uint32_t kv_size,
|
||||
bool offload) {
|
||||
const struct llama_hparams & hparams = model.hparams;
|
||||
|
||||
@ -1954,11 +1954,11 @@ static bool llama_kv_cache_init(
|
||||
cache.has_shift = false;
|
||||
|
||||
cache.head = 0;
|
||||
cache.size = n_ctx;
|
||||
cache.size = kv_size;
|
||||
cache.used = 0;
|
||||
|
||||
cache.cells.clear();
|
||||
cache.cells.resize(n_ctx);
|
||||
cache.cells.resize(kv_size);
|
||||
|
||||
#ifdef GGML_USE_CLBLAST
|
||||
offload = false;
|
||||
@ -1997,8 +1997,8 @@ static bool llama_kv_cache_init(
|
||||
|
||||
for (int i = 0; i < (int) n_layer; i++) {
|
||||
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
||||
ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
|
||||
ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
|
||||
ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*kv_size);
|
||||
ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*kv_size);
|
||||
ggml_format_name(k, "cache_k_l%d", i);
|
||||
ggml_format_name(v, "cache_v_l%d", i);
|
||||
cache.k_l.push_back(k);
|
||||
@@ -2029,19 +2029,19 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
           struct llama_kv_cache & cache,
        const struct llama_batch & batch) {
    const uint32_t n_ctx    = cache.size;
    const uint32_t kv_size  = cache.size;
    const uint32_t n_tokens = batch.n_tokens;

    if (n_tokens > n_ctx) {
        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
    if (n_tokens > kv_size) {
        LLAMA_LOG_ERROR("%s: n_tokens=%d > kv_size=%d\n", __func__, n_tokens, kv_size);
        return false;
    }

    uint32_t n_tested = 0;

    while (true) {
        if (cache.head + n_tokens > n_ctx) {
            n_tested += n_ctx - cache.head;
        if (cache.head + n_tokens > kv_size) {
            n_tested += kv_size - cache.head;
            cache.head = 0;
            continue;
        }
@@ -2060,7 +2060,7 @@ static bool llama_kv_cache_find_slot(
            break;
        }

        if (n_tested >= n_ctx) {
        if (n_tested >= kv_size) {
            //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
            return false;
        }
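
The loop above treats the cache as a ring buffer of `kv_size` cells: it looks for a run of `n_tokens` free cells starting at `cache.head`, wraps to the beginning when the run would cross the end, and gives up once `kv_size` cells have been inspected. A standalone sketch of the same search over a plain occupancy vector (not the real `llama_kv_cache` type):

```cpp
#include <cstdint>
#include <vector>

// Sketch only: find n_tokens consecutive free cells in a kv_size-cell ring buffer,
// mirroring the search in llama_kv_cache_find_slot above.
static bool find_slot(const std::vector<bool> & used, uint32_t & head, uint32_t n_tokens) {
    const uint32_t kv_size = (uint32_t) used.size();
    if (n_tokens > kv_size) {
        return false; // the batch cannot fit at all
    }

    uint32_t n_tested = 0;
    while (true) {
        if (head + n_tokens > kv_size) {
            n_tested += kv_size - head; // the tail is too short - wrap around
            head = 0;
            continue;
        }

        bool found = true;
        for (uint32_t i = 0; i < n_tokens; ++i) {
            if (used[head + i]) {       // occupied cell breaks the run
                head     += i + 1;
                n_tested += i + 1;
                found = false;
                break;
            }
        }
        if (found) {
            return true; // cells [head, head + n_tokens) are free
        }
        if (n_tested >= kv_size) {
            return false; // inspected the whole cache without finding a slot
        }
    }
}
```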
@ -3692,11 +3692,11 @@ static bool llm_load_tensors(
|
||||
}
|
||||
|
||||
// create one context per buffer type
|
||||
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
|
||||
size_t kv_size = ggml_tensor_overhead() * ml.n_tensors;
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
for (auto & it : buft_layer_count) {
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_size =*/ kv_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
@ -3708,7 +3708,7 @@ static bool llm_load_tensors(
|
||||
model.ctxs.push_back(ctx);
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: ggml KV size = %7.2f MiB\n", __func__, model.ctxs.size()*kv_size/1024.0/1024.0);
|
||||
|
||||
// create tensors for the weights
|
||||
{
|
||||
@ -4584,7 +4584,7 @@ static void llm_build_k_shift(
|
||||
struct ggml_cgraph * graph,
|
||||
struct ggml_tensor * K_shift,
|
||||
llm_rope_type type,
|
||||
int64_t n_ctx,
|
||||
int64_t kv_size,
|
||||
float freq_base,
|
||||
float freq_scale,
|
||||
const llm_build_cb & cb) {
|
||||
@ -4612,7 +4612,7 @@ static void llm_build_k_shift(
|
||||
// we rotate only the first n_rot dimensions
|
||||
ggml_rope_custom_inplace(ctx,
|
||||
ggml_view_3d(ctx, kv.k_l[il],
|
||||
n_embd_head_k, n_head_kv, n_ctx,
|
||||
n_embd_head_k, n_head_kv, kv_size,
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
|
||||
ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
|
||||
0),
|
||||
@ -4630,7 +4630,7 @@ static void llm_build_kv_store(
|
||||
struct ggml_cgraph * graph,
|
||||
struct ggml_tensor * k_cur,
|
||||
struct ggml_tensor * v_cur,
|
||||
int64_t n_ctx,
|
||||
int64_t kv_size,
|
||||
int32_t n_tokens,
|
||||
int32_t kv_head,
|
||||
const llm_build_cb & cb,
|
||||
@ -4648,7 +4648,7 @@ static void llm_build_kv_store(
|
||||
cb(k_cache_view, "k_cache_view", il);
|
||||
|
||||
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
|
||||
( n_ctx)*ggml_element_size(kv.v_l[il]),
|
||||
( kv_size)*ggml_element_size(kv.v_l[il]),
|
||||
(kv_head)*ggml_element_size(kv.v_l[il]));
|
||||
cb(v_cache_view, "v_cache_view", il);
|
||||
|
||||
@ -4792,7 +4792,7 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * q_cur,
|
||||
struct ggml_tensor * kq_mask,
|
||||
struct ggml_tensor * kq_pos,
|
||||
int64_t n_ctx,
|
||||
int64_t kv_size,
|
||||
int32_t n_tokens,
|
||||
int32_t n_kv,
|
||||
float kq_scale,
|
||||
@ -4851,8 +4851,8 @@ static struct ggml_tensor * llm_build_kqv(
|
||||
struct ggml_tensor * v =
|
||||
ggml_view_3d(ctx, kv.v_l[il],
|
||||
n_kv, n_embd_head_v, n_head_kv,
|
||||
ggml_element_size(kv.v_l[il])*n_ctx,
|
||||
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
|
||||
ggml_element_size(kv.v_l[il])*kv_size,
|
||||
ggml_element_size(kv.v_l[il])*kv_size *n_embd_head_v,
|
||||
0);
|
||||
cb(v, "v", il);
|
||||
|
||||
@ -4892,7 +4892,7 @@ static struct ggml_tensor * llm_build_kv(
|
||||
struct ggml_tensor * q_cur,
|
||||
struct ggml_tensor * kq_mask,
|
||||
struct ggml_tensor * kq_pos,
|
||||
int64_t n_ctx,
|
||||
int64_t kv_size,
|
||||
int32_t n_tokens,
|
||||
int32_t kv_head,
|
||||
int32_t n_kv,
|
||||
@ -4906,11 +4906,11 @@ static struct ggml_tensor * llm_build_kv(
|
||||
ggml_build_forward_expand(graph, k_cur);
|
||||
ggml_build_forward_expand(graph, v_cur);
|
||||
|
||||
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, kv_size, n_tokens, kv_head, cb, il);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
|
||||
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
|
||||
q_cur, kq_mask, kq_pos, kv_size, n_tokens, n_kv, kq_scale, cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
return cur;
|
||||
@ -4926,7 +4926,7 @@ struct llm_build_context {
|
||||
|
||||
const int64_t n_embd;
|
||||
const int64_t n_layer;
|
||||
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
|
||||
const int64_t kv_size; // user-specified KV Cache size (can be different from n_ctx_train)
|
||||
const int64_t n_head;
|
||||
const int64_t n_head_kv;
|
||||
const int64_t n_embd_head_k;
|
||||
@ -4946,7 +4946,7 @@ struct llm_build_context {
|
||||
const float norm_rms_eps;
|
||||
|
||||
const int32_t n_tokens;
|
||||
const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
|
||||
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_size)
|
||||
const int32_t kv_head; // index of where we store new KV data in the cache
|
||||
const int32_t n_orig_ctx;
|
||||
|
||||
@ -4973,7 +4973,7 @@ struct llm_build_context {
|
||||
kv_self (lctx.kv_self),
|
||||
n_embd (hparams.n_embd),
|
||||
n_layer (hparams.n_layer),
|
||||
n_ctx (cparams.n_ctx),
|
||||
kv_size (cparams.kv_size),
|
||||
n_head (hparams.n_head),
|
||||
n_head_kv (hparams.n_head_kv),
|
||||
n_embd_head_k (hparams.n_embd_head_k),
|
||||
@ -4991,14 +4991,14 @@ struct llm_build_context {
|
||||
norm_eps (hparams.f_norm_eps),
|
||||
norm_rms_eps (hparams.f_norm_rms_eps),
|
||||
n_tokens (batch.n_tokens),
|
||||
n_kv (worst_case ? n_ctx : kv_self.n),
|
||||
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
||||
n_kv (worst_case ? kv_size : kv_self.n),
|
||||
kv_head (worst_case ? kv_size - n_tokens : kv_self.head),
|
||||
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
||||
do_rope_shift (worst_case || kv_self.has_shift),
|
||||
pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
|
||||
cb (cb),
|
||||
buf_compute_meta (lctx.buf_compute_meta) {
|
||||
// all initializations should be done in init()
|
||||
// all initializations should be done in init()
|
||||
}
|
||||
|
||||
void init() {
|
||||
@ -5041,7 +5041,7 @@ struct llm_build_context {
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@ -5093,7 +5093,7 @@ struct llm_build_context {
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
@ -5229,7 +5229,7 @@ struct llm_build_context {
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
|
||||
}
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@ -5277,7 +5277,7 @@ struct llm_build_context {
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
@ -5347,7 +5347,7 @@ struct llm_build_context {
|
||||
|
||||
// shift the entire K-cache if needed
|
||||
if (do_rope_shift) {
|
||||
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -5401,7 +5401,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -5500,7 +5500,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -5565,7 +5565,7 @@ struct llm_build_context {
cb(KQ_mask, "KQ_mask", -1);
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -5705,7 +5705,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Q, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -5798,7 +5798,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -5899,7 +5899,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
} else {
// compute Q and K and RoPE them

@ -5930,7 +5930,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -6043,7 +6043,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -6140,7 +6140,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -6209,7 +6209,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -6262,7 +6262,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -6332,7 +6332,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -6377,7 +6377,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -6446,7 +6446,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -6498,7 +6498,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -6567,7 +6567,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -6625,7 +6625,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f, cb, il);
cb(cur, "kqv_out", il);
}

@ -6689,7 +6689,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -6728,7 +6728,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * sa_out = cur;

@ -6827,7 +6827,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -6894,7 +6894,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -6936,7 +6936,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -7002,7 +7002,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -7054,7 +7054,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -7121,7 +7121,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -7172,8 +7172,8 @@ struct llm_build_context {
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -7253,7 +7253,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {

@ -7304,8 +7304,8 @@ struct llm_build_context {
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}

@ -7549,13 +7549,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
}
if (kv_self.has_shift) {
const int64_t n_ctx = cparams.n_ctx;
const int64_t kv_size = cparams.kv_size;
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
for (int i = 0; i < n_ctx; ++i) {
for (int i = 0; i < kv_size; ++i) {
data[i] = lctx.kv_self.cells[i].delta;
}
}
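
For orientation: `inp_K_shift` holds one position delta per KV-cache cell, and `kv_self.has_shift` only becomes true after the cache has actually been shifted. A minimal sketch of how a caller produces such a shift, assuming the `llama_kv_cache_seq_rm`/`llama_kv_cache_seq_shift` helpers from the public API of this period; the `n_keep`/`n_discard` values are purely illustrative:

    // Classic "context shift" once the KV cache is full: keep the first n_keep
    // cells of sequence 0, drop the next n_discard, and slide the rest back.
    const int n_keep    = 128;   // illustrative value
    const int n_discard = 256;   // illustrative value

    // Remove the discarded range from sequence 0 ...
    llama_kv_cache_seq_rm   (ctx, 0, n_keep, n_keep + n_discard);
    // ... then shift the remaining cells left; this records cells[i].delta,
    // which the loop above later copies into inp_K_shift.
    llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, -1, -n_discard);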

@ -7694,7 +7694,7 @@ static int llama_decode_internal(
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
kv_self.n = std::min((int32_t) cparams.kv_size, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

@ -11148,7 +11148,7 @@ struct llama_model_params llama_model_default_params() {
struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.seed =*/ LLAMA_DEFAULT_SEED,
/*.n_ctx =*/ 512,
/*.kv_size =*/ 512,
/*.n_batch =*/ 512,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,

@ -11328,7 +11328,7 @@ struct llama_context * llama_new_context_with_model(
cparams.offload_kqv = params.offload_kqv;
cparams.do_pooling = params.do_pooling;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.kv_size = params.kv_size == 0 ? hparams.n_ctx_train : params.kv_size;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

@ -11356,7 +11356,7 @@ struct llama_context * llama_new_context_with_model(
params.seed = time(NULL);
}
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
LLAMA_LOG_INFO("%s: kv_size = %u\n", __func__, cparams.kv_size);
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);

@ -11447,7 +11447,7 @@ struct llama_context * llama_new_context_with_model(
ctx->backends.push_back(ctx->backend_cpu);
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
cparams.n_ctx, cparams.offload_kqv)) {
cparams.kv_size, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);
return nullptr;

@ -11490,9 +11490,9 @@ struct llama_context * llama_new_context_with_model(
ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.kv_size, cparams.n_batch);
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.kv_size);
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.kv_size);
ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);

@ -11531,8 +11531,8 @@ struct llama_context * llama_new_context_with_model(
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
// build worst-case graph
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
int n_past = cparams.n_ctx - n_tokens;
int n_tokens = (int)std::min(cparams.kv_size, cparams.n_batch);
int n_past = cparams.kv_size - n_tokens;
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);

@ -11565,7 +11565,7 @@ struct llama_context * llama_new_context_with_model(
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
// TODO: needs fix after #3228
GGML_ASSERT(false && "not implemented");
//const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
//const std::vector<llama_token> tmp(ctx->model.hparams.kv_size, llama_token_bos(ctx));
//while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
llama_backend_free();
exit(1);

@ -11583,8 +11583,8 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
return &ctx->model;
}
uint32_t llama_n_ctx(const struct llama_context * ctx) {
return ctx->cparams.n_ctx;
uint32_t llama_kv_size(const struct llama_context * ctx) {
return ctx->cparams.kv_size;
}
uint32_t llama_n_batch(const struct llama_context * ctx) {

@ -11982,7 +11982,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
const auto n_layer = hparams.n_layer;
const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
const auto n_ctx = cparams.n_ctx;
const auto n_kv_req = cparams.kv_size;
const size_t kv_buf_size = kv_self.total_size();
const uint32_t kv_head = kv_self.head;

@ -12006,7 +12006,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
// v is not contiguous, copy row by row
tmp_buf.resize(elt_size*kv_head);
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_kv_req, tmp_buf.size());
data_ctx->write(tmp_buf.data(), tmp_buf.size());
}
}
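
As background for the row-by-row copy (a reading of the stride arithmetic above, not code from the patch): each layer's V cache is stored transposed, as `n_embd_v_gqa` rows of `kv_size` cells, so only the first `kv_head` cells of a row hold live data and consecutive rows sit `elt_size*kv_size` bytes apart.

    // Assumed layout of kv_self.v_l[il] (one layer's V cache):
    //   row ir, with 0 <= ir < n_embd_v_gqa:
    //     byte offset of the row = ir * elt_size * kv_size   (the n_kv_req stride above)
    //     live bytes in the row  = elt_size * kv_head        (what tmp_buf holds per iteration)
    //     the rest of the row, up to kv_size cells, is unused and not serialized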

@ -12093,7 +12093,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
const int n_layer = hparams.n_layer;
const int n_embd_k_gqa = hparams.n_embd_k_gqa();
const int n_embd_v_gqa = hparams.n_embd_v_gqa();
const int n_ctx = cparams.n_ctx;
const int n_kv_req = cparams.kv_size;
size_t kv_buf_size;
uint32_t kv_head;

@ -12118,7 +12118,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
// v is not contiguous, copy row by row
size_t v_row_size = elt_size*kv_head;
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_kv_req, v_row_size);
inp += v_row_size;
}
}

llama.h

@ -217,7 +217,7 @@ extern "C" {
struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context, 0 = from model
uint32_t kv_size; // KV Cache size
uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing

@ -347,7 +347,7 @@ extern "C" {
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
LLAMA_API uint32_t llama_kv_size (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
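
Taken together, downstream code would set the renamed field and query it roughly as follows. This is a minimal sketch against the API as it looks after this patch; the model path and the 4096 value are placeholders, and error handling plus backend init/teardown are omitted:

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path

        llama_context_params cparams = llama_context_default_params();
        cparams.kv_size = 4096;                        // previously cparams.n_ctx

        llama_context * ctx = llama_new_context_with_model(model, cparams);
        printf("kv_size = %u\n", llama_kv_size(ctx));  // previously llama_n_ctx(ctx)

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }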

@ -8,7 +8,7 @@ import sys
import yaml
CLI_ARGS_MAIN_PERPLEXITY = [
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "kv-size", "escape",
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",

@ -27,7 +27,7 @@ CLI_ARGS_LLAMA_BENCH = [
]
CLI_ARGS_SERVER = [
"alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
"alias", "batch-size", "kv-size", "embedding", "host", "memory-f32", "lora", "lora-base",
"low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
"numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
"threads", "verbose"

@ -1121,21 +1121,21 @@ struct test_rope : public test_case {
const std::array<int64_t, 4> ne;
int n_dims;
int mode;
int n_ctx;
int kv_size;
std::string vars() override {
return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx);
return VARS_TO_STR5(type, ne, n_dims, mode, kv_size);
}
test_rope(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 1},
int n_dims = 10, int mode = 0, int n_ctx = 512)
: type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {}
: type(type), ne(ne), n_dims(n_dims), mode(mode), kv_size(n_ctx) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, kv_size);
return out;
}

@ -1145,7 +1145,7 @@ struct test_rope : public test_case {
// pos
std::vector<int> data(ne[2]);
for (int i = 0; i < ne[2]; i++) {
data[i] = rand() % n_ctx;
data[i] = rand() % kv_size;
}
ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
} else {

@ -1545,7 +1545,7 @@ struct llama_hparams {
int32_t n_tokens;
// llm_build_context
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= kv_size
static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads