rename n_ctx to kv_size

Pierrick HYMBERT 2024-02-18 20:59:26 +01:00 committed by Georgi Gerganov
parent ef96e8b1f7
commit 606873401c
GPG Key ID: 449E073F9DC10735
48 changed files with 403 additions and 393 deletions

View File

@ -186,7 +186,7 @@ llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_ctx = 512
llm_load_print_meta: kv_size = 512
llm_load_print_meta: n_embd = 5120
llm_load_print_meta: n_head = 40
llm_load_print_meta: n_head_kv = 40
@ -214,7 +214,7 @@ llama_new_context_with_model: compute buffer total size = 75.41 MB
system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0
generate: kv_size = 512, n_batch = 512, n_predict = 400, n_keep = 0
Building a website can be done in 10 simple steps:

View File

@ -258,11 +258,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
sparams.top_k = std::stoi(argv[i]);
} else if (arg == "-c" || arg == "--ctx-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.kv_size = std::stoi(argv[i]);
fprintf(stderr, "warning: -c,--ctx-size option is deprecated, use --kv-size instead");
} else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
params.kv_size = std::stoi(argv[i]);
} else if (arg == "--grp-attn-n" || arg == "-gan") {
if (++i >= argc) {
invalid_param = true;
@ -962,7 +970,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -bf FNAME, --binary-file FNAME\n");
printf(" binary file containing multiple choice tasks.\n");
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size);
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
printf(" (default: %s)\n", sampler_type_names.c_str());
@ -972,7 +980,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = kv_size)\n", sparams.penalty_last_n);
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
@ -1269,7 +1277,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto cparams = llama_context_default_params();
cparams.n_ctx = params.n_ctx;
cparams.kv_size = params.kv_size;
cparams.n_batch = params.n_batch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -1658,7 +1666,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "kv_size: %d # default: 512\n", params.kv_size);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);

View File

@ -50,7 +50,7 @@ struct gpt_params {
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t kv_size = 512; // KV Cache size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_draft = 8; // number of tokens to draft during speculative decoding

View File

@ -7,11 +7,11 @@ USER_NAME="${USER_NAME:-Anon}"
# Uncomment and adjust to the number of CPU cores you want to use.
#N_THREAD="${N_THREAD:-4}"
CTX_SIZE="${CTX_SIZE:-4096}"
KV_SIZE="${KV_SIZE:-4096}"
N_PREDICTS="${N_PREDICTS:-4096}"
GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--kv_size "$KV_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647

View File

@ -10,7 +10,7 @@ cd ..
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \

View File

@ -532,16 +532,16 @@ static struct ggml_tensor * forward(
// Vcur shape [n_embd, N, 1, 1]
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N)));
// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.v shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// kv_self.v shape [n_embd * kv_size * n_layer, 1]
// k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
// v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@ -560,7 +560,7 @@ static struct ggml_tensor * forward(
Qcur,
0, 2, 1, 3);
// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// K shape [n_embd/n_head, n_past + N, n_head, 1]
struct ggml_tensor * K =
ggml_permute(ctx0,
@ -780,16 +780,16 @@ static struct ggml_tensor * forward_batch(
assert_shape_3d(Vcur, N, n_embd, n_batch);
// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
// kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
// kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
// kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
// k shape [n_embd * N, n_batch] == kv_self.k[:,n_past:n_past+N,:,il]
// v shape [N, n_embd, n_batch, 1] == kv_self.v[:,n_past:n_past+N,:,il]
/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@ -817,7 +817,7 @@ static struct ggml_tensor * forward_batch(
0, 2, 1, 3);
assert_shape_4d(Q, n_embd/n_head, N, n_head, n_batch);
// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
// kv_self.k shape [n_embd * kv_size * n_batch * n_layer]
// K shape [n_embd/n_head, n_past + N, n_head, n_batch]
struct ggml_tensor * K =
ggml_permute(ctx0,
@ -855,7 +855,7 @@ static struct ggml_tensor * forward_batch(
assert_shape_4d(KQ_soft_max, n_past + N, N, n_head, n_batch);
// split cached V into n_head heads
// kv_self.v shape [n_ctx * n_embd * n_batch * n_layer]
// kv_self.v shape [kv_size * n_embd * n_batch * n_layer]
// V shape [n_past + N, n_embd/n_head, n_head, n_batch] == kv_self.v[:(n_past+N),:,:,il]
struct ggml_tensor * V =
ggml_view_4d(ctx0, vc,
@ -1082,16 +1082,16 @@ static struct ggml_tensor * forward_lora(
cur)),
n_embd, N)));
// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.v shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// kv_self.v shape [n_embd * kv_size * n_layer, 1]
// k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0]
// v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0]
/* {
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*kv_size + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
( kv_size)*ggml_element_size(kv_self.v),
(il*kv_size)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
@ -1110,7 +1110,7 @@ static struct ggml_tensor * forward_lora(
Qcur,
0, 2, 1, 3);
// kv_self.k shape [n_embd * n_ctx * n_layer, 1]
// kv_self.k shape [n_embd * kv_size * n_layer, 1]
// K shape [n_embd/n_head, n_past + N, n_head, 1]
struct ggml_tensor * K =
ggml_permute(ctx0,
@ -1470,7 +1470,7 @@ int main(int argc, char ** argv) {
/*
struct llama_model_lora model_lora;
// model.hparams.n_vocab = 6;
// model.hparams.n_ctx = 64;
// model.hparams.kv_size = 64;
// model.hparams.n_embd = 128;
// model.hparams.n_mult = 2;
// model.hparams.n_head = 8;
@ -1478,7 +1478,7 @@ int main(int argc, char ** argv) {
// model.hparams.n_rot = model.hparams.n_embd / model.hparams.n_head;
model_lora.hparams.n_vocab = 16;
model_lora.hparams.n_ctx = 32;
model_lora.hparams.kv_size = 32;
model_lora.hparams.n_embd = 256;
model_lora.hparams.n_mult = 2;
model_lora.hparams.n_head = 16;

View File

@ -104,7 +104,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.kv_size = n_kv_max;
ctx_params.n_batch = 512;
ctx_params.mul_mat_q = mmq;

View File

@ -38,7 +38,7 @@ let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_par
var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req
context_params.kv_size = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8
@ -53,12 +53,12 @@ defer {
llama_free(context)
}
let n_ctx = llama_n_ctx(context)
let kv_size = llama_kv_size(context)
print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
print("\nn_len = \(n_len), kv_size = \(kv_size), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
if n_kv_req > n_ctx {
print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
if n_kv_req > kv_size {
print("error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", n_kv_req)
exit(1)
}

View File

@ -7,7 +7,7 @@ The example demonstrates batched generation from a given prompt
...
main: n_len = 32, n_ctx = 2048, n_parallel = 4, n_kv_req = 113
main: n_len = 32, kv_size = 2048, n_parallel = 4, n_kv_req = 113
Hello my name is

View File

@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.kv_size = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -90,14 +90,14 @@ int main(int argc, char ** argv) {
return 1;
}
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, kv_size, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
if (n_kv_req > kv_size) {
LOG_TEE("%s: error: n_kv_req (%d) > kv_size, the required KV cache size is not big enough\n", __func__, n_kv_req);
LOG_TEE("%s: either reduce n_parallel or increase kv_size\n", __func__);
return 1;
}

View File

@ -139,8 +139,8 @@ int main(int argc, char ** argv)
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
const size_t max_kv_size = llama_kv_size(ctx);
const size_t max_tokens_list_size = max_kv_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{

View File

@ -128,20 +128,20 @@ int main(int argc, char ** argv) {
// TODO: perform the bench for all types or for a user specified type
const ggml_type qtype = GGML_TYPE_Q4_1;
size_t ctx_size = 0;
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(qtype, sizex*sizey);
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
ctx_size += 1024*1024*16;
size_t kv_size = 0;
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizez);
kv_size += ggml_row_size(qtype, sizex * sizey);
kv_size += ggml_row_size(qtype, sizex * sizey);
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); // BLAS
kv_size += ggml_row_size(GGML_TYPE_F32, sizex * sizey); // BLAS
kv_size += 1024 * 1024 * 16;
printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
printf("Allocating Memory of size %zi bytes, %zi MB\n", kv_size, (kv_size / 1024 / 1024));
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_size =*/ kv_size,
/*.mem_buffer =*/ NULL,
/* no_alloc =*/ 0
};

View File

@ -15,7 +15,7 @@ rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"

View File

@ -15,8 +15,8 @@ N_THREAD="${N_THREAD:-8}"
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
# For example, override the context size by doing: ./chatLLaMa --kv_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)

View File

@ -27,9 +27,9 @@ SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
CTX_SIZE=2048
CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
KV_SIZE=2048
KV_ROTATE_POINT=$((KV_SIZE * 3 / 5)) # REVIEW
OPTS=(--model "$MODEL" --kv_size "$KV_SIZE" --repeat_last_n 256 "$@")
# An unbuffered `tail -c+N`
skip_bytes() {
@ -84,7 +84,7 @@ n_tokens=0
while read -e line; do
# Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
n_predict=$((KV_SIZE - n_tokens - ${#line} / 2 - 32))
# Swap prompts when we're about to run out of context
if ((n_predict <= 0)); then
@ -97,11 +97,11 @@ while read -e line; do
cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
n_tokens=0
n_predict=$((CTX_SIZE / 2))
n_predict=$((KV_SIZE / 2))
fi
echo " ${line}" >>"$CUR_PROMPT_FILE"
if ((n_tokens > CTX_ROTATE_POINT)); then
if ((n_tokens > KV_ROTATE_POINT)); then
echo " ${line}" >>"$NEXT_PROMPT_FILE"
fi
@ -139,7 +139,7 @@ while read -e line; do
n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
if ((n_tokens > CTX_ROTATE_POINT)); then
if ((n_tokens > KV_ROTATE_POINT)); then
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
fi

View File

@ -15,8 +15,8 @@ N_THREAD="${N_THREAD:-8}"
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
# For example, override the context size by doing: ./chatLLaMa --kv_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---kv_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)

View File

@ -226,7 +226,7 @@ struct llama_vocab {
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512; // this is provided as user input?
uint32_t kv_size = 512; // this is provided as user input?
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_mult = 4;
@ -326,7 +326,7 @@ struct train_params {
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: kv_size: %u\n", __func__, params->kv_size);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_mult: %u\n", __func__, params->n_mult);
printf("%s: n_head: %u\n", __func__, params->n_head);
@ -732,7 +732,7 @@ static void save_as_llama_model(
gguf_set_val_u32(ctx, KV_TOKENIZER_SEP_ID, -1);
gguf_set_val_u32(ctx, KV_TOKENIZER_PAD_ID, -1);
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.n_ctx);
gguf_set_val_u32(ctx, KV_CONTEXT_LENGTH, model->hparams.kv_size);
gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd);
gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff);
gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head);
@ -937,7 +937,7 @@ int main(int argc, char ** argv) {
struct my_llama_model model;
model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
model.hparams.n_ctx = params.n_ctx;
model.hparams.kv_size = params.n_ctx;
model.hparams.n_embd = config.dim; //params.n_embd;
model.hparams.n_ff = config.hidden_dim;
model.hparams.n_mult = 32;//params.n_mult;

View File

@ -88,11 +88,11 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
if (n_ctx > n_ctx_train) {
if (kv_size > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, kv_size);
}
// print system information
@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
// max batch size
const uint64_t n_batch = params.n_batch;
GGML_ASSERT(params.n_batch == params.n_ctx);
GGML_ASSERT(params.n_batch == params.kv_size);
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;

View File

@ -16,7 +16,7 @@
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512;
uint32_t kv_size = 512;
uint32_t n_embd = 4096;
uint32_t n_ff = 11008;
uint32_t n_head = 32;
@ -190,7 +190,7 @@ static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab : %u\n", __func__, params->n_vocab);
printf("%s: n_ctx : %u\n", __func__, params->n_ctx);
printf("%s: kv_size : %u\n", __func__, params->kv_size);
printf("%s: n_embd : %u\n", __func__, params->n_embd);
printf("%s: n_ff : %u\n", __func__, params->n_ff);
printf("%s: n_head : %u\n", __func__, params->n_head);
@ -250,7 +250,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
};
GGUF_GET_KEY(ctx, hparams->n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
GGUF_GET_KEY(ctx, hparams->n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
GGUF_GET_KEY(ctx, hparams->kv_size, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
GGUF_GET_KEY(ctx, hparams->n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
GGUF_GET_KEY(ctx, hparams->n_head, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
GGUF_GET_KEY(ctx, hparams->n_layer, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
@ -268,7 +268,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
}
}
static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t n_ctx) {
static void init_model(struct llama_model * input, struct my_llama_model * model, const char * fn_model, uint32_t kv_size) {
auto & hparams = model->hparams;
std::vector<char> tn_buf;
@ -298,7 +298,7 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
gguf_free(mctx);
}
hparams.n_vocab = llama_n_vocab(input);
hparams.n_ctx = n_ctx;
hparams.kv_size = kv_size;
// get tensors from llama_model (possibly mmapped)
model->tok_embeddings = llama_get_model_tensor(input, tn(LLM_TENSOR_TOKEN_EMBD));
@ -529,7 +529,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int n_past = 0;
const int N = n_tokens;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int kv_size = hparams.kv_size;
const int n_vocab = hparams.n_vocab;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
@ -558,13 +558,13 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
ggml_set_input(KQ_pos);
// rope has so much parameters that we make a custom function for it
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
auto rope = [ctx, KQ_pos, n_rot, kv_size, rope_freq_base, rope_freq_scale]
(struct ggml_tensor * t) -> struct ggml_tensor * {
// not capturing these, to silence warnings
const int rope_mode = 0;
return ggml_rope_custom(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
t, KQ_pos, n_rot, rope_mode, kv_size, 0,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};
@ -848,7 +848,7 @@ static void save_llama_lora_gguf(struct gguf_context * fctx, struct my_llama_mod
gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx);
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.kv_size);
gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd);
gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff);
gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head);
@ -1554,9 +1554,9 @@ int main(int argc, char ** argv) {
bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);
if (existed) {
// overwrite last n_ctx with user provided n_ctx
// overwrite last kv_size with user provided kv_size
if (params.common.custom_n_ctx) {
model.hparams.n_ctx = params.common.n_ctx;
model.hparams.kv_size = params.common.n_ctx;
}
const bool opt_param_count_changed = (
@ -1625,7 +1625,7 @@ int main(int argc, char ** argv) {
printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
printf("%s: opt iter %d\n", __func__, opt->iter);
int n_tokens = model.hparams.n_ctx;
int n_tokens = model.hparams.kv_size;
int n_vocab = model.hparams.n_vocab;
int n_batch = params.common.n_batch;

View File

@ -10,6 +10,6 @@ cd ..
./main --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 -n -1 \
--batch_size 8 --kv_size 2048 -n -1 \
--repeat_last_n 64 --repeat_penalty 1.3 \
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

View File

@ -325,7 +325,7 @@ static void process_logits(
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
auto tim1 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
@ -336,17 +336,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (from_chunk > 0) {
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
if (size_t((from_chunk + 2)*kv_size) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
return false;
}
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk * kv_size);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk * kv_size);
}
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
n_ctx);
if (int(tokens.size()) < 2*kv_size) {
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2 * kv_size,
kv_size);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return false;
}
@ -359,7 +359,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
prob_history.resize(tokens.size());
}
const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk_max = tokens.size() / kv_size;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
@ -373,16 +373,16 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
const int num_batches = (kv_size + n_batch - 1) / n_batch;
std::vector<float> logits;
if (compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
logits.reserve((size_t)kv_size * n_vocab);
}
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int start = i * kv_size;
const int end = start + kv_size;
std::vector<float> logits;
@ -431,11 +431,11 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
if (compute_ppl) {
const int first = n_ctx/2;
const int first = kv_size / 2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
count += n_ctx - first - 1;
count += kv_size - first - 1;
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
fflush(stdout);
@ -553,7 +553,7 @@ int main(int argc, char ** argv) {
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
params.n_batch = std::min(params.n_batch, params.kv_size);
print_build_info();
@ -593,9 +593,9 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
if (params.kv_size > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
__func__, n_ctx_train, params.kv_size);
}
// print system information

View File

@ -14,7 +14,8 @@ In this section, we cover the most commonly used options for running the `infill
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead.
- `-kv N`, `--kv-size N`: Specify the total size of the KV cache for the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
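
For reference, a quick usage sketch (not part of the commit) of the renamed flag; the model path and the `--in-prefix`/`--in-suffix` values below are placeholders for illustration, assuming a post-rename build:

# illustrative only: run the infill example with the new KV cache flag
# (passing -c 2048 still works but now prints a deprecation warning)
./infill -m models/7B/ggml-model.gguf -kv 2048 -n 64 \
    --in-prefix "def fib(n):" --in-suffix "return a"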
## Input Prompts

View File

@ -135,9 +135,9 @@ int main(int argc, char ** argv) {
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
if (params.kv_size != 0 && params.kv_size < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
params.kv_size = 8;
}
if (params.instruct) {
printf("\n************\n");
@ -225,12 +225,12 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
const int kv_size = llama_kv_size(ctx);
LOG("kv_size: %d\n", kv_size);
if (n_ctx > n_ctx_train) {
if (kv_size > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, kv_size);
}
// print system information
@ -291,8 +291,8 @@ int main(int argc, char ** argv) {
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
if ((int) embd_inp.size() > kv_size - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), kv_size - 4);
return 1;
}
@ -366,7 +366,7 @@ int main(int argc, char ** argv) {
}
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("generate: kv_size = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", kv_size, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
LOG_TEE("\n##### Infill mode #####\n\n");
@ -416,9 +416,9 @@ int main(int argc, char ** argv) {
while (n_remain != 0 || params.interactive) {
// predict
if (!embd.empty()) {
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
// Note: kv_size - 4 here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
int max_embd_size = kv_size - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int) embd.size() > max_embd_size) {
@ -434,8 +434,8 @@ int main(int argc, char ** argv) {
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
// - take half of the last (kv_size - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > kv_size) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
@ -444,8 +444,8 @@ int main(int argc, char ** argv) {
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
LOG("context full, swapping: n_past = %d, n_left = %d, kv_size = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, kv_size, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

View File

@ -514,7 +514,7 @@ struct cmd_params_instance {
llama_context_params to_llama_cparams() const {
llama_context_params cparams = llama_context_default_params();
cparams.n_ctx = n_prompt + n_gen;
cparams.kv_size = n_prompt + n_gen;
cparams.n_batch = n_batch;
cparams.type_k = type_k;
cparams.type_v = type_v;

View File

@ -68,8 +68,8 @@ actor LlamaContext {
print("Using \(n_threads) threads")
var ctx_params = llama_context_default_params()
ctx_params.seed = 1234
ctx_params.n_ctx = 2048
ctx_params.seed = 1234
ctx_params.kv_size = 2048
ctx_params.n_threads = UInt32(n_threads)
ctx_params.n_threads_batch = UInt32(n_threads)
@ -112,13 +112,13 @@ actor LlamaContext {
tokens_list = tokenize(text: text, add_bos: true)
temporary_invalid_cchars = []
let n_ctx = llama_n_ctx(context)
let kv_size = llama_kv_size(context)
let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
print("\n n_len = \(n_len), kv_size = \(kv_size), n_kv_req = \(n_kv_req)")
if n_kv_req > n_ctx {
print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
if n_kv_req > kv_size {
print("error: n_kv_req > kv_size, the required KV cache size is not big enough")
}
for id in tokens_list {

View File

@ -9,7 +9,7 @@ cd ..
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \

View File

@ -9,7 +9,7 @@ cd ..
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
--kv_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \

View File

@ -230,7 +230,7 @@ static struct llava_context * llava_init(gpt_params * params) {
}
llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
ctx_params.kv_size = params->kv_size < 2048 ? 2048 : params->kv_size; // we need a longer context size to process image embeddings
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);

View File

@ -103,15 +103,15 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
const size_t num_images = num_patches_width * num_patches_height + 1;
// TODO: size calculation is not calculated - it's only tens of MB
size_t ctx_size = 0;
size_t kv_size = 0;
{
ctx_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
ctx_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
kv_size += clip_embd_nbytes(ctx_clip) * num_images * 8; // image_features
kv_size += 1024*1024 * ggml_type_size(GGML_TYPE_F32);
}
struct ggml_init_params params {
/*.mem_size =*/ ctx_size,
/*.mem_size =*/ kv_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API
};

View File

@ -73,8 +73,8 @@ int main(int argc, char ** argv) {
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
all = inp;
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
const int max_kv_size = llama_kv_size(ctx);
const int max_tokens_list_size = max_kv_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
@ -117,7 +117,7 @@ int main(int argc, char ** argv) {
// seq_id == 0 : the current input token
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
// seq_id [W + 1, W + G] : verification n-grams
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
llama_batch batch = llama_batch_init(params.kv_size, 0, W + G + 1);
// target model sampling context
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

View File

@ -47,8 +47,8 @@ int main(int argc, char ** argv){
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
const int max_kv_size = llama_kv_size(ctx);
const int max_tokens_list_size = max_kv_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
@ -86,7 +86,7 @@ int main(int argc, char ** argv){
std::vector<llama_token> draft;
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
llama_batch batch_tgt = llama_batch_init(params.kv_size, 0, 1);
// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);

View File

@ -70,7 +70,8 @@ In this section, we cover the most commonly used options for running the `main`
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
- `-c N`, `--ctx-size N`: Deprecated, use `--kv-size` instead.
- `-kv N`, `--kv-size N`: Set the size of the KV cache for the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
## Input Prompts
@ -134,15 +135,15 @@ By understanding and utilizing these interaction options, you can create engagin
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
### Context Size
### KV Context Size
The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
The `--kv-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
- `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
- `-kv N, --kv-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
### Extended Context Size
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k, that is a scaling factor of 8, and it should work by setting the above `--kv-size` to 32768 (32k) and `--rope-scale` to 8.
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
@ -152,7 +153,7 @@ The `--keep` option allows users to retain the original prompt when the model ru
- `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
By utilizing context management options like `--kv-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
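
As a hedged illustration (not part of the commit), the options above might be combined as follows after the rename; the model path, prompt file, and token counts are placeholders:

# illustrative only: 32k extended context on a RoPE-scaled fine-tune,
# keeping the entire initial prompt when the context is shifted
./main -m models/7B/ggml-model.gguf -f prompts/chat-with-bob.txt \
    --kv-size 32768 --rope-scale 8 --keep -1 -n 512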
## Generation Flags
@ -181,12 +182,12 @@ Example usage: `--temp 0.5`
### Repeat Penalty
- `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
- `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = kv-size).
- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.
The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).
The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`kv-size`).
Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
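
A hedged example (not part of the commit) of combining the penalty flags described above; the model path and prompt are placeholders:

# illustrative only: stronger repetition penalty over the last 256 tokens,
# without penalizing newline tokens (useful for chat or code output)
./main -m models/7B/ggml-model.gguf -p "Write a haiku about llamas." \
    --repeat-penalty 1.15 --repeat-last-n 256 --no-penalize-nl -n 128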

View File

@ -157,9 +157,9 @@ int main(int argc, char ** argv) {
return 0;
}
if (params.n_ctx != 0 && params.n_ctx < 8) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
if (params.kv_size != 0 && params.kv_size < 8) {
LOG_TEE("%s: warning: minimum KV size is 8, using minimum size.\n", __func__);
params.kv_size = 8;
}
if (params.rope_freq_base != 0.0) {
@ -208,12 +208,12 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
const int n_ctx = llama_n_ctx(ctx);
LOG("n_ctx: %d\n", n_ctx);
const int kv_size = llama_kv_size(ctx);
LOG("kv_size: %d\n", kv_size);
if (n_ctx > n_ctx_train) {
if (kv_size > n_ctx_train) {
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, n_ctx);
__func__, n_ctx_train, kv_size);
}
// print system information
@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: The session file is empty. A new session will be initialized.\n", __func__);
} else {
// The file exists and is not empty
session_tokens.resize(n_ctx);
session_tokens.resize(kv_size);
size_t n_token_count_out = 0;
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
@ -289,8 +289,8 @@ int main(int argc, char ** argv) {
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
if ((int) embd_inp.size() > kv_size - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), kv_size - 4);
return 1;
}
@ -450,7 +450,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("generate: kv_size = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", kv_size, params.n_batch, params.n_predict, params.n_keep);
// group-attention state
// number of grouped KV tokens so far (used only if params.grp_attn_n > 1)
@ -463,7 +463,7 @@ int main(int argc, char ** argv) {
GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
//GGML_ASSERT(kv_size >= n_ctx_train * ga_n && "kv_size must be at least n_ctx_train * grp_attn_n"); // NOLINT
LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
}
LOG_TEE("\n\n");
@ -514,9 +514,9 @@ int main(int argc, char ** argv) {
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (!embd.empty()) {
// Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
// Note: (kv_size - 4) here is to match the logic for commandline prompt handling via
// --prompt or --file which uses the same value.
int max_embd_size = n_ctx - 4;
int max_embd_size = kv_size - 4;
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
if ((int) embd.size() > max_embd_size) {
@ -533,8 +533,8 @@ int main(int argc, char ** argv) {
// infinite text generation via context shifting
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
// - take half of the last (kv_size - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > kv_size) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
@ -543,8 +543,8 @@ int main(int argc, char ** argv) {
const int n_left = n_past - params.n_keep - 1;
const int n_discard = n_left/2;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, n_ctx, params.n_keep, n_discard);
LOG("context full, swapping: n_past = %d, n_left = %d, kv_size = %d, n_keep = %d, n_discard = %d\n",
n_past, n_left, kv_size, params.n_keep, n_discard);
llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1);
llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
@ -666,7 +666,7 @@ int main(int argc, char ** argv) {
LOG("n_past = %d\n", n_past);
// Display total tokens alongside total time
if (params.n_print > 0 && n_past % params.n_print == 0) {
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, kv_size);
}
}

View File

@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "\n\n");
fflush(stderr);
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
std::vector<client> clients(n_clients);
for (size_t i = 0; i < clients.size(); ++i) {
@ -169,7 +169,7 @@ int main(int argc, char ** argv) {
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
llama_batch batch = llama_batch_init(n_ctx, 0, 1);
llama_batch batch = llama_batch_init(kv_size, 0, 1);
int32_t n_total_prompt = 0;
int32_t n_total_gen = 0;

View File

@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = seed;
ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
ctx_params.kv_size = llama_n_ctx_train(model)*n_grp + n_keep;
ctx_params.n_batch = 512;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -121,12 +121,12 @@ int main(int argc, char ** argv) {
// total length of the sequences including the prompt
const int n_len = n_tokens_all + n_predict;
const int n_ctx = llama_n_ctx(ctx) - n_keep;
const int n_kv_req = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx) - n_keep;
const int n_kv_req = llama_kv_size(ctx);
const int n_batch = ctx_params.n_batch;
const int n_batch_grp = ctx_params.n_batch/n_grp;
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, n_ctx, n_kv_req, n_grp, n_batch);
LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_kv_req = %d, n_grp = %d, n_batch = %d\n", __func__, n_len, kv_size, n_kv_req, n_grp, n_batch);
// print the prompt token-by-token
@ -140,7 +140,7 @@ int main(int argc, char ** argv) {
int n_past = 0;
// fill the KV cache
for (int i = 0; i < n_ctx; i += n_batch) {
for (int i = 0; i < kv_size; i += n_batch) {
if (i > 0 && n_grp > 1) {
// if SelfExtend is enabled, we compress the position from the last batch by a factor of n_grp
const int ib = i/n_batch - 1;
@ -174,13 +174,13 @@ int main(int argc, char ** argv) {
}
}
for (int i = n_ctx; i < n_tokens_all; i += n_batch) {
for (int i = kv_size; i < n_tokens_all; i += n_batch) {
const int n_discard = n_batch;
LOG_TEE("%s: shifting KV cache with %d\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, kv_size, -n_discard);
n_past -= n_discard;
@ -203,13 +203,13 @@ int main(int argc, char ** argv) {
}
{
const int n_discard = n_past - n_ctx + n_predict;
const int n_discard = n_past - kv_size + n_predict;
if (n_discard > 0) {
LOG_TEE("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
llama_kv_cache_seq_rm (ctx, 0, n_keep , n_keep + n_discard);
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, n_ctx, -n_discard);
llama_kv_cache_seq_shift(ctx, 0, n_keep + n_discard, kv_size, -n_discard);
n_past -= n_discard;
}

View File

@ -320,11 +320,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
if (int(tokens.size()) < 2*kv_size) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n", __func__, 2 * kv_size,
kv_size);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
@ -340,13 +340,13 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
return {tokens, -1, logit_history, prob_history};
}
const int calc_chunk = n_ctx;
const int calc_chunk = kv_size;
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
if (int(tokens.size()) <= calc_chunk) {
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), n_ctx, params.ppl_stride);
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n", __func__,
tokens.size(), kv_size, params.ppl_stride);
return {tokens, -1, logit_history, prob_history};
}
@ -414,8 +414,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.kv_size - params.ppl_stride + start, params.kv_size + start);
for (int j = kv_size - params.ppl_stride - 1; j < kv_size - 1; ++j) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
@ -453,7 +453,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// BOS tokens will be added for each chunk before eval
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
std::ofstream logits_stream;
if (!params.logits_file.empty()) {
@ -464,7 +464,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
}
fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
logits_stream.write("_logits_", 8);
logits_stream.write(reinterpret_cast<const char *>(&n_ctx), sizeof(n_ctx));
logits_stream.write(reinterpret_cast<const char *>(&kv_size), sizeof(kv_size));
}
auto tim1 = std::chrono::high_resolution_clock::now();
@ -475,9 +475,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (int(tokens.size()) < 2*n_ctx) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*n_ctx,
n_ctx);
if (int(tokens.size()) < 2*kv_size) {
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n", __func__, 2 * kv_size,
kv_size);
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
return {std::move(tokens), 0., {}, {}};
}
@ -488,7 +488,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
std::vector<float> prob_history;
prob_history.resize(tokens.size());
const int n_chunk_max = tokens.size() / n_ctx;
const int n_chunk_max = tokens.size() / kv_size;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
@ -498,11 +498,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
double nll = 0.0;
double nll2 = 0.0;
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
const int num_batches = (kv_size + n_batch - 1) / n_batch;
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
logits.reserve((size_t)kv_size * n_vocab);
}
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
@ -513,14 +513,14 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
if (!params.logits_file.empty()) {
logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
logits_stream.write((const char *)tokens.data(), n_chunk * kv_size * sizeof(tokens[0]));
const int nv = 2*((n_vocab + 1)/2) + 4;
log_probs.resize(n_ctx * nv);
log_probs.resize(kv_size * nv);
}
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int start = i * kv_size;
const int end = start + kv_size;
const auto t_start = std::chrono::high_resolution_clock::now();
@ -566,7 +566,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
// We get the logits for all the tokens in the context window (params.n_ctx)
// We get the logits for all the tokens in the context window (params.kv_size)
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
// calculate the perplexity over the last half of the window (so the model always has
// some context to predict the token).
@ -578,16 +578,16 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt.
const int first = n_ctx/2;
const int first = kv_size/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
if (!params.logits_file.empty()) {
process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
workers, log_probs, nll, nll2);
} else {
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
}
count += n_ctx - first - 1;
count += kv_size - first - 1;
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
@ -596,7 +596,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
printf("%8d %.4lf %4lf %4lf\n", i*kv_size, std::exp(nll / count), av, av2);
}
fflush(stdout);
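As a concrete example of the windowing above: with kv_size = 512, first = kv_size/2 = 256, so each chunk scores kv_size - first - 1 = 255 next-token predictions, every scored token is conditioned on at least half a window of preceding context, and count grows by 255 per chunk.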
@ -805,16 +805,16 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
double acc = 0.0f;
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
const int max_seq = 4*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
llama_batch batch = llama_batch_init(kv_size, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<float> batch_logits(n_vocab*kv_size);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
@ -832,7 +832,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
// each task has 4 unique sequence ids - one for each ending
// the common prefix is shared among the 4 sequences to save tokens
// we extract logits only from the last common token and from all ending tokens of each sequence
while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) {
while (n_cur + (int) hs_data[i1].required_tokens <= kv_size) {
auto & hs_cur = hs_data[i1];
const int s0 = 4*(i1 - i0);
@ -1082,16 +1082,16 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__);
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 128;
const int max_seq = 2*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
llama_batch batch = llama_batch_init(kv_size, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<float> batch_logits(n_vocab*kv_size);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
@ -1108,7 +1108,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
llama_batch_clear(batch);
while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
while (n_cur + (int) data[i1].required_tokens <= kv_size) {
const int s0 = 2*(i1 - i0);
if (s0 + 2 > max_seq) {
break;
@ -1434,16 +1434,16 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
printf("\ntask\tacc_norm\n");
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
const int n_batch = params.n_batch;
const int max_tasks_per_batch = 32;
const int max_seq = 4*max_tasks_per_batch;
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
llama_batch batch = llama_batch_init(kv_size, 0, max_seq);
std::vector<float> tok_logits(n_vocab);
std::vector<float> batch_logits(n_vocab*n_ctx);
std::vector<float> batch_logits(n_vocab*kv_size);
std::vector<std::pair<size_t, llama_token>> eval_pairs;
std::vector<float> eval_results;
@ -1467,7 +1467,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// the common prefix is shared among the 4 sequences to save tokens
// we extract logits only from the last common token and from all ending tokens of each sequence
int s0 = 0;
while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
while (n_cur + (int) tasks[i1].required_tokens <= kv_size) {
auto& cur_task = tasks[i1];
int num_answers = cur_task.seq_tokens.size();
@ -1620,11 +1620,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
}
}
uint32_t n_ctx;
in.read((char *)&n_ctx, sizeof(n_ctx));
if (n_ctx > llama_n_ctx(ctx)) {
fprintf(stderr, "%s: %s has been computed with %u, while the current context is %d. Increase it with -c and retry\n",
__func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
uint32_t kv_size;
in.read((char *)&kv_size, sizeof(kv_size));
if (kv_size > llama_kv_size(ctx)) {
fprintf(stderr, "%s: %s has been computed with %u, while the current KV Cache size is %d. Increase it with -kv and retry\n",
__func__, params.logits_file.c_str(), kv_size, params.kv_size);
}
int n_vocab, n_chunk;
@ -1638,22 +1638,22 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
}
std::vector<llama_token> tokens(n_ctx * n_chunk);
std::vector<llama_token> tokens(kv_size * n_chunk);
if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
return;
}
const int n_batch = params.n_batch;
const int num_batches = (n_ctx + n_batch - 1)/n_batch;
const int num_batches = (kv_size + n_batch - 1)/n_batch;
const int nv = 2*((n_vocab + 1)/2) + 4;
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
std::vector<uint16_t> log_probs_uint16(size_t(kv_size - 1 - kv_size/2) * nv);
std::vector<float> kld_values(size_t(kv_size - 1 - kv_size /2)*n_chunk);
std::vector<float> logits;
if (num_batches > 1) {
logits.reserve(n_ctx * n_vocab);
logits.reserve(kv_size * n_vocab);
}
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
@ -1672,8 +1672,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
auto kld_ptr = kld_values.data();
for (int i = 0; i < n_chunk; ++i) {
const int start = i * n_ctx;
const int end = start + n_ctx;
const int start = i * kv_size;
const int end = start + kv_size;
const auto t_start = std::chrono::high_resolution_clock::now();
@ -1726,11 +1726,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n");
}
const int first = n_ctx/2;
const int first = kv_size/2;
const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, kv_size - 1 - first,
workers, log_probs_uint16, kld, kld_ptr);
kld_ptr += n_ctx - 1 - first;
kld_ptr += kv_size - 1 - first;
auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
@ -1788,12 +1788,12 @@ int main(int argc, char ** argv) {
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
params.n_batch = std::min(params.n_batch, params.kv_size);
if (params.ppl_stride > 0) {
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
params.n_ctx, params.n_ctx + params.ppl_stride/2);
params.n_ctx += params.ppl_stride/2;
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting KV size from %d to %d\n",
params.kv_size, params.kv_size + params.ppl_stride / 2);
params.kv_size += params.ppl_stride/2;
}
print_build_info();
@ -1823,9 +1823,9 @@ int main(int argc, char ** argv) {
}
const int n_ctx_train = llama_n_ctx_train(model);
if (params.n_ctx > n_ctx_train) {
if (params.kv_size > n_ctx_train) {
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
__func__, n_ctx_train, params.n_ctx);
__func__, n_ctx_train, params.kv_size);
}
// print system information

View File

@ -319,7 +319,7 @@ int main(int argc, char ** argv) {
}
auto cparams = llama_context_default_params();
cparams.n_ctx = 256;
cparams.kv_size = 256;
cparams.seed = 1;
ctx = llama_new_context_with_model(model, cparams);

View File

@ -12,7 +12,7 @@ PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}
N_THREAD="${N_THREAD:-12}"
# Note: you can also override the generation options by specifying them on the command line:
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
GEN_OPTIONS="${GEN_OPTIONS:---kv_size 4096 --batch-size 1024}"
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS

View File

@ -174,7 +174,7 @@ node index.js
`repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1).
`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
`repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = kv-size).
`penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true).
@ -239,7 +239,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`
- `generation_settings`: The provided options above excluding `prompt` but including `kv_size`, `model`
- `model`: The path to the model loaded with `-m`
- `prompt`: The provided `prompt`
- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token
@ -249,7 +249,7 @@ Notice that each `probs` is an array of length `n_probs`.
- `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second`
- `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the KV size (`kv_size`)
- **POST** `/tokenize`: Tokenize a given text.
@ -404,7 +404,7 @@ Notice that each `probs` is an array of length `n_probs`.
"mirostat_eta": 0.10000000149011612,
"mirostat_tau": 5.0,
"model": "llama-2-7b-32k-instruct.Q2_K.gguf",
"n_ctx": 2048,
"kv_size": 2048,
"n_keep": 0,
"n_predict": 100000,
"n_probs": 0,

View File

@ -155,7 +155,7 @@ struct llama_client_slot
int64_t t_last_used = -1;
// generation props
int32_t n_ctx = 0; // context size per slot
int32_t kv_size = 0; // KV size per slot
int32_t n_past = 0;
int32_t n_decoded = 0;
int32_t n_remaining = -1;
@ -325,7 +325,7 @@ struct llama_server_context
bool all_slots_are_idle = false;
bool add_bos_token = true;
int32_t n_ctx; // total context for all clients / slots
int32_t kv_size; // total KV Cache for all clients / slots
// system prompt
bool system_need_update = false;
@ -369,8 +369,8 @@ struct llama_server_context
return false;
}
if (params.n_ctx < 2048) { // request larger context for the image embedding
params.n_ctx = 2048;
if (params.kv_size < 2048) { // request larger context for the image embedding
params.kv_size = 2048;
}
}
@ -392,7 +392,7 @@ struct llama_server_context
}
}
n_ctx = llama_n_ctx(ctx);
kv_size = llama_kv_size(ctx);
add_bos_token = llama_should_add_bos_token(model);
@ -403,7 +403,7 @@ struct llama_server_context
// create slots
all_slots_are_idle = true;
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
const int32_t kv_size_slot = kv_size / params.n_parallel;
LOG_TEE("Available slots:\n");
for (int i = 0; i < params.n_parallel; i++)
@ -411,10 +411,10 @@ struct llama_server_context
llama_client_slot slot;
slot.id = i;
slot.n_ctx = n_ctx_slot;
slot.kv_size = kv_size_slot;
slot.n_predict = params.n_predict;
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
LOG_TEE(" -> Slot %i - max KV Size: %i\n", slot.id, kv_size_slot);
const int ga_n = params.grp_attn_n;
const int ga_w = params.grp_attn_w;
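For example, starting the server with a total KV size of 4096 and 4 parallel slots gives each slot a 1024-token budget (kv_size_slot = kv_size / n_parallel), so a single client can never consume the whole cache.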
@ -423,7 +423,7 @@ struct llama_server_context
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
//GGML_ASSERT(kv_size >= n_ctx_train * ga_n && "kv_size must be at least n_ctx_train * ga_n"); // NOLINT
LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w);
}
@ -439,7 +439,7 @@ struct llama_server_context
default_generation_settings_for_props = get_formated_generation(slots.front());
default_generation_settings_for_props["seed"] = -1;
batch = llama_batch_init(n_ctx, 0, params.n_parallel);
batch = llama_batch_init(kv_size, 0, params.n_parallel);
}
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
@ -1065,7 +1065,7 @@ struct llama_server_context
}
return json {
{"n_ctx", slot.n_ctx},
{"kv_size", slot.kv_size},
{"n_predict", slot.n_predict},
{"model", params.model_alias},
{"seed", slot.params.seed},
@ -1474,7 +1474,7 @@ struct llama_server_context
{
if (slot.ga_n == 1)
{
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.kv_size)
{
// Shift context
const int n_left = system_tokens.size() + slot.n_past - slot.params.n_keep - 1;
@ -1496,7 +1496,7 @@ struct llama_server_context
slot.truncated = true;
LOG_VERBOSE("context shift", {
{ "n_ctx", n_ctx },
{ "kv_size", kv_size },
{ "n_keep", params.n_keep },
{ "n_left", n_left },
});
@ -1598,12 +1598,12 @@ struct llama_server_context
{
slot.params.n_keep = slot.num_prompt_tokens;
}
slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
slot.params.n_keep = std::min(slot.kv_size - 4, slot.params.n_keep);
// if input prompt is too big, truncate it
if (slot.num_prompt_tokens >= slot.n_ctx)
if (slot.num_prompt_tokens >= slot.kv_size)
{
const int n_left = slot.n_ctx - slot.params.n_keep;
const int n_left = slot.kv_size - slot.params.n_keep;
const int n_block_size = n_left / 2;
const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
@ -1611,7 +1611,7 @@ struct llama_server_context
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end());
LOG_VERBOSE("input truncated", {
{"n_ctx", slot.n_ctx},
{"kv_size", slot.kv_size},
{"n_keep", slot.params.n_keep},
{"n_left", n_left},
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
@ -1620,7 +1620,7 @@ struct llama_server_context
prompt_tokens = new_tokens;
slot.num_prompt_tokens = prompt_tokens.size();
GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx);
GGML_ASSERT(slot.num_prompt_tokens < slot.kv_size);
}
if (!slot.params.cache_prompt)
@ -1873,7 +1873,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.n_ctx);
printf(" -kv N, --kv-size N Specify the total size of the KV cache (default: %d)\n", params.kv_size);
printf(" --rope-scaling {none,linear,yarn}\n");
printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
@ -2043,16 +2043,16 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
server_print_usage(argv[0], default_params, default_sparams);
exit(0);
}
else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
else if (arg == "-c" || arg == "--ctx-size" || arg == "--kv_size")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
LOG_WARNING("-c,--ctx-size,--ctx_size option is deprecated, use --kv-size instead",
{{"--ctx_size", params.n_ctx}});
params.kv_size = std::stoi(argv[i]);
LOG_WARNING("-c,--ctx-size,--kv_size option is deprecated, use --kv-size instead",
{{"--kv_size", params.kv_size}});
}
else if (arg == "-kv" || arg == "--kv-size" || arg == "--kv_size")
{
@ -2061,7 +2061,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
params.kv_size = std::stoi(argv[i]);
}
else if (arg == "--rope-scaling")
{

View File

@ -7,7 +7,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for g
...
main: n_len = 32, n_ctx = 2048, n_parallel = 1, n_kv_req = 32
main: n_len = 32, kv_size = 2048, n_parallel = 1, n_kv_req = 32
Hello my name is Shawn and I'm a 20 year old male from the United States. I'm a 20 year old

View File

@ -52,7 +52,7 @@ int main(int argc, char ** argv) {
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = 2048;
ctx_params.kv_size = 2048;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@ -68,15 +68,15 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
const int kv_size = llama_kv_size(ctx);
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
LOG_TEE("\n%s: n_len = %d, kv_size = %d, n_kv_req = %d\n", __func__, n_len, kv_size, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
if (n_kv_req > kv_size) {
LOG_TEE("%s: error: n_kv_req > kv_size, the required KV cache size is not big enough\n", __func__);
LOG_TEE("%s: either reduce n_len or increase kv_size\n", __func__);
return 1;
}

View File

@ -116,7 +116,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> inp;
inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
const int max_context_size = llama_n_ctx(ctx_tgt);
const int max_context_size = llama_kv_size(ctx_tgt);
const int max_tokens_list_size = max_context_size - 4;
if ((int) inp.size() > max_tokens_list_size) {
@ -172,8 +172,8 @@ int main(int argc, char ** argv) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
}
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, n_seq_dft);
llama_batch batch_dft = llama_batch_init(params.kv_size, 0, 1);
llama_batch batch_tgt = llama_batch_init(params.kv_size, 0, n_seq_dft);
const auto t_dec_start = ggml_time_us();

View File

@ -22,7 +22,7 @@
struct my_llama_hparams {
uint32_t n_vocab = 32000;
uint32_t n_ctx = 512;
uint32_t kv_size = 512;
uint32_t n_embd = 4096;
uint32_t n_head = 32;
uint32_t n_layer = 32;
@ -112,7 +112,7 @@ static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
static void print_params(struct my_llama_hparams * params) {
printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
printf("%s: n_ctx: %u\n", __func__, params->n_ctx);
printf("%s: kv_size: %u\n", __func__, params->kv_size);
printf("%s: n_embd: %u\n", __func__, params->n_embd);
printf("%s: n_head: %u\n", __func__, params->n_head);
printf("%s: n_ff: %u\n", __func__, params->n_ff);
@ -272,7 +272,7 @@ static struct ggml_tensor * llama_build_train_graphs(
const int n_past = 0;
const int N = n_tokens;
const auto & hparams = model->hparams;
const int n_ctx = hparams.n_ctx;
const int kv_size = hparams.kv_size;
const int n_vocab = hparams.n_vocab;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
@ -295,13 +295,13 @@ static struct ggml_tensor * llama_build_train_graphs(
ggml_set_input(KQ_pos);
// rope has so much parameters that we make a custom function for it
auto rope = [ctx, KQ_pos, n_rot, n_ctx, rope_freq_base, rope_freq_scale]
auto rope = [ctx, KQ_pos, n_rot, kv_size, rope_freq_base, rope_freq_scale]
(struct ggml_tensor * t) -> struct ggml_tensor * {
// not capturing these, to silence warnings
const int rope_mode = 0;
return ggml_rope_custom(
ctx, t, KQ_pos, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
ctx, t, KQ_pos, n_rot, rope_mode, kv_size, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};
@ -487,8 +487,8 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
GGUF_GET_KEY(fctx, ftype_u, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_GENERAL_FILE_TYPE);
GGML_ASSERT((enum llama_ftype) ftype_u == LLAMA_FTYPE_ALL_F32);
// n_ctx was not saved in earlier checkpoint file versions, so we make it optional here
GGUF_GET_KEY(fctx, model->hparams.n_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
// kv_size was not saved in earlier checkpoint file versions, so we make it optional here
GGUF_GET_KEY(fctx, model->hparams.kv_size, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_CONTEXT_LENGTH));
GGUF_GET_KEY(fctx, model->hparams.n_embd, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
GGUF_GET_KEY(fctx, model->hparams.n_ff, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
@ -543,7 +543,7 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
// set hparams
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.n_ctx );
gguf_set_val_u32(fctx, kv(LLM_KV_CONTEXT_LENGTH), model->hparams.kv_size );
gguf_set_val_u32(fctx, kv(LLM_KV_EMBEDDING_LENGTH), model->hparams.n_embd );
gguf_set_val_u32(fctx, kv(LLM_KV_FEED_FORWARD_LENGTH), model->hparams.n_ff );
gguf_set_val_u32(fctx, kv(LLM_KV_ATTENTION_HEAD_COUNT), model->hparams.n_head );
@ -945,7 +945,7 @@ int main(int argc, char ** argv) {
struct my_llama_model model;
model.hparams.n_vocab = llama_n_vocab(lmodel);
model.hparams.n_ctx = params.common.n_ctx;
model.hparams.kv_size = params.common.n_ctx;
model.hparams.n_embd = params.n_embd;
model.hparams.n_head = params.n_head;
model.hparams.n_layer = params.n_layer;
@ -982,9 +982,9 @@ int main(int argc, char ** argv) {
printf("%s: init model\n", __func__);
bool existed = load_checkpoint_file(params.common.fn_checkpoint_in, &model, train);
if (existed) {
// overwrite last n_ctx with user provided n_ctx
// overwrite last kv_size with user provided kv_size
if (params.common.custom_n_ctx) {
model.hparams.n_ctx = params.common.n_ctx;
model.hparams.kv_size = params.common.n_ctx;
}
const bool opt_past_changed = opt->params.past != params.common.opt_past;
@ -1031,7 +1031,7 @@ int main(int argc, char ** argv) {
printf("%s: opt_size = %zu bytes (%.1f MB)\n", __func__, ggml_get_mem_size(opt->ctx), (float) ggml_get_mem_size(opt->ctx) / (1024.0f*1024.0f));
printf("%s: opt iter %d\n", __func__, opt->iter);
int n_tokens = model.hparams.n_ctx;
int n_tokens = model.hparams.kv_size;
int n_vocab = model.hparams.n_vocab;
int n_batch = params.common.n_batch;

llama.cpp
View File

@ -1607,7 +1607,7 @@ struct llama_hparams {
};
struct llama_cparams {
uint32_t n_ctx; // context size used during inference
uint32_t kv_size; // KV Cache size used during inference
uint32_t n_batch;
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
@ -1923,9 +1923,9 @@ struct llama_context {
struct ggml_tensor * inp_tokens; // I32 [n_batch]
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
struct ggml_tensor * inp_pos; // I32 [n_batch]
struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx]
struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
struct ggml_tensor * inp_KQ_pos; // F32 [kv_size]
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
struct ggml_tensor * inp_cls; // I32 [n_batch]
@ -1943,7 +1943,7 @@ static bool llama_kv_cache_init(
const llama_model & model,
ggml_type ktype,
ggml_type vtype,
uint32_t n_ctx,
uint32_t kv_size,
bool offload) {
const struct llama_hparams & hparams = model.hparams;
@ -1954,11 +1954,11 @@ static bool llama_kv_cache_init(
cache.has_shift = false;
cache.head = 0;
cache.size = n_ctx;
cache.size = kv_size;
cache.used = 0;
cache.cells.clear();
cache.cells.resize(n_ctx);
cache.cells.resize(kv_size);
#ifdef GGML_USE_CLBLAST
offload = false;
@ -1997,8 +1997,8 @@ static bool llama_kv_cache_init(
for (int i = 0; i < (int) n_layer; i++) {
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*kv_size);
ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*kv_size);
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
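For a rough sense of scale, each layer holds one K tensor of n_embd_k_gqa*kv_size elements and one V tensor of n_embd_v_gqa*kv_size elements, so for an assumed 7B-class LLaMA shape (32 layers, n_embd_k_gqa = n_embd_v_gqa = 4096) with an f16 cache and kv_size = 512 the cache takes about 2 * 32 * 4096 * 512 * 2 bytes ≈ 256 MiB, growing linearly with kv_size.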
@ -2029,19 +2029,19 @@ static bool llama_kv_cache_init(
static bool llama_kv_cache_find_slot(
struct llama_kv_cache & cache,
const struct llama_batch & batch) {
const uint32_t n_ctx = cache.size;
const uint32_t kv_size = cache.size;
const uint32_t n_tokens = batch.n_tokens;
if (n_tokens > n_ctx) {
LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
if (n_tokens > kv_size) {
LLAMA_LOG_ERROR("%s: n_tokens=%d > kv_size=%d\n", __func__, n_tokens, kv_size);
return false;
}
uint32_t n_tested = 0;
while (true) {
if (cache.head + n_tokens > n_ctx) {
n_tested += n_ctx - cache.head;
if (cache.head + n_tokens > kv_size) {
n_tested += kv_size - cache.head;
cache.head = 0;
continue;
}
@ -2060,7 +2060,7 @@ static bool llama_kv_cache_find_slot(
break;
}
if (n_tested >= n_ctx) {
if (n_tested >= kv_size) {
//LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
return false;
}
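Worked example of the search above: with kv_size = 512, head = 400 and a 200-token batch, head + n_tokens = 600 > kv_size, so n_tested is bumped by 112 and head wraps to 0; the scan then looks for a free run of 200 cells from the start of the cache, and if n_tested ever reaches kv_size there is no suitable slot and the function returns false.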
@ -3692,11 +3692,11 @@ static bool llm_load_tensors(
}
// create one context per buffer type
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
size_t kv_size = ggml_tensor_overhead() * ml.n_tensors;
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
for (auto & it : buft_layer_count) {
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_size =*/ kv_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
@ -3708,7 +3708,7 @@ static bool llm_load_tensors(
model.ctxs.push_back(ctx);
}
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
LLAMA_LOG_INFO("%s: ggml KV size = %7.2f MiB\n", __func__, model.ctxs.size()*kv_size/1024.0/1024.0);
// create tensors for the weights
{
@ -4584,7 +4584,7 @@ static void llm_build_k_shift(
struct ggml_cgraph * graph,
struct ggml_tensor * K_shift,
llm_rope_type type,
int64_t n_ctx,
int64_t kv_size,
float freq_base,
float freq_scale,
const llm_build_cb & cb) {
@ -4612,7 +4612,7 @@ static void llm_build_k_shift(
// we rotate only the first n_rot dimensions
ggml_rope_custom_inplace(ctx,
ggml_view_3d(ctx, kv.k_l[il],
n_embd_head_k, n_head_kv, n_ctx,
n_embd_head_k, n_head_kv, kv_size,
ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
0),
@ -4630,7 +4630,7 @@ static void llm_build_kv_store(
struct ggml_cgraph * graph,
struct ggml_tensor * k_cur,
struct ggml_tensor * v_cur,
int64_t n_ctx,
int64_t kv_size,
int32_t n_tokens,
int32_t kv_head,
const llm_build_cb & cb,
@ -4648,7 +4648,7 @@ static void llm_build_kv_store(
cb(k_cache_view, "k_cache_view", il);
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa,
( n_ctx)*ggml_element_size(kv.v_l[il]),
( kv_size)*ggml_element_size(kv.v_l[il]),
(kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il);
@ -4792,7 +4792,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int64_t n_ctx,
int64_t kv_size,
int32_t n_tokens,
int32_t n_kv,
float kq_scale,
@ -4851,8 +4851,8 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * v =
ggml_view_3d(ctx, kv.v_l[il],
n_kv, n_embd_head_v, n_head_kv,
ggml_element_size(kv.v_l[il])*n_ctx,
ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v,
ggml_element_size(kv.v_l[il])*kv_size,
ggml_element_size(kv.v_l[il])*kv_size *n_embd_head_v,
0);
cb(v, "v", il);
@ -4892,7 +4892,7 @@ static struct ggml_tensor * llm_build_kv(
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int64_t n_ctx,
int64_t kv_size,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
@ -4906,11 +4906,11 @@ static struct ggml_tensor * llm_build_kv(
ggml_build_forward_expand(graph, k_cur);
ggml_build_forward_expand(graph, v_cur);
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, kv_size, n_tokens, kv_head, cb, il);
struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
q_cur, kq_mask, kq_pos, kv_size, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il);
return cur;
@ -4926,7 +4926,7 @@ struct llm_build_context {
const int64_t n_embd;
const int64_t n_layer;
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
const int64_t kv_size; // user-specified KV Cache size (can be different from n_ctx_train)
const int64_t n_head;
const int64_t n_head_kv;
const int64_t n_embd_head_k;
@ -4946,7 +4946,7 @@ struct llm_build_context {
const float norm_rms_eps;
const int32_t n_tokens;
const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx)
const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_size)
const int32_t kv_head; // index of where we store new KV data in the cache
const int32_t n_orig_ctx;
@ -4973,7 +4973,7 @@ struct llm_build_context {
kv_self (lctx.kv_self),
n_embd (hparams.n_embd),
n_layer (hparams.n_layer),
n_ctx (cparams.n_ctx),
kv_size (cparams.kv_size),
n_head (hparams.n_head),
n_head_kv (hparams.n_head_kv),
n_embd_head_k (hparams.n_embd_head_k),
@ -4991,14 +4991,14 @@ struct llm_build_context {
norm_eps (hparams.f_norm_eps),
norm_rms_eps (hparams.f_norm_rms_eps),
n_tokens (batch.n_tokens),
n_kv (worst_case ? n_ctx : kv_self.n),
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
n_kv (worst_case ? kv_size : kv_self.n),
kv_head (worst_case ? kv_size - n_tokens : kv_self.head),
n_orig_ctx (cparams.n_yarn_orig_ctx),
do_rope_shift (worst_case || kv_self.has_shift),
pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
cb (cb),
buf_compute_meta (lctx.buf_compute_meta) {
// all initializations should be done in init()
// all initializations should be done in init()
}
void init() {
@ -5041,7 +5041,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5093,7 +5093,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5229,7 +5229,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5277,7 +5277,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5347,7 +5347,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5401,7 +5401,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5500,7 +5500,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5565,7 +5565,7 @@ struct llm_build_context {
cb(KQ_mask, "KQ_mask", -1);
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -5705,7 +5705,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Q, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5798,7 +5798,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -5899,7 +5899,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
} else {
// compute Q and K and RoPE them
@ -5930,7 +5930,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6043,7 +6043,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6140,7 +6140,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6209,7 +6209,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6262,7 +6262,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6332,7 +6332,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6377,7 +6377,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6446,7 +6446,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6498,7 +6498,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6567,7 +6567,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6625,7 +6625,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f, cb, il);
cb(cur, "kqv_out", il);
}
@ -6689,7 +6689,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6728,7 +6728,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * sa_out = cur;
@ -6827,7 +6827,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -6894,7 +6894,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -6936,7 +6936,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -7002,7 +7002,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -7054,7 +7054,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -7121,7 +7121,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -7172,8 +7172,8 @@ struct llm_build_context {
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -7253,7 +7253,7 @@ struct llm_build_context {
// shift the entire K-cache if needed
if (do_rope_shift) {
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, kv_size, freq_base, freq_scale, cb);
}
for (int il = 0; il < n_layer; ++il) {
@ -7304,8 +7304,8 @@ struct llm_build_context {
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, kv_size, n_tokens, kv_head, n_kv, 1.0f / sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
@ -7549,13 +7549,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
}
if (kv_self.has_shift) {
const int64_t n_ctx = cparams.n_ctx;
const int64_t kv_size = cparams.kv_size;
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
for (int i = 0; i < n_ctx; ++i) {
for (int i = 0; i < kv_size; ++i) {
data[i] = lctx.kv_self.cells[i].delta;
}
}
@ -7694,7 +7694,7 @@ static int llama_decode_internal(
// a heuristic, to avoid attending the full cache if it is not yet utilized
// after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
kv_self.n = std::min((int32_t) cparams.kv_size, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
@ -11148,7 +11148,7 @@ struct llama_model_params llama_model_default_params() {
struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.seed =*/ LLAMA_DEFAULT_SEED,
/*.n_ctx =*/ 512,
/*.kv_size =*/ 512,
/*.n_batch =*/ 512,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
@ -11328,7 +11328,7 @@ struct llama_context * llama_new_context_with_model(
cparams.offload_kqv = params.offload_kqv;
cparams.do_pooling = params.do_pooling;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.kv_size = params.kv_size == 0 ? hparams.n_ctx_train : params.kv_size;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
@ -11356,7 +11356,7 @@ struct llama_context * llama_new_context_with_model(
params.seed = time(NULL);
}
LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
LLAMA_LOG_INFO("%s: kv_size = %u\n", __func__, cparams.kv_size);
LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
@ -11447,7 +11447,7 @@ struct llama_context * llama_new_context_with_model(
ctx->backends.push_back(ctx->backend_cpu);
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
cparams.n_ctx, cparams.offload_kqv)) {
cparams.kv_size, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);
return nullptr;
@ -11490,9 +11490,9 @@ struct llama_context * llama_new_context_with_model(
ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch);
ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx);
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.kv_size, cparams.n_batch);
ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.kv_size);
ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.kv_size);
ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
@ -11531,8 +11531,8 @@ struct llama_context * llama_new_context_with_model(
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
// build worst-case graph
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
int n_past = cparams.n_ctx - n_tokens;
int n_tokens = (int)std::min(cparams.kv_size, cparams.n_batch);
int n_past = cparams.kv_size - n_tokens;
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0), true);
@ -11565,7 +11565,7 @@ struct llama_context * llama_new_context_with_model(
// Enter a blocking eval loop with dummy input, letting rank=0 drive the process
// TODO: needs fix after #3228
GGML_ASSERT(false && "not implemented");
//const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
//const std::vector<llama_token> tmp(ctx->model.hparams.kv_size, llama_token_bos(ctx));
//while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
llama_backend_free();
exit(1);
@ -11583,8 +11583,8 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
return &ctx->model;
}
uint32_t llama_n_ctx(const struct llama_context * ctx) {
return ctx->cparams.n_ctx;
uint32_t llama_kv_size(const struct llama_context * ctx) {
return ctx->cparams.kv_size;
}
uint32_t llama_n_batch(const struct llama_context * ctx) {
@ -11982,7 +11982,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
const auto n_layer = hparams.n_layer;
const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
const auto n_ctx = cparams.n_ctx;
const auto n_kv_req = cparams.kv_size;
const size_t kv_buf_size = kv_self.total_size();
const uint32_t kv_head = kv_self.head;
@ -12006,7 +12006,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
// v is not contiguous, copy row by row
tmp_buf.resize(elt_size*kv_head);
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_kv_req, tmp_buf.size());
data_ctx->write(tmp_buf.data(), tmp_buf.size());
}
}
@ -12093,7 +12093,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
const int n_layer = hparams.n_layer;
const int n_embd_k_gqa = hparams.n_embd_k_gqa();
const int n_embd_v_gqa = hparams.n_embd_v_gqa();
const int n_ctx = cparams.n_ctx;
const int n_kv_req = cparams.kv_size;
size_t kv_buf_size;
uint32_t kv_head;
@ -12118,7 +12118,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
// v is not contiguous, copy row by row
size_t v_row_size = elt_size*kv_head;
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_kv_req, v_row_size);
inp += v_row_size;
}
}

View File

@ -217,7 +217,7 @@ extern "C" {
struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context, 0 = from model
uint32_t kv_size; // KV Cache size
uint32_t n_batch; // prompt processing maximum batch size
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
@ -347,7 +347,7 @@ extern "C" {
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
LLAMA_API uint32_t llama_kv_size (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
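A minimal C++ usage sketch for the renamed parameter and accessor, not part of this diff and assuming an already loaded llama_model:

#include "llama.h"
#include <cstdio>

// Create a context whose KV cache holds kv_size tokens and read the value
// back through the renamed accessor.
static llama_context * make_context(llama_model * model, uint32_t kv_size) {
    llama_context_params cparams = llama_context_default_params();
    cparams.kv_size = kv_size;                        // previously cparams.n_ctx
    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx != NULL) {
        printf("kv_size = %u\n", llama_kv_size(ctx)); // previously llama_n_ctx(ctx)
    }
    return ctx;
}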

View File

@ -8,7 +8,7 @@ import sys
import yaml
CLI_ARGS_MAIN_PERPLEXITY = [
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "kv-size", "escape",
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
@ -27,7 +27,7 @@ CLI_ARGS_LLAMA_BENCH = [
]
CLI_ARGS_SERVER = [
"alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
"alias", "batch-size", "kv-size", "embedding", "host", "memory-f32", "lora", "lora-base",
"low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
"numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
"threads", "verbose"

View File

@ -1121,21 +1121,21 @@ struct test_rope : public test_case {
const std::array<int64_t, 4> ne;
int n_dims;
int mode;
int n_ctx;
int kv_size;
std::string vars() override {
return VARS_TO_STR5(type, ne, n_dims, mode, n_ctx);
return VARS_TO_STR5(type, ne, n_dims, mode, kv_size);
}
test_rope(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 10, 10, 1},
int n_dims = 10, int mode = 0, int n_ctx = 512)
: type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx) {}
: type(type), ne(ne), n_dims(n_dims), mode(mode), kv_size(n_ctx) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, n_ctx);
ggml_tensor * out = ggml_rope(ctx, a, pos, n_dims, mode, kv_size);
return out;
}
@ -1145,7 +1145,7 @@ struct test_rope : public test_case {
// pos
std::vector<int> data(ne[2]);
for (int i = 0; i < ne[2]; i++) {
data[i] = rand() % n_ctx;
data[i] = rand() % kv_size;
}
ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
} else {
@ -1545,7 +1545,7 @@ struct llama_hparams {
int32_t n_tokens;
// llm_build_context
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx
static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= kv_size
static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache
uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads