Merge commit 'bcc0eb4591bec5ec02fad3f2bdcb1b265052ea56' into ceb/nomic-vulkan

Jared Van Bortel 2024-01-08 15:50:18 -05:00
commit 8b65f4c5e5
11 changed files with 755 additions and 291 deletions

View File

@ -10,6 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
### Hot topics
- **llama.h API change for handling KV cache offloading and data type: https://github.com/ggerganov/llama.cpp/pull/4309**
- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167

View File

@ -278,8 +278,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.yarn_beta_slow = std::stof(argv[i]);
} else if (arg == "--memory-f32") {
params.memory_f16 = false;
} else if (arg == "--samplers") { } else if (arg == "--samplers") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -510,6 +508,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
params.infill = true;
} else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
params.dump_kv_cache = true;
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
params.no_kv_offload = true;
} else if (arg == "-ctk" || arg == "--cache-type-k") {
params.cache_type_k = argv[++i];
} else if (arg == "-ctv" || arg == "--cache-type-v") {
params.cache_type_v = argv[++i];
} else if (arg == "--multiline-input") { } else if (arg == "--multiline-input") {
params.multiline_input = true; params.multiline_input = true;
} else if (arg == "--simple-io") { } else if (arg == "--simple-io") {
@ -858,8 +862,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --no-penalize-nl do not penalize newline token\n");
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
@ -900,6 +902,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --verbose-prompt print prompt before generation\n"); printf(" --verbose-prompt print prompt before generation\n");
printf(" -dkvc, --dump-kv-cache\n"); printf(" -dkvc, --dump-kv-cache\n");
printf(" verbose print of the KV cache\n"); printf(" verbose print of the KV cache\n");
printf(" -nkvo, --no-kv-offload\n");
printf(" disable KV offload\n");
printf(" -ctk TYPE, --cache-type-k TYPE\n");
printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
printf(" -ctv TYPE, --cache-type-v TYPE\n");
printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@ -1015,6 +1023,29 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
return mparams;
}
static ggml_type kv_cache_type_from_str(const std::string & s) {
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
if (s == "q4_0") {
return GGML_TYPE_Q4_0;
}
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
throw std::runtime_error("Invalid cache type: " + s);
}
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto cparams = llama_context_default_params();
@ -1024,7 +1055,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
cparams.mul_mat_q = params.mul_mat_q;
cparams.seed = params.seed;
cparams.f16_kv = params.memory_f16;
cparams.logits_all = params.logits_all;
cparams.embedding = params.embedding;
cparams.rope_scaling_type = params.rope_scaling_type;
@ -1035,6 +1065,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.yarn_orig_ctx = params.yarn_orig_ctx;
cparams.offload_kqv = !params.no_kv_offload;
cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
return cparams;
}
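The new `cache_type_k`, `cache_type_v` and `no_kv_offload` options ultimately just set three fields on `llama_context_params`. As an illustration (not part of the commit), a minimal host-side sketch that sets the same fields directly through the llama.h API used in this change; the model path is a placeholder and error handling is reduced to asserts:

```cpp
// Sketch: select a quantized K cache and disable KV offloading via the
// llama_context_params fields wired up above (type_k, type_v, offload_kqv).
// "model.gguf" is a placeholder path.
#include "llama.h"
#include <cassert>

int main() {
    llama_backend_init(false);

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    assert(model != nullptr);

    llama_context_params cparams = llama_context_default_params();
    cparams.type_k      = GGML_TYPE_Q8_0; // same effect as -ctk q8_0
    cparams.type_v      = GGML_TYPE_F16;  // same effect as -ctv f16
    cparams.offload_kqv = false;          // same effect as --no-kv-offload

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    assert(ctx != nullptr);

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```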
@ -1447,7 +1481,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
}
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);

View File

@ -100,7 +100,6 @@ struct gpt_params {
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
@ -125,6 +124,10 @@ struct gpt_params {
bool verbose_prompt = false; // print prompt tokens before generation
bool infill = false; // use infill mode
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
bool no_kv_offload = false; // disable KV offloading
std::string cache_type_k = "f16"; // KV cache data type for the K
std::string cache_type_v = "f16"; // KV cache data type for the V
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector

View File

@ -53,6 +53,13 @@ static std::vector<T> split(const std::string & str, char delim) {
return values;
}
template<typename T, typename F>
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
std::vector<std::string> str_values;
std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
return str_values;
}
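For reference (not part of the commit), the helper above is meant to be combined with `ggml_type_name` and the existing `join` helper to render lists of cache types in `print_usage`. A standalone sketch of the same idiom, with a local stand-in for `join`:

```cpp
// Standalone sketch: map a list of ggml_type values to their names, then join
// them with ','. Assumes ggml.h for ggml_type / ggml_type_name; join() here is
// a local stand-in for the helper already present in llama-bench.cpp.
#include "ggml.h"
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <string>
#include <vector>

static std::string join(const std::vector<std::string> & values, const std::string & delim) {
    std::string result;
    for (size_t i = 0; i < values.size(); i++) {
        if (i > 0) { result += delim; }
        result += values[i];
    }
    return result;
}

int main() {
    const std::vector<ggml_type> type_k = { GGML_TYPE_F16, GGML_TYPE_Q8_0 };

    std::vector<std::string> names;
    std::transform(type_k.begin(), type_k.end(), std::back_inserter(names),
                   ggml_type_name); // same functor passed as F in transform_to_str

    printf("-ctk defaults: %s\n", join(names, ",").c_str()); // prints "f16,q8_0"
    return 0;
}
```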
template<typename T>
static T avg(const std::vector<T> & v) {
if (v.empty()) {
@ -126,7 +133,8 @@ struct cmd_params {
std::vector<int> n_prompt;
std::vector<int> n_gen;
std::vector<int> n_batch;
std::vector<bool> f32_kv;
std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
std::vector<int> main_gpu;
@ -142,7 +150,8 @@ static const cmd_params cmd_params_defaults = {
/* n_prompt */ {512},
/* n_gen */ {128},
/* n_batch */ {512},
/* f32_kv */ {false},
/* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_num_physical_cores()},
/* n_gpu_layers */ {99},
/* main_gpu */ {0},
@ -162,7 +171,8 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); printf(" -ctk <t>, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv <t>, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@ -173,9 +183,32 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n"); printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
} }
static ggml_type ggml_type_from_name(const std::string & s) {
if (s == "f16") {
return GGML_TYPE_F16;
}
if (s == "q8_0") {
return GGML_TYPE_Q8_0;
}
if (s == "q4_0") {
return GGML_TYPE_Q4_0;
}
if (s == "q4_1") {
return GGML_TYPE_Q4_1;
}
if (s == "q5_0") {
return GGML_TYPE_Q5_0;
}
if (s == "q5_1") {
return GGML_TYPE_Q5_1;
}
return GGML_TYPE_COUNT;
}
static cmd_params parse_cmd_params(int argc, char ** argv) {
cmd_params params;
std::string arg;
@ -224,13 +257,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "--memory-f32") { } else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = split<int>(argv[i], split_delim); auto p = split<std::string>(argv[i], split_delim);
params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end()); std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
if (gt == GGML_TYPE_COUNT) {
invalid_param = true;
break;
}
types.push_back(gt);
}
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
} else if (arg == "-ctv" || arg == "--cache-type-v") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
if (gt == GGML_TYPE_COUNT) {
invalid_param = true;
break;
}
types.push_back(gt);
}
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
} else if (arg == "-t" || arg == "--threads") { } else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -321,7 +379,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; }
if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; }
if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; }
if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; }
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@ -336,7 +395,8 @@ struct cmd_params_instance {
int n_prompt;
int n_gen;
int n_batch;
bool f32_kv;
ggml_type type_k;
ggml_type type_v;
int n_threads;
int n_gpu_layers;
int main_gpu;
@ -365,7 +425,8 @@ struct cmd_params_instance {
cparams.n_ctx = n_prompt + n_gen;
cparams.n_batch = n_batch;
cparams.f16_kv = !f32_kv;
cparams.type_k = type_k;
cparams.type_v = type_v;
cparams.mul_mat_q = mul_mat_q;
return cparams;
@ -380,7 +441,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & nb : params.n_batch)
for (const auto & fk : params.f32_kv)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & mmq : params.mul_mat_q)
for (const auto & nt : params.n_threads) {
cmd_params_instance instance = {
@ -388,7 +450,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
/* .n_prompt = */ n_prompt,
/* .n_gen = */ n_gen,
/* .n_batch = */ nb,
/* .f32_kv = */ fk,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg,
@ -410,7 +473,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
for (const auto & nb : params.n_batch)
for (const auto & fk : params.f32_kv)
for (const auto & tk : params.type_k)
for (const auto & tv : params.type_v)
for (const auto & mmq : params.mul_mat_q)
for (const auto & nt : params.n_threads) {
for (const auto & n_prompt : params.n_prompt) {
@ -422,7 +486,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_prompt = */ n_prompt,
/* .n_gen = */ 0,
/* .n_batch = */ nb,
/* .f32_kv = */ fk,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg,
@ -441,7 +506,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .n_prompt = */ 0,
/* .n_gen = */ n_gen,
/* .n_batch = */ nb,
/* .f32_kv = */ fk,
/* .type_k = */ tk,
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .main_gpu = */ mg,
@ -489,7 +555,8 @@ struct test {
uint64_t model_n_params;
int n_batch;
int n_threads;
bool f32_kv;
ggml_type type_k;
ggml_type type_v;
int n_gpu_layers;
int main_gpu;
bool mul_mat_q;
@ -508,7 +575,8 @@ struct test {
model_n_params = llama_model_n_params(lmodel);
n_batch = inst.n_batch;
n_threads = inst.n_threads;
f32_kv = inst.f32_kv;
type_k = inst.type_k;
type_v = inst.type_v;
n_gpu_layers = inst.n_gpu_layers;
main_gpu = inst.main_gpu;
mul_mat_q = inst.mul_mat_q;
@ -571,7 +639,7 @@ struct test {
"cuda", "opencl", "metal", "gpu_blas", "blas", "cuda", "opencl", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "f16_kv", "n_batch", "n_threads", "type_k", "type_v",
"n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split", "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
"n_prompt", "n_gen", "test_time", "n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns", "avg_ns", "stddev_ns",
@ -621,7 +689,7 @@ struct test {
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time,
std::to_string(avg_ns()), std::to_string(stdev_ns()),
@ -805,8 +873,11 @@ struct markdown_printer : public printer {
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.push_back("n_batch");
}
if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
fields.push_back("f16_kv");
if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
fields.push_back("type_k");
}
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
fields.push_back("type_v");
}
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
fields.push_back("main_gpu");

View File

@ -321,7 +321,6 @@ int main(int argc, char ** argv) {
auto cparams = llama_context_default_params();
cparams.n_ctx = 256;
cparams.seed = 1;
cparams.f16_kv = false;
ctx = llama_new_context_with_model(model, cparams);

View File

@ -2108,10 +2108,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.yarn_beta_slow = std::stof(argv[i]);
}
else if (arg == "--memory-f32" || arg == "--memory_f32")
{
params.memory_f16 = false;
}
else if (arg == "--threads" || arg == "-t") else if (arg == "--threads" || arg == "-t")
{ {
if (++i >= argc) if (++i >= argc)

View File

@ -7,6 +7,7 @@
#include <stdio.h>
#include <atomic>
#include <assert.h>
#include <float.h>
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
@ -4559,6 +4560,116 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
cpy_1(cx + x_offset, cdst + dst_offset);
}
static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
const float * xi = (const float *) cxi;
block_q8_0 * dsti = (block_q8_0 *) cdsti;
float amax = 0.0f; // absolute max
for (int j = 0; j < QK8_0; j++) {
const float v = xi[j];
amax = fmaxf(amax, fabsf(v));
}
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
dsti->d = d;
for (int j = 0; j < QK8_0; ++j) {
const float x0 = xi[j]*id;
dsti->qs[j] = roundf(x0);
}
}
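For readers less familiar with CUDA, the block conversion above is plain per-block quantization. A host-side C++ reference of the same math (not part of the commit; the struct below is an illustrative stand-in, ggml's real block_q8_0 stores d as a half rather than a float):

```cpp
// Host-side reference of cpy_blck_f32_q8_0 above: find the absolute max of the
// 32 floats in a block, derive a scale d so values map into [-127, 127], then
// store rounded int8 quants.
#include <cmath>
#include <cstdint>
#include <cstdio>

constexpr int QK8_0 = 32;

struct q8_0_block_ref {
    float  d;           // per-block scale (illustrative; ggml uses a half here)
    int8_t qs[QK8_0];   // quantized values
};

static q8_0_block_ref quantize_block_q8_0_ref(const float * x) {
    float amax = 0.0f; // absolute max over the block
    for (int j = 0; j < QK8_0; j++) {
        amax = std::fmax(amax, std::fabs(x[j]));
    }

    q8_0_block_ref blk{};
    blk.d = amax / ((1 << 7) - 1);               // 127 is the largest magnitude used
    const float id = blk.d ? 1.0f/blk.d : 0.0f;  // inverse scale, 0 for an all-zero block

    for (int j = 0; j < QK8_0; j++) {
        blk.qs[j] = (int8_t) std::round(x[j]*id);
    }
    return blk;
}

int main() {
    float x[QK8_0];
    for (int j = 0; j < QK8_0; j++) {
        x[j] = 0.1f*(j - QK8_0/2); // arbitrary test data
    }
    const q8_0_block_ref blk = quantize_block_q8_0_ref(x);
    printf("d = %f, qs[0] = %d, qs[%d] = %d\n", blk.d, blk.qs[0], QK8_0 - 1, blk.qs[QK8_0 - 1]);
    return 0;
}
```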
static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
const float * xi = (const float *) cxi;
block_q4_0 * dsti = (block_q4_0 *) cdsti;
float amax = 0.0f;
float vmax = 0.0f;
for (int j = 0; j < QK4_0; ++j) {
const float v = xi[j];
if (amax < fabsf(v)) {
amax = fabsf(v);
vmax = v;
}
}
const float d = vmax / -8;
const float id = d ? 1.0f/d : 0.0f;
dsti->d = d;
for (int j = 0; j < QK4_0/2; ++j) {
const float x0 = xi[0 + j]*id;
const float x1 = xi[QK4_0/2 + j]*id;
const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f));
const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f));
dsti->qs[j] = xi0;
dsti->qs[j] |= xi1 << 4;
}
}
static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
const float * xi = (const float *) cxi;
block_q4_1 * dsti = (block_q4_1 *) cdsti;
float vmin = FLT_MAX;
float vmax = -FLT_MAX;
for (int j = 0; j < QK4_1; ++j) {
const float v = xi[j];
if (v < vmin) vmin = v;
if (v > vmax) vmax = v;
}
const float d = (vmax - vmin) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
dsti->dm.x = d;
dsti->dm.y = vmin;
for (int j = 0; j < QK4_1/2; ++j) {
const float x0 = (xi[0 + j] - vmin)*id;
const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f));
const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f));
dsti->qs[j] = xi0;
dsti->qs[j] |= xi1 << 4;
}
}
template <cpy_kernel_t cpy_blck, int qk>
static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;
if (i >= ne) {
return;
}
const int i02 = i / (ne00*ne01);
const int i01 = (i - i02*ne01*ne00) / ne00;
const int i00 = (i - i02*ne01*ne00 - i01*ne00);
const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
const int i12 = i / (ne10*ne11);
const int i11 = (i - i12*ne10*ne11) / ne10;
const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
cpy_blck(cx + x_offset, cdst + dst_offset);
}
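The template kernel above assigns one thread to one block of qk consecutive source floats and divides the innermost destination index by qk because one quantized destination block covers qk source values. A host-side walk-through of the same index arithmetic (not part of the commit; shapes and strides below are example values only):

```cpp
// Replays the index math of cpy_f32_q: unflatten the linear element index i
// into (i02, i01, i00) for the f32 source and (i12, i11, i10) for the quantized
// destination, with i10 counting blocks (hence the division by qk).
#include <cstdio>

struct offsets { int x_offset, dst_offset; };

static offsets cpy_f32_q_offsets(int i, int qk,
                                 int ne00, int ne01, int nb00, int nb01, int nb02,
                                 int ne10, int ne11, int nb10, int nb11, int nb12) {
    // source (f32) indices and byte offset
    const int i02 = i / (ne00*ne01);
    const int i01 = (i - i02*ne01*ne00) / ne00;
    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;

    // destination (quantized) indices; i10 counts whole blocks
    const int i12 = i / (ne10*ne11);
    const int i11 = (i - i12*ne10*ne11) / ne10;
    const int i10 = (i - i12*ne10*ne11 - i11*ne10) / qk;
    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;

    return { x_offset, dst_offset };
}

int main() {
    // example: 64x2 f32 rows copied into Q8_0 blocks of 32 values (34 bytes each)
    const int qk = 32, ne00 = 64, ne01 = 2, ne10 = 64, ne11 = 2;
    const int nb00 = 4,  nb01 = 4*ne00, nb02 = 4*ne00*ne01;              // f32 byte strides
    const int nb10 = 34, nb11 = 34*(ne10/qk), nb12 = 34*(ne10/qk)*ne11;  // Q8_0 byte strides

    for (int i = 0; i < ne00*ne01; i += qk) { // one step per block, like one thread per block
        const offsets o = cpy_f32_q_offsets(i, qk, ne00, ne01, nb00, nb01, nb02,
                                            ne10, ne11, nb10, nb11, nb12);
        printf("i = %3d -> src byte %5d, dst byte %4d\n", i, o.x_offset, o.dst_offset);
    }
    return 0;
}
```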
static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
@ -5737,6 +5848,39 @@ static void ggml_cpy_f32_f16_cuda(
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
}
static void ggml_cpy_f32_q8_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
GGML_ASSERT(ne % QK8_0 == 0);
const int num_blocks = ne / QK8_0;
cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
}
static void ggml_cpy_f32_q4_0_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_0 == 0);
const int num_blocks = ne / QK4_0;
cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
}
static void ggml_cpy_f32_q4_1_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
const int ne10, const int ne11, const int nb10, const int nb11, const int nb12, cudaStream_t stream) {
GGML_ASSERT(ne % QK4_1 == 0);
const int num_blocks = ne / QK4_1;
cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12);
}
static void ggml_cpy_f16_f16_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@ -6093,20 +6237,21 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
const enum ggml_type type = src->type;
const int64_t ts = ggml_type_size(type);
const int64_t bs = ggml_blck_size(type);
int64_t i1_diff = i1_high - i1_low;
const int64_t i1_diff = i1_high - i1_low;
const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
if (nb0 == ts && nb1 == ts*ne0/bs) {
if (nb0 == ts && nb1 == ts*(ne0/bs)) {
return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
}
if (nb0 == ts) {
return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
return cudaMemcpy2DAsync(dst_ptr, ts*(ne0/bs), x, nb1, ts*(ne0/bs), i1_diff, kind, stream);
}
GGML_ASSERT(bs == 1 && "TODO: implement bs != 1");
for (int64_t i1 = 0; i1 < i1_diff; i1++) {
const void * rx = (const void *) ((const char *) x + i1*nb1);
void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
void * rd = (void *) (dst_ptr + i1*ts*ne0);
// pretend the row is a matrix with cols=1
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
cudaError_t r = cudaMemcpy2DAsync(rd, ts, rx, nb0, ts, ne0, kind, stream);
if (r != cudaSuccess) { return r; }
}
return cudaSuccess;
@ -6474,6 +6619,8 @@ inline void ggml_cuda_op_mul_mat_vec_q(
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
GGML_ASSERT(ggml_nrows(src1) == 1);
const int64_t ne00 = src0->ne[0];
const int64_t row_diff = row_high - row_low;
@ -6533,7 +6680,8 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
size_t ash;
dfloat * src1_dfloat = nullptr; // dfloat == half
bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
bool src1_convert_f16 =
src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
@ -7103,10 +7251,9 @@ static void ggml_cuda_op_mul_mat(
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
const bool src0_is_contiguous = ggml_is_contiguous(src0);
const bool src1_is_contiguous = ggml_is_contiguous(src1);
const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ?
ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
GGML_ASSERT(!(split && ne02 > 1));
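The `GGML_PAD(ne10, MATRIX_ROW_PADDING)` form replaces the earlier ternary; both round `ne10` up to the next multiple of `MATRIX_ROW_PADDING`. A quick equivalence check with a generic round-up helper (not a claim about the exact macro body in ggml; the padding value is an example):

```cpp
// Checks that the old ternary and a round-up-to-multiple helper agree.
#include <cassert>
#include <cstdint>

static int64_t round_up(int64_t x, int64_t n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    const int64_t MATRIX_ROW_PADDING = 512; // example padding granularity

    for (int64_t ne10 = 1; ne10 <= 4096; ne10++) {
        const int64_t old_style = ne10 % MATRIX_ROW_PADDING == 0 ?
            ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
        assert(old_style == round_up(ne10, MATRIX_ROW_PADDING));
    }
    return 0;
}
```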
@ -7231,7 +7378,7 @@ static void ggml_cuda_op_mul_mat(
const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
// for split tensors the data begins at i0 == i0_offset_low
char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * ne01*ne00*src0_ts/src0_bs;
char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset;
float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff);
@ -7698,10 +7845,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
#ifdef GGML_CUDA_FORCE_DMMV
const bool use_mul_mat_vec_q = false;
#else
const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
#endif // GGML_CUDA_FORCE_DMMV
if (use_mul_mat_vec_q) {
// NOTE: this kernel does not support ggml_nrows(src1) > 1
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
} else {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
@ -7770,14 +7918,17 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
ggml_cpy_f32_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
ne10, ne11, nb10, nb11, nb12, main_stream);
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
ne10, ne11, nb10, nb11, nb12, main_stream);
ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
ggml_cpy_f16_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02,
ne10, ne11, nb10, nb11, nb12, main_stream);
ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
@ -7788,6 +7939,7 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
}
static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
// TODO: why do we pass dst as src1 here?
ggml_cuda_cpy(src0, dst, nullptr);
(void) src1;
}

View File

@ -118,6 +118,11 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(im2col_f16);
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
GGML_METAL_DECL_KERNEL(cpy_f32_q8_0);
GGML_METAL_DECL_KERNEL(cpy_f32_q4_0);
GGML_METAL_DECL_KERNEL(cpy_f32_q4_1);
//GGML_METAL_DECL_KERNEL(cpy_f32_q5_0);
//GGML_METAL_DECL_KERNEL(cpy_f32_q5_1);
GGML_METAL_DECL_KERNEL(cpy_f16_f16);
GGML_METAL_DECL_KERNEL(concat);
GGML_METAL_DECL_KERNEL(sqr);
@ -324,6 +329,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(im2col_f16);
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
GGML_METAL_ADD_KERNEL(cpy_f32_q8_0);
GGML_METAL_ADD_KERNEL(cpy_f32_q4_0);
GGML_METAL_ADD_KERNEL(cpy_f32_q4_1);
//GGML_METAL_ADD_KERNEL(cpy_f32_q5_0);
//GGML_METAL_ADD_KERNEL(cpy_f32_q5_1);
GGML_METAL_ADD_KERNEL(cpy_f16_f16);
GGML_METAL_ADD_KERNEL(concat);
GGML_METAL_ADD_KERNEL(sqr);
@ -425,6 +435,11 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(im2col_f16);
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
GGML_METAL_DEL_KERNEL(cpy_f32_f32);
GGML_METAL_DEL_KERNEL(cpy_f32_q8_0);
GGML_METAL_DEL_KERNEL(cpy_f32_q4_0);
GGML_METAL_DEL_KERNEL(cpy_f32_q4_1);
//GGML_METAL_DEL_KERNEL(cpy_f32_q5_0);
//GGML_METAL_DEL_KERNEL(cpy_f32_q5_1);
GGML_METAL_DEL_KERNEL(cpy_f16_f16);
GGML_METAL_DEL_KERNEL(concat);
GGML_METAL_DEL_KERNEL(sqr);
@ -1114,7 +1129,7 @@ void ggml_metal_graph_compute(
!ggml_is_transposed(src1) &&
src1t == GGML_TYPE_F32 &&
ne00 % 32 == 0 && ne00 >= 64 &&
ne11 > ne11_mm_min) {
(ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
@ -1549,14 +1564,23 @@ void ggml_metal_graph_compute(
case GGML_OP_CPY:
case GGML_OP_CONT:
{
const int nth = MIN(1024, ne00);
GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
switch (src0t) {
case GGML_TYPE_F32:
{
GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
switch (dstt) {
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break;
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break;
case GGML_TYPE_Q8_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q8_0]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_0]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q4_1]; break;
//case GGML_TYPE_Q5_0: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_0]; break;
//case GGML_TYPE_Q5_1: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_q5_1]; break;
default: GGML_ASSERT(false && "not implemented");
};
} break;

View File

@ -3,6 +3,7 @@
using namespace metal;
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define QK4_0 32
#define QR4_0 2
@ -1460,6 +1461,197 @@ kernel void kernel_cpy_f32_f32(
}
}
kernel void kernel_cpy_f32_q8_0(
device const float * src0,
device void * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
const int64_t i3 = n / (ne2*ne1*ne0);
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK8_0;
device block_q8_0 * dst_data = (device block_q8_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int64_t i00 = tpitg.x*QK8_0; i00 < ne00; i00 += ntg.x*QK8_0) {
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
float amax = 0.0f; // absolute max
for (int j = 0; j < QK8_0; j++) {
const float v = src[j];
amax = MAX(amax, fabs(v));
}
const float d = amax / ((1 << 7) - 1);
const float id = d ? 1.0f/d : 0.0f;
dst_data[i00/QK8_0].d = d;
for (int j = 0; j < QK8_0; ++j) {
const float x0 = src[j]*id;
dst_data[i00/QK8_0].qs[j] = round(x0);
}
}
}
kernel void kernel_cpy_f32_q4_0(
device const float * src0,
device void * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
const int64_t i3 = n / (ne2*ne1*ne0);
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_0;
device block_q4_0 * dst_data = (device block_q4_0 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int64_t i00 = tpitg.x*QK4_0; i00 < ne00; i00 += ntg.x*QK4_0) {
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
float amax = 0.0f; // absolute max
float max = 0.0f;
for (int j = 0; j < QK4_0; j++) {
const float v = src[j];
if (amax < fabs(v)) {
amax = fabs(v);
max = v;
}
}
const float d = max / -8;
const float id = d ? 1.0f/d : 0.0f;
dst_data[i00/QK4_0].d = d;
for (int j = 0; j < QK4_0/2; ++j) {
const float x0 = src[0 + j]*id;
const float x1 = src[QK4_0/2 + j]*id;
const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
dst_data[i00/QK4_0].qs[j] = xi0;
dst_data[i00/QK4_0].qs[j] |= xi1 << 4;
}
}
}
kernel void kernel_cpy_f32_q4_1(
device const float * src0,
device void * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
const int64_t i3 = n / (ne2*ne1*ne0);
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0)/QK4_1;
device block_q4_1 * dst_data = (device block_q4_1 *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
for (int64_t i00 = tpitg.x*QK4_1; i00 < ne00; i00 += ntg.x*QK4_1) {
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
float min = FLT_MAX;
float max = -FLT_MAX;
for (int j = 0; j < QK4_1; j++) {
const float v = src[j];
if (min > v) min = v;
if (max < v) max = v;
}
const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
dst_data[i00/QK4_1].d = d;
dst_data[i00/QK4_1].m = min;
for (int j = 0; j < QK4_1/2; ++j) {
const float x0 = (src[0 + j] - min)*id;
const float x1 = (src[QK4_1/2 + j] - min)*id;
const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
dst_data[i00/QK4_1].qs[j] = xi0;
dst_data[i00/QK4_1].qs[j] |= xi1 << 4;
}
}
}
kernel void kernel_concat(
device const char * src0,
device const char * src1,

422
llama.cpp
View File

@ -1253,6 +1253,7 @@ struct llama_cparams {
float yarn_beta_slow;
bool mul_mat_q;
bool offload_kqv;
};
struct llama_layer {
@ -1321,8 +1322,8 @@ struct llama_kv_cache {
std::vector<llama_kv_cell> cells;
struct ggml_tensor * k = NULL;
struct ggml_tensor * v = NULL;
std::vector<struct ggml_tensor *> k_l; // per layer
std::vector<struct ggml_tensor *> v_l;
struct ggml_context * ctx = NULL;
@ -1335,8 +1336,10 @@ struct llama_kv_cache {
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
ggml_cuda_free_data(k);
ggml_cuda_free_data(v);
for (size_t i = 0; i < k_l.size(); ++i) {
ggml_cuda_free_data(k_l[i]);
ggml_cuda_free_data(v_l[i]);
}
}
#endif
}
@ -1528,9 +1531,11 @@ struct llama_context {
static bool llama_kv_cache_init(
const struct llama_hparams & hparams,
struct llama_kv_cache & cache,
ggml_type wtype,
ggml_type ktype,
ggml_type vtype,
uint32_t n_ctx,
int n_gpu_layers) {
int n_gpu_layers,
bool offload) {
const uint32_t n_embd = hparams.n_embd_gqa();
const uint32_t n_layer = hparams.n_layer;
@ -1546,7 +1551,7 @@ static bool llama_kv_cache_init(
cache.cells.clear();
cache.cells.resize(n_ctx);
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
memset(cache.buf.data, 0, cache.buf.size);
struct ggml_init_params params;
@ -1556,37 +1561,44 @@ static bool llama_kv_cache_init(
cache.ctx = ggml_init(params);
size_t vram_kv_cache = 0;
if (!cache.ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
return false;
}
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
cache.k_l.reserve(n_layer);
cache.v_l.reserve(n_layer);
ggml_set_name(cache.k, "cache_k");
ggml_set_name(cache.v, "cache_v");
(void) n_gpu_layers;
const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
GGML_UNUSED(offload);
for (int i = 0; i < (int) n_layer; i++) {
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
cache.v_l.push_back(v);
#ifdef GGML_USE_CUBLAS
if (ggml_cublas_loaded()) {
size_t vram_kv_cache = 0;
if (i >= i_gpu_start) {
if (offload) {
ggml_cuda_assign_buffers_no_scratch(k);
vram_kv_cache += ggml_nbytes(k);
ggml_cuda_assign_buffers_no_scratch(v);
vram_kv_cache += ggml_nbytes(v);
}
}
#endif // GGML_USE_CUBLAS
}
if (n_gpu_layers > (int)n_layer + 1) {
ggml_cuda_assign_buffers_no_scratch(cache.v);
LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
vram_kv_cache += ggml_nbytes(cache.v);
}
if (n_gpu_layers > (int)n_layer + 2) {
ggml_cuda_assign_buffers_no_scratch(cache.k);
LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
vram_kv_cache += ggml_nbytes(cache.k);
}
if (vram_kv_cache > 0) {
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
}
}
#endif
GGML_UNUSED(n_gpu_layers);
return true;
}
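With the per-layer layout above, the cache holds one K and one V tensor of `n_embd_gqa*n_ctx` elements per layer, so its size scales directly with the chosen cache types. A rough sizing sketch (not part of the commit), assuming `ggml_type_sizef` gives bytes per element and using example model dimensions:

```cpp
// Rough KV-cache size estimate matching the per-layer allocation above:
// total bytes = n_layer * n_embd_gqa * n_ctx * (bytes_per_elem(K) + bytes_per_elem(V)).
#include "ggml.h"
#include <cstdio>

static double kv_cache_bytes(int n_layer, int n_embd_gqa, int n_ctx,
                             ggml_type type_k, ggml_type type_v) {
    const double n_elements = (double) n_embd_gqa * n_ctx;
    return n_layer * n_elements * (ggml_type_sizef(type_k) + ggml_type_sizef(type_v));
}

int main() {
    // example: 7B-class model, 4096 context
    const int n_layer = 32, n_embd_gqa = 4096, n_ctx = 4096;

    const double f16_f16  = kv_cache_bytes(n_layer, n_embd_gqa, n_ctx, GGML_TYPE_F16,  GGML_TYPE_F16);
    const double q8_0_f16 = kv_cache_bytes(n_layer, n_embd_gqa, n_ctx, GGML_TYPE_Q8_0, GGML_TYPE_F16);

    printf("KV cache f16/f16  : %.1f MiB\n", f16_f16  / 1024.0 / 1024.0);
    printf("KV cache q8_0/f16 : %.1f MiB\n", q8_0_f16 / 1024.0 / 1024.0);
    return 0;
}
```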
@ -2995,14 +3007,7 @@ static void llm_load_tensors(
ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = llama_backend_offload;
#else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
@ -3072,14 +3077,7 @@ static void llm_load_tensors(
ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = llama_backend_offload;
#else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
@ -3142,14 +3140,7 @@ static void llm_load_tensors(
ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = llama_backend_offload;
#else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
@ -3219,14 +3210,7 @@ static void llm_load_tensors(
ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = llama_backend_offload;
#else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
@ -3296,21 +3280,7 @@ static void llm_load_tensors(
ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) {
#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > int(n_layer + 1)) {
LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
__func__, n_layer + 1);
throw std::runtime_error("Persimmon CUDA offload failed");
}
#endif
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = llama_backend_offload;
#else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
@ -3369,14 +3339,7 @@ static void llm_load_tensors(
ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = llama_backend_offload;
#else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
@ -3447,14 +3410,7 @@ static void llm_load_tensors(
ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = llama_backend_offload;
#else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
backend_output = llama_backend_offload_split;
} else {
backend_norm = GGML_BACKEND_CPU;
@ -3514,14 +3470,7 @@ static void llm_load_tensors(
ggml_backend_type backend_output; ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) { if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = llama_backend_offload; backend_norm = llama_backend_offload;
#else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
backend_output = llama_backend_offload_split; backend_output = llama_backend_offload_split;
} else { } else {
backend_norm = GGML_BACKEND_CPU; backend_norm = GGML_BACKEND_CPU;
@ -3586,14 +3535,7 @@ static void llm_load_tensors(
ggml_backend_type backend_output; ggml_backend_type backend_output;
if (n_gpu_layers > int(n_layer)) { if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = llama_backend_offload; backend_norm = llama_backend_offload;
#else
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
#endif // _WIN32
backend_output = llama_backend_offload_split; backend_output = llama_backend_offload_split;
} else { } else {
backend_norm = GGML_BACKEND_CPU; backend_norm = GGML_BACKEND_CPU;
@ -3669,8 +3611,8 @@ static void llm_load_tensors(
} }
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
const int max_backend_supported_layers = hparams.n_layer + 3; const int max_backend_supported_layers = hparams.n_layer + 1;
const int max_offloadable_layers = hparams.n_layer + 3; const int max_offloadable_layers = hparams.n_layer + 1;
#elif GGML_USE_CLBLAST #elif GGML_USE_CLBLAST
const int max_backend_supported_layers = hparams.n_layer + 1; const int max_backend_supported_layers = hparams.n_layer + 1;
const int max_offloadable_layers = hparams.n_layer + 1; const int max_offloadable_layers = hparams.n_layer + 1;
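The layer accounting above changes because the KV cache no longer occupies extra pseudo-layers. A minimal sketch of the resulting clamp, assuming the only slot beyond the repeating layers is the output layer (the helper name is illustrative, not the llama.cpp implementation):

#include <algorithm>

// With per-layer KV tensors and the offload_kqv flag, the CUDA path advertises the same
// limit as CLBlast: n_layer repeating layers plus one output layer. Before this change it
// advertised n_layer + 3, the extra slots gating offload of the V- and K-related ops that
// are now controlled by offload_kqv instead.
static int clamp_offloaded_layers(int n_gpu_layers, int n_layer) {
    const int max_offloadable_layers = n_layer + 1;
    return std::min(n_gpu_layers, max_offloadable_layers);
}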
@ -3853,11 +3795,11 @@ static void llm_build_k_shift(
struct ggml_tensor * tmp = struct ggml_tensor * tmp =
// we rotate only the first n_rot dimensions // we rotate only the first n_rot dimensions
ggml_rope_custom_inplace(ctx, ggml_rope_custom_inplace(ctx,
ggml_view_3d(ctx, kv.k, ggml_view_3d(ctx, kv.k_l[il],
n_embd_head, n_head_kv, n_ctx, n_embd_head, n_head_kv, n_ctx,
ggml_element_size(kv.k)*n_embd_head, ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
ggml_element_size(kv.k)*n_embd_gqa, ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il), 0),
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow); ext_factor, attn_factor, beta_fast, beta_slow);
cb(tmp, "K_shifted", il); cb(tmp, "K_shifted", il);
@ -3884,13 +3826,13 @@ static void llm_build_kv_store(
//struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
cb(v_cur_t, "v_cur_t", il); cb(v_cur_t, "v_cur_t", il);
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa, struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
(ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head)); (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
cb(k_cache_view, "k_cache_view", il); cb(k_cache_view, "k_cache_view", il);
struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa, struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
( n_ctx)*ggml_element_size(kv.v), ( n_ctx)*ggml_element_size(kv.v_l[il]),
(il*n_ctx)*ggml_element_size(kv.v)*n_embd_gqa + kv_head*ggml_element_size(kv.v)); (kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il); cb(v_cache_view, "v_cache_view", il);
// important: storing RoPE-ed version of K in the KV cache! // important: storing RoPE-ed version of K in the KV cache!
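As a worked sketch of the store offsets used just above (illustrative helpers; assumes the per-layer K tensor is laid out row-major as [n_embd_gqa, n_ctx] and the per-layer V tensor is kept transposed as [n_ctx, n_embd_gqa]):

#include <cstddef>

// K write: the new tokens land after the kv_head cells already in use; the layer index no
// longer appears because each layer has its own tensor. bytes_per_elt mirrors
// ggml_type_sizef() and may be fractional for quantized K caches.
static size_t k_store_offset(double bytes_per_elt, size_t n_embd_gqa, size_t kv_head) {
    return (size_t) (bytes_per_elt * n_embd_gqa) * kv_head;
}

// V write: V is stored transposed, so reaching cell kv_head is a step along a row of
// length n_ctx, i.e. just kv_head elements from the start of the layer tensor.
static size_t v_store_offset(size_t elt_size, size_t kv_head) {
    return kv_head * elt_size;
}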
@ -4042,11 +3984,11 @@ static struct ggml_tensor * llm_build_kqv(
cb(q, "q", il); cb(q, "q", il);
struct ggml_tensor * k = struct ggml_tensor * k =
ggml_view_3d(ctx, kv.k, ggml_view_3d(ctx, kv.k_l[il],
n_embd_head, n_kv, n_head_kv, n_embd_head, n_kv, n_head_kv,
ggml_element_size(kv.k)*n_embd_gqa, ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
ggml_element_size(kv.k)*n_embd_head, ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il); 0);
cb(k, "k", il); cb(k, "k", il);
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
@ -4077,11 +4019,11 @@ static struct ggml_tensor * llm_build_kqv(
// split cached v into n_head heads // split cached v into n_head heads
struct ggml_tensor * v = struct ggml_tensor * v =
ggml_view_3d(ctx, kv.v, ggml_view_3d(ctx, kv.v_l[il],
n_kv, n_embd_head, n_head_kv, n_kv, n_embd_head, n_head_kv,
ggml_element_size(kv.v)*n_ctx, ggml_element_size(kv.v_l[il])*n_ctx,
ggml_element_size(kv.v)*n_ctx*n_embd_head, ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il); 0);
cb(v, "v", il); cb(v, "v", il);
struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
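The read path mirrors this. A sketch of the strides of the transposed V view taken over kv.v_l[il], assuming ggml's convention that nb1/nb2 are the byte steps along the second and third dimensions (the struct and helper are illustrative):

#include <cstddef>

struct v_view_strides {
    size_t nb1;    // step between embedding dims within one head: one full row of n_ctx cells
    size_t nb2;    // step between heads
    size_t offset; // start of the view; 0 now that each layer owns its tensor
};

static v_view_strides make_v_view_strides(size_t elt_size, size_t n_ctx, size_t n_embd_head) {
    return { elt_size * n_ctx, elt_size * n_ctx * n_embd_head, 0 };
}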
@ -4681,6 +4623,7 @@ struct llm_build_context {
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
cb(inpL, "imp_embd", -1); cb(inpL, "imp_embd", -1);
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
cb(inp_pos, "inp_pos", -1); cb(inp_pos, "inp_pos", -1);
@ -4688,6 +4631,7 @@ struct llm_build_context {
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
cb(KQ_scale, "KQ_scale", -1); cb(KQ_scale, "KQ_scale", -1);
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1); cb(KQ_mask, "KQ_mask", -1);
@ -5294,7 +5238,7 @@ struct llm_build_context {
struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); struct ggml_tensor * KQ_scale= ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
cb(KQ_scale, "KQ_scale", -1); cb(KQ_scale, "KQ_scale", -1);
// KQ_mask (mask for 1 head, it wil be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); struct ggml_tensor * KQ_mask= ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
cb(KQ_mask, "KQ_mask", -1); cb(KQ_mask, "KQ_mask", -1);
@ -5401,8 +5345,8 @@ struct llm_build_context {
enum llm_offload_func_e { enum llm_offload_func_e {
OFFLOAD_FUNC_NOP, OFFLOAD_FUNC_NOP,
OFFLOAD_FUNC, OFFLOAD_FUNC,
OFFLOAD_FUNC_KQ, OFFLOAD_FUNC_FRC, // force offload
OFFLOAD_FUNC_V, OFFLOAD_FUNC_KQV,
OFFLOAD_FUNC_NR, OFFLOAD_FUNC_NR,
OFFLOAD_FUNC_EMB, OFFLOAD_FUNC_EMB,
OFFLOAD_FUNC_OUT, OFFLOAD_FUNC_OUT,
@ -5488,11 +5432,12 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
//{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
{ "pos_embd", OFFLOAD_FUNC_NR }, { "pos_embd", OFFLOAD_FUNC_NR },
{ "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope) { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
{ "KQ_scale", OFFLOAD_FUNC_KQ }, { "KQ_scale", OFFLOAD_FUNC_FRC },
{ "KQ_mask", OFFLOAD_FUNC_KQ }, { "KQ_mask", OFFLOAD_FUNC_FRC },
{ "K_shift", OFFLOAD_FUNC_KQ }, { "K_shift", OFFLOAD_FUNC_FRC },
{ "K_shifted", OFFLOAD_FUNC_KQ },
{ "K_shifted", OFFLOAD_FUNC },
{ "inp_norm", OFFLOAD_FUNC_NR }, { "inp_norm", OFFLOAD_FUNC_NR },
{ "inp_norm_w", OFFLOAD_FUNC_NR }, { "inp_norm_w", OFFLOAD_FUNC_NR },
@ -5505,38 +5450,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
{ "attn_norm", OFFLOAD_FUNC }, { "attn_norm", OFFLOAD_FUNC },
{ "attn_norm_2", OFFLOAD_FUNC }, { "attn_norm_2", OFFLOAD_FUNC },
{ "wqkv", OFFLOAD_FUNC_KQ }, { "wqkv", OFFLOAD_FUNC_KQV },
{ "bqkv", OFFLOAD_FUNC_KQ }, { "bqkv", OFFLOAD_FUNC_KQV },
{ "wqkv_clamped", OFFLOAD_FUNC_KQ }, { "wqkv_clamped", OFFLOAD_FUNC_KQV },
{ "tmpk", OFFLOAD_FUNC_KQ }, { "tmpk", OFFLOAD_FUNC_KQV },
{ "tmpq", OFFLOAD_FUNC_KQ }, { "tmpq", OFFLOAD_FUNC_KQV },
{ "tmpv", OFFLOAD_FUNC_V }, { "tmpv", OFFLOAD_FUNC_KQV },
{ "Kcur", OFFLOAD_FUNC_KQ }, { "Kcur", OFFLOAD_FUNC_KQV },
{ "Qcur", OFFLOAD_FUNC_KQ }, { "Qcur", OFFLOAD_FUNC_KQV },
{ "Vcur", OFFLOAD_FUNC_V }, { "Vcur", OFFLOAD_FUNC_KQV },
{ "krot", OFFLOAD_FUNC_KQ }, { "krot", OFFLOAD_FUNC_KQV },
{ "qrot", OFFLOAD_FUNC_KQ }, { "qrot", OFFLOAD_FUNC_KQV },
{ "kpass", OFFLOAD_FUNC_KQ }, { "kpass", OFFLOAD_FUNC_KQV },
{ "qpass", OFFLOAD_FUNC_KQ }, { "qpass", OFFLOAD_FUNC_KQV },
{ "krotated", OFFLOAD_FUNC_KQ }, { "krotated", OFFLOAD_FUNC_KQV },
{ "qrotated", OFFLOAD_FUNC_KQ }, { "qrotated", OFFLOAD_FUNC_KQV },
{ "q", OFFLOAD_FUNC_KQ }, { "q", OFFLOAD_FUNC_KQV },
{ "k", OFFLOAD_FUNC_KQ }, { "k", OFFLOAD_FUNC_KQV },
{ "kq", OFFLOAD_FUNC_KQ }, { "kq", OFFLOAD_FUNC_KQV },
{ "kq_scaled", OFFLOAD_FUNC_KQ }, { "kq_scaled", OFFLOAD_FUNC_KQV },
{ "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, { "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
{ "kq_masked", OFFLOAD_FUNC_KQ }, { "kq_masked", OFFLOAD_FUNC_KQV },
{ "kq_soft_max", OFFLOAD_FUNC_V }, { "kq_soft_max", OFFLOAD_FUNC_KQV },
{ "kq_soft_max_ext", OFFLOAD_FUNC_V }, { "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
{ "v", OFFLOAD_FUNC_V }, { "v", OFFLOAD_FUNC_KQV },
{ "kqv", OFFLOAD_FUNC_V }, { "kqv", OFFLOAD_FUNC_KQV },
{ "kqv_merged", OFFLOAD_FUNC_V }, { "kqv_merged", OFFLOAD_FUNC_KQV },
{ "kqv_merged_cont", OFFLOAD_FUNC_V }, { "kqv_merged_cont", OFFLOAD_FUNC_KQV },
{ "kqv_wo", OFFLOAD_FUNC_V }, { "kqv_wo", OFFLOAD_FUNC_KQV },
{ "kqv_out", OFFLOAD_FUNC_V }, { "kqv_out", OFFLOAD_FUNC_KQV },
{ "ffn_inp", OFFLOAD_FUNC }, { "ffn_inp", OFFLOAD_FUNC },
{ "ffn_norm", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC },
@ -5748,14 +5693,14 @@ static struct ggml_cgraph * llama_build_graph(
{ OFFLOAD_FUNC_OUT, "CPU" }, { OFFLOAD_FUNC_OUT, "CPU" },
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
{ OFFLOAD_FUNC, "GPU (CUDA)" }, { OFFLOAD_FUNC, "GPU (CUDA)" },
{ OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
{ OFFLOAD_FUNC_V, "GPU (CUDA) V" }, { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
{ OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
{ OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
#else #else
{ OFFLOAD_FUNC, "CPU" }, { OFFLOAD_FUNC, "CPU" },
{ OFFLOAD_FUNC_KQ, "CPU" }, { OFFLOAD_FUNC_FRC, "CPU" },
{ OFFLOAD_FUNC_V, "CPU" }, { OFFLOAD_FUNC_KQV, "CPU" },
{ OFFLOAD_FUNC_NR, "CPU" }, { OFFLOAD_FUNC_NR, "CPU" },
{ OFFLOAD_FUNC_EMB, "CPU" }, { OFFLOAD_FUNC_EMB, "CPU" },
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS
@ -5788,21 +5733,26 @@ static struct ggml_cgraph * llama_build_graph(
} }
} }
break; break;
case OFFLOAD_FUNC_FRC:
if (!lctx.cparams.offload_kqv) {
func_e = OFFLOAD_FUNC_NOP;
} break;
case OFFLOAD_FUNC_KQV:
if (!lctx.cparams.offload_kqv) {
func_e = OFFLOAD_FUNC_NOP;
} else {
if (n_gpu_layers < n_layer) {
if (il < i_gpu_start) {
func_e = OFFLOAD_FUNC_NOP;
}
}
}
break;
case OFFLOAD_FUNC_NR: case OFFLOAD_FUNC_NR:
if (n_gpu_layers <= n_layer + 0) { if (n_gpu_layers <= n_layer + 0) {
func_e = OFFLOAD_FUNC_NOP; func_e = OFFLOAD_FUNC_NOP;
} }
break; break;
case OFFLOAD_FUNC_V:
if (n_gpu_layers <= n_layer + 1) {
func_e = OFFLOAD_FUNC_NOP;
}
break;
case OFFLOAD_FUNC_KQ:
if (n_gpu_layers <= n_layer + 2) {
func_e = OFFLOAD_FUNC_NOP;
}
break;
case OFFLOAD_FUNC_EMB: case OFFLOAD_FUNC_EMB:
if (!offload_emb || n_gpu_layers < n_layer) { if (!offload_emb || n_gpu_layers < n_layer) {
func_e = OFFLOAD_FUNC_NOP; func_e = OFFLOAD_FUNC_NOP;
@ -5824,8 +5774,8 @@ static struct ggml_cgraph * llama_build_graph(
case OFFLOAD_FUNC_NOP: case OFFLOAD_FUNC_NOP:
case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
case OFFLOAD_FUNC: case OFFLOAD_FUNC:
case OFFLOAD_FUNC_KQ: case OFFLOAD_FUNC_KQV:
case OFFLOAD_FUNC_V: case OFFLOAD_FUNC_FRC:
case OFFLOAD_FUNC_NR: case OFFLOAD_FUNC_NR:
case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
default: GGML_ASSERT(false); default: GGML_ASSERT(false);
@ -6017,6 +5967,7 @@ static int llama_decode_internal(
// after enough generations, the benefit from this heuristic disappears // after enough generations, the benefit from this heuristic disappears
// if we start defragmenting the cache, the benefit from this will be more important // if we start defragmenting the cache, the benefit from this will be more important
kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
//kv_self.n = llama_kv_cache_cell_max(kv_self);
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
@ -6067,7 +6018,7 @@ static int llama_decode_internal(
n_threads = std::min(4, n_threads); n_threads = std::min(4, n_threads);
} }
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
if (ggml_cpu_has_cublas() && fully_offloaded) { if (ggml_cpu_has_cublas() && fully_offloaded) {
n_threads = 1; n_threads = 1;
} }
@ -6090,13 +6041,17 @@ static int llama_decode_internal(
ggml_vk_d2h_tensor(lctx.ctx_kompute, res); ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
} else { } else {
if (lctx.ctx_kompute) { if (lctx.ctx_kompute) {
ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k); for (int il = 0; il < hparams.n_layer; ++il) {
ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v); ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k_l[il]);
ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v_l[il]);
}
} }
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
if (lctx.ctx_kompute) { if (lctx.ctx_kompute) {
ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k); for (int il = 0; il < hparams.n_layer; ++il) {
ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v); ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k_l[il]);
ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v_l[il]);
}
} }
} }
#else #else
@ -8911,10 +8866,12 @@ struct llama_context_params llama_context_default_params() {
/*.yarn_beta_fast =*/ 32.0f, /*.yarn_beta_fast =*/ 32.0f,
/*.yarn_beta_slow =*/ 1.0f, /*.yarn_beta_slow =*/ 1.0f,
/*.yarn_orig_ctx =*/ 0, /*.yarn_orig_ctx =*/ 0,
/*.type_k =*/ GGML_TYPE_F16,
/*.type_v =*/ GGML_TYPE_F16,
/*.mul_mat_q =*/ true, /*.mul_mat_q =*/ true,
/*.f16_kv =*/ true,
/*.logits_all =*/ false, /*.logits_all =*/ false,
/*.embedding =*/ false, /*.embedding =*/ false,
/*.offload_kqv =*/ true,
}; };
return result; return result;
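A hypothetical usage sketch for the new fields (the field names come from this diff; the chosen values and the wrapper function are illustrative, and whether a particular quantized type is accepted for the K cache is an assumption):

#include "llama.h"

static llama_context * make_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.type_k      = GGML_TYPE_Q8_0;  // replaces the old boolean f16_kv toggle
    cparams.type_v      = GGML_TYPE_F16;
    cparams.offload_kqv = false;           // keep the KV cache and attention ops on the CPU
    return llama_new_context_with_model(model, cparams);
}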
@ -9037,6 +8994,7 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_fast = params.yarn_beta_fast;
cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.mul_mat_q = params.mul_mat_q; cparams.mul_mat_q = params.mul_mat_q;
cparams.offload_kqv = params.offload_kqv;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@ -9070,19 +9028,36 @@ struct llama_context * llama_new_context_with_model(
ctx->rng = std::mt19937(params.seed); ctx->rng = std::mt19937(params.seed);
ctx->logits_all = params.logits_all; ctx->logits_all = params.logits_all;
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; const ggml_type type_k = params.type_k;
const ggml_type type_v = params.type_v;
GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
// reserve memory for context buffers // reserve memory for context buffers
if (!hparams.vocab_only) { if (!hparams.vocab_only) {
if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) { if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx); llama_free(ctx);
return nullptr; return nullptr;
} }
{ {
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); size_t memory_size_k = 0;
LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0); size_t memory_size_v = 0;
for (auto & k : ctx->kv_self.k_l) {
memory_size_k += ggml_nbytes(k);
}
for (auto & v : ctx->kv_self.v_l) {
memory_size_v += ggml_nbytes(v);
}
LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
} }
// resized during inference // resized during inference
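For a sense of scale, a sketch of the total the new log line reports, assuming each of the n_layer layers stores n_ctx * n_embd_gqa elements for K and the same for V:

#include <cstdint>

// Approximate KV cache size in MiB; bytes_per_*_elt mirror ggml_type_sizef() and may be
// fractional for quantized cache types.
static double kv_self_size_mib(uint32_t n_layer, uint32_t n_ctx, uint32_t n_embd_gqa,
                               double bytes_per_k_elt, double bytes_per_v_elt) {
    const double n_elts_per_tensor = (double) n_ctx * n_embd_gqa;
    return n_layer * n_elts_per_tensor * (bytes_per_k_elt + bytes_per_v_elt) / (1024.0 * 1024.0);
}

// Example: a 32-layer model with n_embd_gqa = 4096 at n_ctx = 4096, f16 K and V:
// kv_self_size_mib(32, 4096, 4096, 2.0, 2.0) == 2048 MiB.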
@ -9153,8 +9128,12 @@ struct llama_context * llama_new_context_with_model(
} }
size_t kv_vram_size = 0; size_t kv_vram_size = 0;
add_tensor(ctx->kv_self.k, kv_vram_size); for (auto & k : ctx->kv_self.k_l) {
add_tensor(ctx->kv_self.v, kv_vram_size); add_tensor(k, kv_vram_size);
}
for (auto & v : ctx->kv_self.v_l) {
add_tensor(v, kv_vram_size);
}
size_t ctx_vram_size = alloc_size + kv_vram_size; size_t ctx_vram_size = alloc_size + kv_vram_size;
size_t total_vram_size = model_vram_size + ctx_vram_size; size_t total_vram_size = model_vram_size + ctx_vram_size;
@ -9644,37 +9623,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
data_ctx->write(&kv_used, sizeof(kv_used)); data_ctx->write(&kv_used, sizeof(kv_used));
if (kv_buf_size) { if (kv_buf_size) {
const size_t elt_size = ggml_element_size(kv_self.k); const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
ggml_cgraph * gf = ggml_new_graph(cpy_ctx); ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0); std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
kout3d->data = kout3d_data.data();
ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); for (int il = 0; il < (int) n_layer; ++il) {
std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0); ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
vout3d->data = vout3d_data.data(); kout2d_data[il].resize(ggml_nbytes(kout2d));
kout2d->data = kout2d_data[il].data();
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
n_embd, kv_head, n_layer, vout2d_data[il].resize(ggml_nbytes(vout2d));
elt_size*n_embd, elt_size*n_embd*n_ctx, 0); vout2d->data = vout2d_data[il].data();
ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
kv_head, n_embd, n_layer, n_embd, kv_head,
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); elt_size*n_embd, 0);
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
kv_head, n_embd,
elt_size*n_ctx, 0);
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
}
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
ggml_free(cpy_ctx); ggml_free(cpy_ctx);
// our data is now in the kout3d_data and vout3d_data buffers // our data is now in the kout2d_data and vout2d_data buffers
// write them to file // write them to file
data_ctx->write(kout3d_data.data(), kout3d_data.size()); for (uint32_t il = 0; il < n_layer; ++il) {
data_ctx->write(vout3d_data.data(), vout3d_data.size()); data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
}
} }
for (uint32_t i = 0; i < kv_size; ++i) { for (uint32_t i = 0; i < kv_size; ++i) {
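A sketch of how much KV data this now writes, assuming (per the code above) that only the first kv_head cells of each layer are copied and that K and V share an element size (the helper is illustrative):

#include <cstddef>

// Per layer: an [n_embd, kv_head] K block followed by a [kv_head, n_embd] V block.
static size_t kv_state_bytes_per_layer(size_t elt_size, size_t n_embd, size_t kv_head) {
    return 2 * elt_size * n_embd * kv_head;
}

The per-layer 2-D copies also mean the copy graph grows with n_layer, which is why the scratch context is now sized with 6*n_layer*ggml_tensor_overhead().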
@ -9774,29 +9761,32 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
if (kv_buf_size) { if (kv_buf_size) {
GGML_ASSERT(kv_self.buf.size == kv_buf_size); GGML_ASSERT(kv_self.buf.size == kv_buf_size);
const size_t elt_size = ggml_element_size(kv_self.k); const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
ggml_cgraph * gf = ggml_new_graph(cpy_ctx); ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer); for (int il = 0; il < n_layer; ++il) {
kin3d->data = (void *) inp; ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
inp += ggml_nbytes(kin3d); kin2d->data = (void *) inp;
inp += ggml_nbytes(kin2d);
ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer); ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
vin3d->data = (void *) inp; vin2d->data = (void *) inp;
inp += ggml_nbytes(vin3d); inp += ggml_nbytes(vin2d);
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k, ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
n_embd, kv_head, n_layer, n_embd, kv_head,
elt_size*n_embd, elt_size*n_embd*n_ctx, 0); elt_size*n_embd, 0);
ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v, ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
kv_head, n_embd, n_layer, kv_head, n_embd,
elt_size*n_ctx, elt_size*n_ctx*n_embd, 0); elt_size*n_ctx, 0);
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
}
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1); ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
ggml_free(cpy_ctx); ggml_free(cpy_ctx);

View File

@ -42,7 +42,7 @@
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 2 #define LLAMA_SESSION_VERSION 3
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE) #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU. // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
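Because the serialized KV layout changes from a single 3-D K/V pair to per-layer 2-D blocks, the session version is bumped and version 2 files are no longer restorable. A hypothetical guard a caller might apply before loading (the header struct here is illustrative; the real blob layout is whatever llama_copy_state_data wrote):

#include <cstdint>

#include "llama.h"

struct session_file_header {
    uint32_t magic;    // expected to be LLAMA_SESSION_MAGIC
    uint32_t version;  // expected to be LLAMA_SESSION_VERSION (now 3)
};

static bool session_is_loadable(const session_file_header & h) {
    return h.magic == LLAMA_SESSION_MAGIC && h.version == LLAMA_SESSION_VERSION;
}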
@ -211,11 +211,14 @@ extern "C" {
float yarn_beta_slow; // YaRN high correction dim float yarn_beta_slow; // YaRN high correction dim
uint32_t yarn_orig_ctx; // YaRN original context size uint32_t yarn_orig_ctx; // YaRN original context size
enum ggml_type type_k; // data type for K cache
enum ggml_type type_v; // data type for V cache
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
bool f16_kv; // use fp16 for KV cache, fp32 otherwise
bool logits_all; // the llama_eval() call computes all logits, not just the last one bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool embedding; // embedding mode only bool embedding; // embedding mode only
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
}; };
// model quantization parameters // model quantization parameters
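Front ends that expose type_k / type_v as user-facing strings need a mapping to ggml_type; a hypothetical helper sketch (the accepted names and the function itself are assumptions, not a defined llama.cpp interface):

#include <stdexcept>
#include <string>

#include "ggml.h"

static ggml_type kv_cache_type_from_name(const std::string & s) {
    if (s == "f32")  return GGML_TYPE_F32;
    if (s == "f16")  return GGML_TYPE_F16;
    if (s == "q8_0") return GGML_TYPE_Q8_0;
    if (s == "q4_0") return GGML_TYPE_Q4_0;
    throw std::runtime_error("unsupported KV cache type: " + s);
}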