From 0bebe45a25614401c372959770f89bab01165c47 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 12 Jan 2025 12:15:19 +0200
Subject: [PATCH] llama : de-shadow (wip) [no ci]

---
 examples/gguf/gguf.cpp | 12 +++++++-----
 src/llama-kv-cache.h   |  6 +++---
 src/llama.cpp          | 42 +++++++++++++++++++++---------------------
 3 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index f31989c8c..d928db8fe 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -204,13 +204,15 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
                __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
 
         // print first 10 elements
-        const float * data = (const float *) cur->data;
+        {
+            const float * data = (const float *) cur->data;
 
-        printf("%s data[:10] : ", name);
-        for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
-            printf("%f ", data[j]);
+            printf("%s data[:10] : ", name);
+            for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+                printf("%f ", data[j]);
+            }
+            printf("\n\n");
         }
-        printf("\n\n");
 
         // check data
         if (check_data) {
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index dca6f3998..2645fd23b 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -58,12 +58,12 @@ struct llama_kv_cache {
     std::vector<ggml_backend_buffer_ptr> bufs;
 
     size_t total_size() const {
-        size_t size = 0;
+        size_t size_all = 0;
         for (const auto & buf : bufs) {
-            size += ggml_backend_buffer_get_size(buf.get());
+            size_all += ggml_backend_buffer_get_size(buf.get());
         }
 
-        return size;
+        return size_all;
     }
 
     // TODO: better data structures to reduce the cost of this operation
diff --git a/src/llama.cpp b/src/llama.cpp
index daf1b7c97..83822668e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -1174,14 +1174,15 @@ struct llm_build_context {
         ggml_set_input(lctx.inp_K_shift);
 
         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_head_kv    = hparams.n_head_kv(il);
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+            const int64_t n_head_kv_i    = hparams.n_head_kv(il);
+            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il);
+
             struct ggml_tensor * rope_factors = build_rope_factors(il);
 
             struct ggml_tensor * k = ggml_view_3d(ctx0, kv_self.k_l[il],
-                n_embd_head_k, n_head_kv, n_ctx,
+                n_embd_head_k, n_head_kv_i, n_ctx,
                 ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
-                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
                 0);
 
             struct ggml_tensor * tmp;
@@ -1231,18 +1232,18 @@ struct llm_build_context {
         }
 
         for (int il = 0; il < n_layer; ++il) {
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-            const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
+            const int64_t n_embd_k_gqa_i = hparams.n_embd_k_gqa(il);
+            const int64_t n_embd_v_gqa_i = hparams.n_embd_v_gqa(il);
 
             ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+                    n_embd_k_gqa_i, nm,
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*i));
 
             ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+                    n_embd_k_gqa_i, nm,
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa_i*id));
 
             ggml_tensor * view_v_src;
             ggml_tensor * view_v_dst;
 
@@ -1250,22 +1251,22 @@ struct llm_build_context {
             if (flash_attn) {
                 // NOTE: the V cache is not transposed when using flash attention
                 view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+                        n_embd_v_gqa_i, nm,
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i),
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*i));
 
                 view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+                        n_embd_v_gqa_i, nm,
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i),
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa_i*id));
             } else {
                 view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        nm, n_embd_v_gqa,
+                        nm, n_embd_v_gqa_i,
                         ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
                         ggml_row_size(kv_self.v_l[il]->type, i));
 
                 view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
-                        nm, n_embd_v_gqa,
+                        nm, n_embd_v_gqa_i,
                         ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
                         ggml_row_size(kv_self.v_l[il]->type, id));
             }
@@ -1459,7 +1460,6 @@ struct llm_build_context {
     }
 
     struct ggml_tensor * llm_build_inp_embd_enc() {
-        const int64_t n_embd = hparams.n_embd;
         lctx.inp_embd_enc = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_outputs_enc);
         ggml_set_input(lctx.inp_embd_enc);
         cb(lctx.inp_embd_enc, "embd_enc", -1);
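
The hunks above all remove shadowing warnings (e.g. -Wshadow): a local that reuses the name of a member or of an outer variable is either renamed (size -> size_all, n_embd_k_gqa -> n_embd_k_gqa_i, and the redundant local n_embd is dropped) or confined to its own block, as in gguf.cpp. A minimal standalone sketch of the two fixes follows; the names are hypothetical and not taken from llama.cpp, and it assumes a GCC-style build with -Wshadow enabled.

    // sketch.cpp -- illustrative only; build with: g++ -Wall -Wshadow sketch.cpp
    #include <cstdio>
    #include <vector>

    struct cache {
        std::vector<size_t> sizes;
        size_t size = 0;                 // member that a local could shadow

        size_t total_size() const {
            // `size_t size = 0;` here would shadow the member `size` and trigger
            // -Wshadow; fix 1 (as in llama-kv-cache.h and llama.cpp): rename the local
            size_t size_all = 0;
            for (const auto & s : sizes) {
                size_all += s;
            }
            return size_all;
        }
    };

    int main() {
        cache c;
        c.sizes = {1, 2, 3};
        // fix 2 (as in gguf.cpp): keep a short-lived temporary in its own { } block,
        // so its name goes out of scope before any later declaration can collide with it
        {
            const size_t total = c.total_size();
            printf("total = %zu\n", total);
        }
        return 0;
    }

Renaming keeps the change local to one statement, while the block-scope variant avoids inventing new names when the temporary is only used for a few lines.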