From cfb8e35b73bd8daca6fa5179d30c48a5db43bc31 Mon Sep 17 00:00:00 2001
From: M. Yusuf Sarıgöz
Date: Thu, 10 Aug 2023 19:56:56 +0300
Subject: [PATCH] gguf : inference with 7B model working (WIP)

---
 gguf-llama.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/gguf-llama.cpp b/gguf-llama.cpp
index b88a2d8bf..0c4095714 100644
--- a/gguf-llama.cpp
+++ b/gguf-llama.cpp
@@ -493,6 +493,8 @@ struct ggml_context * ctx_data = NULL;
 
         gguf_ctx = gguf_init_from_file(fname, params);
 
+        read_hparams();
+        read_vocab();
         read_tensor_metadata(tensors_map);
     }
 
@@ -523,7 +525,7 @@ struct ggml_context * ctx_data = NULL;
             }
         }
 
-        throw std::runtime_error(format("failed to find n_mult for n_ff = %d and n_emb = %d\n", n_ff, n_embd));
+        throw std::runtime_error(format("failed to find n_mult for n_ff = %d and n_embd = %d\n", n_ff, n_embd));
     }
 
     void read_hparams() {
@@ -534,14 +536,14 @@ struct ggml_context * ctx_data = NULL;
         hparams.n_ctx = read_u32("llama.context_length");
         hparams.n_embd = read_u32("llama.embedding_length");
         uint32_t n_ff = read_u32("llama.feed_forward_length");
-        hparams.n_mult = find_n_mult(n_ff, hparams.n_embd);
+        //hparams.n_mult = find_n_mult(n_ff, hparams.n_embd);
         hparams.n_head = read_u32("llama.attention.head_count");
         hparams.n_layer = read_u32("llama.layer_count");
         hparams.n_rot = hparams.n_embd / hparams.n_head;
         //hparams.ftype = (enum llama_ftype) file.read_u32();
 
         // LLaMAv2
-        hparams.n_head_kv = read_u32("llama.attention.head_count_kv");
+        // hparams.n_head_kv = read_u32("llama.attention.head_count_kv");
     }
 
     void read_vocab() {
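
Note: the patch reads each hyperparameter through a read_u32(key) helper whose body is not part of this hunk. Below is a minimal sketch of how such a helper could be written on top of the public gguf API; the gguf_hparams_reader wrapper, the error handling, and the header location (ggml.h here, gguf.h in newer trees) are assumptions for illustration, not code from this commit.

#include <cstdint>
#include <stdexcept>
#include <string>

#include "ggml.h"   // gguf_* API; newer trees declare it in gguf.h instead

// Hypothetical standalone wrapper around a gguf_context, for illustration only.
struct gguf_hparams_reader {
    struct gguf_context * gguf_ctx = nullptr;

    uint32_t read_u32(const char * key) const {
        // gguf_find_key returns a negative index when the key is not present.
        const auto idx = gguf_find_key(gguf_ctx, key);
        if (idx < 0) {
            throw std::runtime_error(std::string("missing GGUF key: ") + key);
        }
        return gguf_get_val_u32(gguf_ctx, idx);
    }
};

A call such as reader.read_u32("llama.context_length") would then mirror the lookups made in read_hparams() above.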