gguf : inference with 7B model working (WIP)

Author: M. Yusuf Sarıgöz
Date:   2023-08-10 19:56:56 +03:00
Parent: 42cc04d11d
Commit: cfb8e35b73

@@ -493,6 +493,8 @@ struct ggml_context * ctx_data = NULL;
     gguf_ctx = gguf_init_from_file(fname, params);
+    read_hparams();
+    read_vocab();
     read_tensor_metadata(tensors_map);
 }
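With the two added calls, the loader now parses hyperparameters and the vocabulary straight from the GGUF key-value metadata as soon as the file is opened. A minimal sketch of that open step using the public gguf API (the file name and the error handling are illustrative, not this commit's exact code):

    #include <stdexcept>
    #include "ggml.h"

    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
        /*.no_alloc =*/ true,       // read metadata only; tensor data is loaded later
        /*.ctx      =*/ &ctx_data,  // receives a ggml context describing the tensors
    };

    struct gguf_context * gguf_ctx = gguf_init_from_file("models/7B/model.gguf", params);
    if (!gguf_ctx) {
        throw std::runtime_error("failed to load GGUF file");
    }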
@@ -523,7 +525,7 @@ struct ggml_context * ctx_data = NULL;
        }
    }
-   throw std::runtime_error(format("failed to find n_mult for n_ff = %d and n_emb = %d\n", n_ff, n_embd));
+   throw std::runtime_error(format("failed to find n_mult for n_ff = %d and n_embd = %d\n", n_ff, n_embd));
 }

 void read_hparams() {
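For reference, find_n_mult brute-forces the multiplier that rounds the feed-forward size up from the embedding size; the hunk above only fixes the n_emb/n_embd typo in its error message. A sketch of that search, assuming the usual LLaMA rounding formula (the loop bounds are an assumption, not taken from this diff; format is the file's existing string helper):

    // find the n_mult for which ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult == n_ff
    static uint32_t find_n_mult(const uint32_t n_ff, const uint32_t n_embd) {
        for (uint32_t n_mult = 256; n_mult >= 1; --n_mult) {
            const uint32_t calc_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;
            if (calc_ff == n_ff) {
                return n_mult;  // largest candidate that reproduces n_ff exactly
            }
        }
        throw std::runtime_error(format("failed to find n_mult for n_ff = %d and n_embd = %d\n", n_ff, n_embd));
    }

For a 7B model (n_embd = 4096, n_ff = 11008), the search succeeds at n_mult = 256. The corrected message matters because this exception is the only diagnostic shown when a model's feed-forward size matches no candidate multiplier.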
@@ -534,14 +536,14 @@ struct ggml_context * ctx_data = NULL;
     hparams.n_ctx  = read_u32("llama.context_length");
     hparams.n_embd = read_u32("llama.embedding_length");
     uint32_t n_ff  = read_u32("llama.feed_forward_length");
-    hparams.n_mult = find_n_mult(n_ff, hparams.n_embd);
+    //hparams.n_mult = find_n_mult(n_ff, hparams.n_embd);
     hparams.n_head  = read_u32("llama.attention.head_count");
     hparams.n_layer = read_u32("llama.layer_count");
     hparams.n_rot   = hparams.n_embd / hparams.n_head;
     //hparams.ftype = (enum llama_ftype) file.read_u32();

     // LLaMAv2
-    hparams.n_head_kv = read_u32("llama.attention.head_count_kv");
+    // hparams.n_head_kv = read_u32("llama.attention.head_count_kv");
 }

 void read_vocab() {
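read_hparams() resolves each of those "llama.*" keys by name in the GGUF key-value store. A minimal sketch of the read_u32 helper it leans on, assuming the standard gguf lookup calls (gguf_find_key returns -1 for a missing key):

    uint32_t read_u32(const char * key) {
        const int i = gguf_find_key(gguf_ctx, key);
        if (i == -1) {
            throw std::runtime_error(format("cannot find param with key %s\n", key));
        }
        return gguf_get_val_u32(gguf_ctx, i);  // kv pair must hold a GGUF_TYPE_UINT32
    }

The two reads commented out in this hunk (the derived n_mult, and llama.attention.head_count_kv for LLaMAv2's grouped-query attention) are deferred for now, consistent with the WIP status in the commit message.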