From ba2135ccae7462470b3865c6e41d2e1d734eac05 Mon Sep 17 00:00:00 2001
From: slaren
Date: Wed, 21 Feb 2024 22:18:23 +0100
Subject: [PATCH] gemma : allow offloading the output tensor (#5646)

---
 llama.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 3a226c426..4054d5da6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4394,6 +4394,8 @@ static bool llm_load_tensors(
 
                     // output
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output      = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
+                    ml.n_created--; // artificial tensor
 
                     const int64_t n_ff          = hparams.n_ff;
                     const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -7525,7 +7527,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.tok_embd, cur);
+        cur = ggml_mul_mat(ctx0, model.output, cur);
         cb(cur, "result_output", -1);
 
         ggml_build_forward_expand(gf, cur);
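
Context for the change: Gemma's LM head is weight-tied to the token-embedding matrix, so the graph previously multiplied against model.tok_embd directly. Those weights are created in the input context, which is typically kept on the CPU, so the final matmul could not be offloaded. The patch registers the same weights a second time as model.output in ctx_output, where they can be placed on an offload device, and decrements ml.n_created so the loader's created-vs-expected tensor bookkeeping still matches the file. Below is a minimal standalone sketch of the weight-tying idea itself, in plain C++ with toy sizes; it is illustrative only, not llama.cpp code:

    // Sketch of a weight-tied LM head: one [n_vocab x n_embd] matrix serves
    // as both the token-embedding lookup table and the output projection,
    // which is why the patch can alias model.output to the tok_embd weights
    // instead of loading a second tensor.
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd  = 4;   // toy embedding width
        const int n_vocab = 3;   // toy vocabulary size

        // One shared weight matrix, row i = embedding of token i.
        std::vector<float> tok_embd(n_vocab * n_embd);
        for (int i = 0; i < n_vocab * n_embd; ++i) tok_embd[i] = 0.01f * i;

        // Embedding lookup: the hidden state starts as the row for token 2.
        std::vector<float> cur(tok_embd.begin() + 2 * n_embd,
                               tok_embd.begin() + 3 * n_embd);

        // Tied lm_head: logits[v] = dot(tok_embd row v, cur), i.e. the same
        // matrix reused as the output projection.
        for (int v = 0; v < n_vocab; ++v) {
            float logit = 0.0f;
            for (int j = 0; j < n_embd; ++j) {
                logit += tok_embd[v * n_embd + j] * cur[j];
            }
            std::printf("logit[%d] = %f\n", v, logit);
        }
        return 0;
    }

In ggml terms, ggml_mul_mat(ctx0, model.output, cur) in the second hunk computes exactly this per-row dot product, with model.output now referring to the same data as tok_embd but living in a context that can be offloaded.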