From 4b8d5e389085a1d986877dbc86976361f2778846 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 21 Apr 2023 17:42:02 +0300
Subject: [PATCH] llama : quantize attention results

---
 llama.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/llama.cpp b/llama.cpp
index 4e92f5515..70b249672 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1133,6 +1133,11 @@ static bool llama_eval_internal(
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
 
+            // re-quantize K
+            if (ggml_is_quantized(model.layers[il].wk->type)) {
+                K = ggml_cpy(ctx0, K, ggml_new_tensor_3d(ctx0, model.layers[il].wk->type, n_embd/n_head, n_past + N, n_head));
+            }
+
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
 
@@ -1157,6 +1162,11 @@ static bool llama_eval_internal(
                         il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
 
 #if 1
+            // re-quantize V
+            if (ggml_is_quantized(model.layers[il].wv->type) && ((n_past + N) % ggml_blck_size(model.layers[il].wv->type) == 0)) {
+                V = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, model.layers[il].wv->type, n_past + N, n_embd/n_head, n_head));
+            }
+
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 #else
             // make V contiguous in memory to speed up the matmul, however we waste time on the copy
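
Note (not part of the patch): both hunks rely on the fact that ggml_cpy()
converts between tensor types when the graph is evaluated, so copying the K
and V views into freshly allocated tensors of the attention weights'
quantized type re-quantizes them on the fly. The V hunk additionally guards
on (n_past + N) % ggml_blck_size(...) == 0 because a quantized row must span
a whole number of quantization blocks. A minimal sketch of the pattern,
assuming ggml's public API as of this commit; the helper name
requantize_like is illustrative, not from the patch:

    #include "ggml.h"

    // Copy `src` into a new tensor of quantized type `type`; ggml_cpy()
    // records the conversion in the graph and performs the actual
    // quantization when the graph is computed.
    static struct ggml_tensor * requantize_like(
            struct ggml_context * ctx,
            struct ggml_tensor  * src,    // e.g. the attention K or V view
            enum   ggml_type      type) { // e.g. model.layers[il].wk->type
        // skip non-quantized types, and rows that do not cover a whole
        // number of quantization blocks (the guard used on the V path)
        if (!ggml_is_quantized(type) || src->ne[0] % ggml_blck_size(type) != 0) {
            return src;
        }
        struct ggml_tensor * dst = ggml_new_tensor_3d(ctx, type,
                src->ne[0], src->ne[1], src->ne[2]);
        return ggml_cpy(ctx, src, dst);
    }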