llama : no longer perform uninitialized access to the KV cache

2024-12-25 10:54:36 +00:00 · 2023-10-08 11:49:38 +03:00 · 2023-10-08 11:49:38 +03:00 · ee268b5446
commit ee268b5446
parent acead654d2
1 changed files with 5 additions and 4 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -1329,7 +1329,9 @@ static bool llama_kv_cache_init(
    //       cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
    //       change it and test that it works
    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
-    memset(cache.buf.data, 0, cache.buf.size);
+
+    // this is not necessary, since we should not be accessing cache data that has not been initialized yet
+    //memset(cache.buf.data, 0, cache.buf.size);

    struct ggml_init_params params;
    params.mem_size   = cache.buf.size;
@@ -1430,7 +1432,7 @@ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
        }
    }

-    return 0;
+    return 1;
 }

 static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
@@ -5020,8 +5022,7 @@ static int llama_decode_internal(
    // a heuristic, to avoid attending the full cache if it is not yet utilized
    // after enough generations, the benefit from this heuristic disappears
    // if we start defragmenting the cache, the benefit from this will be more important
-    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
-    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+    kv_self.n = llama_kv_cache_cell_max(kv_self);

    //printf("kv_self.n = %d\n", kv_self.n);