Revert prompt processing on the GPU for now.

Fixes issues #1580 and #1581
Adam Treat 2023-10-27 18:32:51 -04:00 committed by cebtenzzre
parent e006d377dd
commit a5eb001eab

@@ -3870,11 +3870,19 @@ static bool llama_eval_internal(
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
 #elif defined(GGML_USE_KOMPUTE)
-    if (lctx.ctx_kompute) {
+    if (lctx.ctx_kompute && N == 1) {
         ggml_vk_graph_compute(lctx.ctx_kompute, gf);
         ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
     } else {
+        if (lctx.ctx_kompute) {
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v);
+        }
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
+        if (lctx.ctx_kompute) {
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v);
+        }
     }
 #else
     ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
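
For context, a commented sketch of the control flow this hunk produces, using only the names visible in the diff (N is the batch token count in llama_eval_internal; the semantics of the ggml_vk_* helpers are assumed from their names, not confirmed here). Single-token decode stays on the Kompute/Vulkan device; multi-token prompt batches fall back to the CPU, with the KV cache copied device-to-host before the CPU pass and host-to-device afterwards:

    // Sketch only, not the verbatim patch: how the Kompute branch
    // behaves after this commit.
    #elif defined(GGML_USE_KOMPUTE)
        if (lctx.ctx_kompute && N == 1) {
            // Single-token decode: run the whole graph on the Vulkan
            // device, then copy just the result tensor back to the host.
            ggml_vk_graph_compute(lctx.ctx_kompute, gf);
            ggml_vk_d2h_tensor(lctx.ctx_kompute, res);
        } else {
            // Prompt processing (N > 1) is reverted to the CPU path.
            // The KV cache lives in device memory, so download it first...
            if (lctx.ctx_kompute) {
                ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k);
                ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v);
            }
            ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
            // ...then upload the K/V tensors the CPU pass just updated,
            // so later N == 1 decodes on the GPU see a consistent cache.
            if (lctx.ctx_kompute) {
                ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k);
                ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v);
            }
        }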