mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-11 21:39:52 +00:00
llama : fix Metal KV cache sync (close #1695)
This commit is contained in:
parent
827f5eda91
commit
d1f563a743
@@ -1455,6 +1455,14 @@ static bool llama_eval_internal(
|
||||
// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
|
||||
// But for now, we have focused only on Matrix x Vector Metal multiplication.
|
||||
//
|
||||
// TODO: avoid these syncs via shared memory (ref #1696)
|
||||
//
|
||||
if (lctx.ctx_metal) {
|
||||
// We need to sync the GPU KV cache with the CPU KV cache
|
||||
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
|
||||
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
||||
}
|
||||
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
||||
if (lctx.ctx_metal) {
|
||||
|
Loading…
Reference in New Issue
Block a user