mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-24 10:24:35 +00:00
llama : fix Metal KV cache sync (close #1695)
This commit is contained in:
parent
827f5eda91
commit
d1f563a743
@ -1455,6 +1455,14 @@ static bool llama_eval_internal(
|
|||||||
// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
|
// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
|
||||||
// But for now, we have focused only on Matrix x Vector Metal multiplication.
|
// But for now, we have focused only on Matrix x Vector Metal multiplication.
|
||||||
//
|
//
|
||||||
|
// TODO: avoid these syncs via shared memory (ref #1696)
|
||||||
|
//
|
||||||
|
if (lctx.ctx_metal) {
|
||||||
|
// We need to sync the GPU KV cache with the CPU KV cache
|
||||||
|
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
|
||||||
|
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute(ctx0, &gf);
|
||||||
|
|
||||||
if (lctx.ctx_metal) {
|
if (lctx.ctx_metal) {
|
||||||
|
Loading…
Reference in New Issue
Block a user