mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-11 03:01:45 +00:00
Adjust Metal buffer allocation to avoid allocating beyond MTLDevice.recommendedMaxWorkingSetSize
This commit is contained in:
parent
b213227067
commit
da7d2f9587
@ -50,8 +50,6 @@ int main(int argc, char ** argv) {
|
|||||||
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
|
struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
|
||||||
*(int32_t *) input->data = 1; // BOS
|
*(int32_t *) input->data = 1; // BOS
|
||||||
|
|
||||||
ggml_metal_set_tensor(ctx_metal, input);
|
|
||||||
|
|
||||||
// warmup
|
// warmup
|
||||||
ggml_metal_graph_compute(ctx_metal, &gf);
|
ggml_metal_graph_compute(ctx_metal, &gf);
|
||||||
|
|
||||||
@ -72,7 +70,6 @@ int main(int argc, char ** argv) {
|
|||||||
// debug output
|
// debug output
|
||||||
{
|
{
|
||||||
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
|
struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
|
||||||
ggml_metal_get_tensor(ctx_metal, logits);
|
|
||||||
|
|
||||||
float * ptr = (float *) ggml_get_data(logits);
|
float * ptr = (float *) ggml_get_data(logits);
|
||||||
|
|
||||||
|
11
ggml-metal.h
11
ggml-metal.h
@ -13,9 +13,6 @@
|
|||||||
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
|
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
|
||||||
// used during the graph evaluation to determine the arguments of the compute kernels.
|
// used during the graph evaluation to determine the arguments of the compute kernels.
|
||||||
//
|
//
|
||||||
// Synchronization between device and host memory (for example for input and output tensors)
|
|
||||||
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
|
|
||||||
//
|
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
@ -23,7 +20,7 @@
|
|||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
// max memory buffers that can be mapped to the device
|
// max memory buffers that can be mapped to the device
|
||||||
#define GGML_METAL_MAX_BUFFERS 16
|
#define GGML_METAL_MAX_BUFFERS 256
|
||||||
|
|
||||||
struct ggml_tensor;
|
struct ggml_tensor;
|
||||||
struct ggml_cgraph;
|
struct ggml_cgraph;
|
||||||
@ -51,12 +48,6 @@ bool ggml_metal_add_buffer(
|
|||||||
size_t size,
|
size_t size,
|
||||||
size_t max_size);
|
size_t max_size);
|
||||||
|
|
||||||
// set data from host memory into the device
|
|
||||||
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
|
||||||
|
|
||||||
// get data from the device into host memory
|
|
||||||
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
|
||||||
|
|
||||||
// same as ggml_graph_compute but uses Metal
|
// same as ggml_graph_compute but uses Metal
|
||||||
// creates gf->n_threads command buffers in parallel
|
// creates gf->n_threads command buffers in parallel
|
||||||
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
||||||
|
1303
ggml-metal.m
1303
ggml-metal.m
File diff suppressed because it is too large
Load Diff
11
llama.cpp
11
llama.cpp
@ -1555,7 +1555,6 @@ static bool llama_eval_internal(
|
|||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
if (lctx.ctx_metal && N == 1) {
|
if (lctx.ctx_metal && N == 1) {
|
||||||
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
|
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
|
||||||
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
|
||||||
} else {
|
} else {
|
||||||
// IMPORTANT:
|
// IMPORTANT:
|
||||||
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
|
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
|
||||||
@ -1564,15 +1563,7 @@ static bool llama_eval_internal(
|
|||||||
//
|
//
|
||||||
// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
|
// When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
|
||||||
// But for now, we have focused only on Matrix x Vector Metal multiplication.
|
// But for now, we have focused only on Matrix x Vector Metal multiplication.
|
||||||
//
|
|
||||||
// TODO: avoid these syncs via shared memory (ref #1696)
|
|
||||||
//
|
|
||||||
if (lctx.ctx_metal) {
|
|
||||||
// We need to sync the GPU KV cache with the CPU KV cache
|
|
||||||
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
|
|
||||||
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute(ctx0, &gf);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
Loading…
Reference in New Issue
Block a user