commit 5ea66f4354
parent 4ef1b017af
Author:    slaren
Committer: Georgi Gerganov
Date:      2024-10-06 00:37:25 +02:00
5 changed files with 40 additions and 22 deletions


@@ -127,6 +127,8 @@ extern "C" {
         bool async;
         // pinned host buffer
         bool host_buffer;
+        // creating buffers from host ptr
+        bool buffer_from_host_ptr;
         // event synchronization
         bool events;
     };
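
The new capability flag is meant to be read through `ggml_backend_dev_get_props()`. A minimal consumer-side sketch (not part of the commit), assuming the device registry helpers `ggml_backend_dev_count`/`ggml_backend_dev_get`/`ggml_backend_dev_name` from the same header:

```cpp
// sketch: enumerate the registered devices and print whether each one
// can wrap existing host memory in a backend buffer
#include "ggml-backend.h"

#include <cstdio>

int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        std::printf("%s: buffer_from_host_ptr = %s\n",
            ggml_backend_dev_name(dev),
            props.caps.buffer_from_host_ptr ? "yes" : "no");
    }
    return 0;
}
```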


@@ -463,6 +463,7 @@ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
 }
 
 void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
+    memset(props, 0, sizeof(*props));
     device->iface.get_props(device, props);
 }
@@ -1129,9 +1130,10 @@ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggm
     props->type = ggml_backend_cpu_device_get_type(dev);
     ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = {
-        /* async       */ false,
-        /* host_buffer */ false,
-        /* events      */ false,
+        /* .async                = */ false,
+        /* .host_buffer          = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events               = */ false,
     };
 }
 
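
The `memset` added above is what makes inserting `buffer_from_host_ptr` into the middle of the caps struct safe: a `get_props` implementation that predates the field, or simply forgets to set it, leaves callers with a well-defined `false` rather than uninitialized memory. A sketch with a hypothetical out-of-tree backend (illustration only; the name and device are made up):

```cpp
// hypothetical backend callback that only fills the fields it knows about
static void my_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->name        = "MyDevice";
    props->description = "hypothetical accelerator";
    props->type        = GGML_BACKEND_DEVICE_TYPE_GPU;

    props->caps.async       = true;
    props->caps.host_buffer = false;
    props->caps.events      = false;
    // caps.buffer_from_host_ptr is never touched here; because
    // ggml_backend_dev_get_props() now zeroes *props before dispatching,
    // callers still observe `false` instead of garbage

    GGML_UNUSED(dev);
}
```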


@@ -2920,9 +2920,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
 #endif
 
     props->caps = {
-        /* async       */ true,
-        /* host_buffer */ host_buffer,
-        /* events      */ events,
+        /* .async                = */ true,
+        /* .host_buffer          = */ host_buffer,
+        /* .buffer_from_host_ptr = */ false,
+        /* .events               = */ events,
     };
 }


@@ -3567,12 +3567,14 @@ static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_
 }
 
 static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
-
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        *total = g_state.mtl_device.recommendedMaxWorkingSetSize;
+        id<MTLDevice> device = ggml_backend_metal_get_device();
+        *total = device.recommendedMaxWorkingSetSize;
+        *free = *total - device.currentAllocatedSize;
+        ggml_backend_metal_free_device();
+    } else {
+        *free = 1;
+        *total = 1;
     }
 
     GGML_UNUSED(dev);
@@ -3590,9 +3592,10 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct g
     props->type = ggml_backend_metal_device_get_type(dev);
     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = (struct ggml_backend_dev_caps) {
-        /* async       */ false,
-        /* host_buffer */ false,
-        /* events      */ false,
+        /* .async                = */ false,
+        /* .host_buffer          = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events               = */ false,
     };
 }
 


@@ -8907,20 +8907,30 @@ static bool llm_load_tensors(
             llama_buf_map bufs;
             bufs.reserve(n_max_backend_buffer);
 
-            // only the mmap region containing the tensors in the model is mapped to the backend buffer
-            // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
-            // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-            if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(model, true)) {
+            // check if this backend device supports buffer_from_host_ptr
+            ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+            bool buffer_from_host_ptr_supported = false;
+            if (dev) {
+                ggml_backend_dev_props props;
+                ggml_backend_dev_get_props(dev, &props);
+                buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+            }
+
+            if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported) {
                 for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                    // only the mmap region containing the tensors in the model is mapped to the backend buffer
+                    // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+                    // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                     void * addr = nullptr;
-                    size_t first, last;
+                    size_t first, last; // NOLINT
                     ml.get_mapping_range(&first, &last, &addr, idx, ctx);
                     if (first >= last) {
                         continue;
                     }
-                    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
+                    const size_t max_size = ggml_get_max_tensor_size(ctx);
+                    ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                     if (buf == nullptr) {
-                        throw std::runtime_error("unable to allocate backend CPU buffer");
+                        throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                     }
                     model.bufs.push_back(buf);
                     bufs.emplace(idx, buf);
@@ -8929,7 +8939,7 @@ static bool llm_load_tensors(
             else {
                 ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
                 if (buf == nullptr) {
-                    throw std::runtime_error("unable to allocate backend buffer");
+                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
                 model.bufs.push_back(buf);
                 if (use_mlock && ggml_backend_buffer_is_host(buf)) {
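
Taken together, the loading path above generalizes what used to be a CPU-only mmap special case: any device that advertises `buffer_from_host_ptr` can now alias the mmap'ed model file instead of copying it. A condensed sketch of the same pattern outside `llm_load_tensors`, where `addr`, `size`, and `max_tensor_size` are placeholders for the mapping range and largest tensor the caller has already computed:

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// prefer wrapping existing host memory when the device supports it,
// otherwise allocate fresh backend memory for all tensors in ctx
static ggml_backend_buffer_t wrap_or_alloc(
        ggml_backend_buffer_type_t buft, struct ggml_context * ctx,
        void * addr, size_t size, size_t max_tensor_size) {
    ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);

    bool supported = false;
    if (dev) {
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);
        supported = props.caps.buffer_from_host_ptr;
    }

    if (supported) {
        // the resulting buffer aliases the caller's memory; no copy is made
        return ggml_backend_dev_buffer_from_host_ptr(dev, addr, size, max_tensor_size);
    }

    return ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
}
```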