diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index 4d7d2716e..152b9adb0 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -127,6 +127,8 @@ extern "C" {
         bool async;
         // pinned host buffer
         bool host_buffer;
+        // creating buffers from host ptr
+        bool buffer_from_host_ptr;
         // event synchronization
         bool events;
     };
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index afe64b105..d638ae1bb 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -463,6 +463,7 @@ enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device) {
 }
 
 void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props) {
+    memset(props, 0, sizeof(*props));
     device->iface.get_props(device, props);
 }
 
@@ -1129,9 +1130,10 @@ static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggm
     props->type = ggml_backend_cpu_device_get_type(dev);
     ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = {
-        /* async */ false,
-        /* host_buffer */ false,
-        /* events */ false,
+        /* .async                = */ false,
+        /* .host_buffer          = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events               = */ false,
     };
 }
 
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 5b6f605b0..edb61abdf 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2920,9 +2920,10 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
 #endif
 
     props->caps = {
-        /* async */ true,
-        /* host_buffer */ host_buffer,
-        /* events */ events,
+        /* .async                = */ true,
+        /* .host_buffer          = */ host_buffer,
+        /* .buffer_from_host_ptr = */ false,
+        /* .events               = */ events,
     };
 }
 
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index ee4dbd246..ce04552ba 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -3567,12 +3567,14 @@ static const char * ggml_backend_metal_device_get_description(ggml_backend_dev_t
 }
 
 static void ggml_backend_metal_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
-    // TODO
-    *free = 0;
-    *total = 0;
-
     if (@available(macOS 10.12, iOS 16.0, *)) {
-        *total = g_state.mtl_device.recommendedMaxWorkingSetSize;
+        id<MTLDevice> device = ggml_backend_metal_get_device();
+        *total = device.recommendedMaxWorkingSetSize;
+        *free  = *total - device.currentAllocatedSize;
+        ggml_backend_metal_free_device();
+    } else {
+        *free = 1;
+        *total = 1;
     }
 
     GGML_UNUSED(dev);
@@ -3590,9 +3592,10 @@ static void ggml_backend_metal_device_get_props(ggml_backend_dev_t dev, struct g
     props->type = ggml_backend_metal_device_get_type(dev);
     ggml_backend_metal_device_get_memory(dev, &props->memory_free, &props->memory_total);
     props->caps = (struct ggml_backend_dev_caps) {
-        /* async */ false,
-        /* host_buffer */ false,
-        /* events */ false,
+        /* .async                = */ false,
+        /* .host_buffer          = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events               = */ false,
     };
 }
 
diff --git a/src/llama.cpp b/src/llama.cpp
index a4581c2dd..77df74723 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -8907,20 +8907,30 @@ static bool llm_load_tensors(
         llama_buf_map bufs;
         bufs.reserve(n_max_backend_buffer);
 
-        // only the mmap region containing the tensors in the model is mapped to the backend buffer
-        // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
-        // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
-        if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(model, true)) {
+        // check if this backend device supports buffer_from_host_ptr
+        ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+        bool buffer_from_host_ptr_supported = false;
+        if (dev) {
+            ggml_backend_dev_props props;
+            ggml_backend_dev_get_props(dev, &props);
+            buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+        }
+
+        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+                // only the mmap region containing the tensors in the model is mapped to the backend buffer
+                // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+                // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
                 void * addr = nullptr;
-                size_t first, last;
+                size_t first, last; // NOLINT
                 ml.get_mapping_range(&first, &last, &addr, idx, ctx);
                 if (first >= last) {
                     continue;
                 }
-                ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first);
+                const size_t max_size = ggml_get_max_tensor_size(ctx);
+                ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
-                    throw std::runtime_error("unable to allocate backend CPU buffer");
+                    throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
                 }
                 model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
@@ -8929,7 +8939,7 @@ static bool llm_load_tensors(
         else {
             ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
             if (buf == nullptr) {
-                throw std::runtime_error("unable to allocate backend buffer");
+                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
             }
             model.bufs.push_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
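
For context, a minimal usage sketch of the new capability, not part of the patch: enumerate devices, read props.caps.buffer_from_host_ptr via ggml_backend_dev_get_props, and wrap an existing host allocation with ggml_backend_dev_buffer_from_host_ptr; this mirrors the llm_load_tensors change above, which applies the same pattern to the mmap'd model region. The malloc'd buffer and the enumeration loop here are illustrative assumptions, not code from this diff.

// usage sketch -- illustrative only, not part of this diff
#include <stdio.h>
#include <stdlib.h>

#include "ggml-backend.h"

int main(void) {
    // stand-in for an existing host allocation (in llama.cpp this is the mmap'd model region)
    const size_t size = 16u * 1024u * 1024u;
    void * data = malloc(size);
    if (data == NULL) {
        return 1;
    }

    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);

        // with this patch, ggml_backend_dev_get_props() zero-initializes props first,
        // so caps that a backend does not set explicitly read as false
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        if (!props.caps.buffer_from_host_ptr) {
            printf("%s: buffer_from_host_ptr not supported\n", ggml_backend_dev_name(dev));
            continue;
        }

        // wrap the host pointer in a backend buffer without copying;
        // the last argument is the maximum tensor size that will be placed in the buffer
        ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, data, size, size);
        if (buf == NULL) {
            fprintf(stderr, "%s: buffer_from_host_ptr failed\n", ggml_backend_dev_name(dev));
            continue;
        }

        printf("%s: created a %zu-byte buffer from a host pointer\n", ggml_backend_dev_name(dev), size);
        ggml_backend_buffer_free(buf);
    }

    free(data);
    return 0;
}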