mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-14 06:49:54 +00:00
llama : initial ggml-backend integration (#4520)
* llama : initial ggml-backend integration * add ggml-metal * cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST access all tensor data with ggml_backend_tensor_get/set * add ggml_backend_buffer_clear zero-init KV cache buffer * add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data * disable gpu backends with ngl 0 * more accurate mlock * unmap offloaded part of the model * use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap * update quantize and lora * update session copy/set to use ggml-backend ggml-ci * use posix_fadvise instead of posix_fadvise64 * ggml_backend_alloc_ctx_tensors_from_buft : remove old print * llama_mmap::align_offset : use pointers instead of references for out parameters * restore progress_callback behavior * move final progress_callback call to load_all_data * cuda : fix fprintf format string (minor) * do not offload scales * llama_mmap : avoid unmapping the same fragments again in the destructor * remove unnecessary unmap * metal : add default log function that prints to stderr, cleanup code ggml-ci --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
31f27758fa
commit
d232aca5a7
2
Makefile
2
Makefile
@ -65,7 +65,7 @@ test: $(TEST_TARGETS)
|
|||||||
./$$test_target; \
|
./$$test_target; \
|
||||||
fi; \
|
fi; \
|
||||||
if [ $$? -ne 0 ]; then \
|
if [ $$? -ne 0 ]; then \
|
||||||
printf 'Test $$test_target FAILED!\n\n' $$test_target; \
|
printf 'Test %s FAILED!\n\n' $$test_target; \
|
||||||
failures=$$(( failures + 1 )); \
|
failures=$$(( failures + 1 )); \
|
||||||
else \
|
else \
|
||||||
printf 'Test %s passed.\n\n' $$test_target; \
|
printf 'Test %s passed.\n\n' $$test_target; \
|
||||||
|
16
ggml-alloc.c
16
ggml-alloc.c
@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
|
|||||||
if (update_backend) {
|
if (update_backend) {
|
||||||
view->backend = view->view_src->backend;
|
view->backend = view->view_src->backend;
|
||||||
}
|
}
|
||||||
view->buffer = view->view_src->buffer;
|
// views are initialized in the alloc buffer rather than the view_src buffer
|
||||||
|
view->buffer = alloc->buffer;
|
||||||
view->data = (char *)view->view_src->data + view->view_offs;
|
view->data = (char *)view->view_src->data + view->view_offs;
|
||||||
|
|
||||||
// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
|
|
||||||
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
|
|
||||||
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
|
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
|
||||||
|
|
||||||
if (!alloc->measure) {
|
if (!alloc->measure) {
|
||||||
@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void ggml_allocr_free(ggml_allocr_t alloc) {
|
void ggml_allocr_free(ggml_allocr_t alloc) {
|
||||||
|
if (alloc == NULL) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
ggml_gallocr_free(alloc->galloc);
|
ggml_gallocr_free(alloc->galloc);
|
||||||
ggml_tallocr_free(alloc->talloc);
|
ggml_tallocr_free(alloc->talloc);
|
||||||
free(alloc);
|
free(alloc);
|
||||||
@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (nbytes == 0) {
|
if (nbytes == 0) {
|
||||||
fprintf(stderr, "%s: no tensors to allocate\n", __func__);
|
// all the tensors in the context are already allocated
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|||||||
} else {
|
} else {
|
||||||
ggml_backend_view_init(buffer, t);
|
ggml_backend_view_init(buffer, t);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
if (t->view_src != NULL) {
|
||||||
|
// view of a pre-allocated tensor
|
||||||
|
ggml_backend_view_init(buffer, t);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,6 +20,9 @@ extern "C" {
|
|||||||
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
||||||
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||||
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
||||||
|
// check if tensor data is in host memory
|
||||||
|
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
||||||
|
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_buffer_type {
|
struct ggml_backend_buffer_type {
|
||||||
@ -40,6 +43,7 @@ extern "C" {
|
|||||||
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
|
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
|
||||||
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_buffer {
|
struct ggml_backend_buffer {
|
||||||
|
@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
|
|||||||
return buft->iface.supports_backend(buft, backend);
|
return buft->iface.supports_backend(buft, backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
if (buft->iface.is_host) {
|
||||||
|
return buft->iface.is_host(buft);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// backend buffer
|
// backend buffer
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||||
@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
|
|||||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
|
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
buffer->iface.clear(buffer, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
||||||
|
return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
|
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
|
||||||
return buffer->buft;
|
return buffer->buft;
|
||||||
}
|
}
|
||||||
@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|||||||
|
|
||||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
free(buffer->context);
|
free(buffer->context);
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
|
|||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
memset(buffer->context, value, buffer->size);
|
||||||
|
}
|
||||||
|
|
||||||
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
||||||
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||||
@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
|||||||
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
||||||
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
|
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
|
||||||
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
||||||
|
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
||||||
};
|
};
|
||||||
|
|
||||||
// for buffers from ptr, free is not called
|
// for buffers from ptr, free is not called
|
||||||
@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
|||||||
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
||||||
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
|
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
|
||||||
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
||||||
|
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
||||||
@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
|
|||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
return true;
|
||||||
|
|
||||||
|
GGML_UNUSED(buft);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
||||||
/* .iface = */ {
|
/* .iface = */ {
|
||||||
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
||||||
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
return &ggml_backend_buffer_type_cpu;
|
return &ggml_backend_cpu_buffer_type;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
|
||||||
|
// buffer type HBM
|
||||||
|
|
||||||
|
#include <hbwmalloc.h>
|
||||||
|
|
||||||
|
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
|
hbw_free(buffer->context);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
|
//void * ptr = hbw_malloc(size);
|
||||||
|
void * ptr;
|
||||||
|
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
||||||
|
if (result != 0) {
|
||||||
|
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME: this is a hack to avoid having to implement a new buffer type
|
||||||
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||||
|
buffer->buft = buft;
|
||||||
|
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
||||||
|
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
|
||||||
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
||||||
|
/* .iface = */ {
|
||||||
|
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
||||||
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
||||||
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
|
},
|
||||||
|
/* .context = */ NULL,
|
||||||
|
};
|
||||||
|
|
||||||
|
return &ggml_backend_cpu_buffer_type_hbm;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
struct ggml_backend_cpu_context {
|
struct ggml_backend_cpu_context {
|
||||||
int n_threads;
|
int n_threads;
|
||||||
void * work_data;
|
void * work_data;
|
||||||
@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
|
|||||||
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
||||||
|
|
||||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||||
cpu_plan->cgraph = *cgraph;
|
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
||||||
|
|
||||||
if (cpu_plan->cplan.work_size > 0) {
|
if (cpu_plan->cplan.work_size > 0) {
|
||||||
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
||||||
@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
|
|||||||
// utils
|
// utils
|
||||||
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(tensor->buffer == NULL);
|
GGML_ASSERT(tensor->buffer == NULL);
|
||||||
GGML_ASSERT(tensor->data == NULL);
|
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
|
||||||
GGML_ASSERT(tensor->view_src != NULL);
|
GGML_ASSERT(tensor->view_src != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->data != NULL);
|
GGML_ASSERT(tensor->view_src->data != NULL);
|
||||||
|
@ -21,6 +21,7 @@ extern "C" {
|
|||||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||||
|
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||||
|
|
||||||
// buffer
|
// buffer
|
||||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||||
@ -29,6 +30,8 @@ extern "C" {
|
|||||||
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
|
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||||
|
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
|
||||||
|
|
||||||
//
|
//
|
||||||
@ -76,6 +79,10 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||||
|
#endif
|
||||||
|
|
||||||
//
|
//
|
||||||
// Backend registry
|
// Backend registry
|
||||||
//
|
//
|
||||||
|
81
ggml-cuda.cu
81
ggml-cuda.cu
@ -9226,11 +9226,10 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
|
|||||||
|
|
||||||
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
|
||||||
|
|
||||||
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
const bool inplace = tensor->view_src != nullptr;
|
||||||
tensor->op == GGML_OP_VIEW;
|
|
||||||
|
|
||||||
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
|
||||||
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
|
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
|
||||||
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
||||||
size_t view_offset = 0;
|
size_t view_offset = 0;
|
||||||
if (tensor->op == GGML_OP_VIEW) {
|
if (tensor->op == GGML_OP_VIEW) {
|
||||||
@ -9317,7 +9316,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|||||||
if (tensor->op == GGML_OP_MUL_MAT) {
|
if (tensor->op == GGML_OP_MUL_MAT) {
|
||||||
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
|
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
|
fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
|
||||||
#endif
|
#endif
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -9523,7 +9522,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||||
|
|
||||||
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
||||||
assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
||||||
tensor->backend = tensor->view_src->backend;
|
tensor->backend = tensor->view_src->backend;
|
||||||
tensor->extra = tensor->view_src->extra;
|
tensor->extra = tensor->view_src->extra;
|
||||||
return;
|
return;
|
||||||
@ -9554,23 +9553,34 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||||
|
|
||||||
UNUSED(buffer);
|
ggml_cuda_set_device(ctx->device);
|
||||||
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
|
|
||||||
|
CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||||
|
|
||||||
UNUSED(buffer);
|
ggml_cuda_set_device(ctx->device);
|
||||||
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
|
|
||||||
|
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||||
|
|
||||||
|
ggml_cuda_set_device(ctx->device);
|
||||||
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
|
|
||||||
|
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
|
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
|
||||||
@ -9581,6 +9591,7 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
|
|||||||
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
|
||||||
/* .cpy_tensor_from = */ NULL,
|
/* .cpy_tensor_from = */ NULL,
|
||||||
/* .cpy_tensor_to = */ NULL,
|
/* .cpy_tensor_to = */ NULL,
|
||||||
|
/* .clear = */ ggml_backend_cuda_buffer_clear,
|
||||||
};
|
};
|
||||||
|
|
||||||
// cuda buffer type
|
// cuda buffer type
|
||||||
@ -9632,35 +9643,36 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
|
|||||||
UNUSED(buft);
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
|
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||||
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
||||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
|
||||||
|
/* .is_host = */ nullptr,
|
||||||
};
|
};
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
|
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
|
||||||
static bool ggml_backend_buffer_type_cuda_initialized = false;
|
|
||||||
if (!ggml_backend_buffer_type_cuda_initialized) {
|
static bool ggml_backend_cuda_buffer_type_initialized = false;
|
||||||
|
|
||||||
|
if (!ggml_backend_cuda_buffer_type_initialized) {
|
||||||
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
||||||
ggml_backend_buffer_type_cuda[i] = {
|
ggml_backend_cuda_buffer_types[i] = {
|
||||||
/* .iface = */ cuda_backend_buffer_type_interface,
|
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
||||||
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
|
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
ggml_backend_buffer_type_cuda_initialized = true;
|
ggml_backend_cuda_buffer_type_initialized = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
return &ggml_backend_buffer_type_cuda[device];
|
return &ggml_backend_cuda_buffer_types[device];
|
||||||
}
|
}
|
||||||
|
|
||||||
// host buffer type
|
// host buffer type
|
||||||
|
|
||||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
CUDA_CHECK(cudaFreeHost(buffer->context));
|
||||||
CUDA_CHECK(cudaFreeHost(ctx->dev_ptr));
|
|
||||||
delete ctx;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
@ -9673,24 +9685,21 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
|||||||
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
||||||
|
|
||||||
return buffer;
|
return buffer;
|
||||||
|
|
||||||
UNUSED(buft);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
|
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||||
|
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
||||||
|
/* .iface = */ {
|
||||||
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
|
||||||
};
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||||
|
},
|
||||||
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|
||||||
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
|
|
||||||
/* .iface = */ cuda_backend_host_buffer_type_interface,
|
|
||||||
/* .context = */ nullptr,
|
/* .context = */ nullptr,
|
||||||
};
|
};
|
||||||
|
|
||||||
return &ggml_backend_buffer_type_cuda_host;
|
return &ggml_backend_cuda_buffer_type_host;
|
||||||
}
|
}
|
||||||
|
|
||||||
// backend
|
// backend
|
||||||
@ -9722,8 +9731,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
|
|||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||||
|
|
||||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
|
||||||
@ -9733,8 +9740,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
|
|||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||||
|
|
||||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
|
||||||
|
@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
|||||||
|
|
||||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||||
|
|
||||||
|
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
||||||
|
|
||||||
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||||
|
|
||||||
// helper to check if the device supports a specific family
|
// helper to check if the device supports a specific family
|
||||||
|
226
ggml-metal.m
226
ggml-metal.m
@ -180,7 +180,15 @@ struct ggml_metal_context {
|
|||||||
@implementation GGMLMetalClass
|
@implementation GGMLMetalClass
|
||||||
@end
|
@end
|
||||||
|
|
||||||
ggml_log_callback ggml_metal_log_callback = NULL;
|
|
||||||
|
static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
|
||||||
|
fprintf(stderr, "%s", msg);
|
||||||
|
|
||||||
|
UNUSED(level);
|
||||||
|
UNUSED(user_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
|
||||||
void * ggml_metal_log_user_data = NULL;
|
void * ggml_metal_log_user_data = NULL;
|
||||||
|
|
||||||
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
||||||
@ -607,12 +615,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// temporarily defined here for compatibility between ggml-backend and the old API
|
// temporarily defined here for compatibility between ggml-backend and the old API
|
||||||
struct ggml_backend_metal_buffer_context {
|
|
||||||
|
struct ggml_backend_metal_buffer {
|
||||||
void * data;
|
void * data;
|
||||||
|
size_t size;
|
||||||
|
|
||||||
id<MTLBuffer> metal;
|
id<MTLBuffer> metal;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct ggml_backend_metal_buffer_context {
|
||||||
|
void * all_data;
|
||||||
|
size_t all_size;
|
||||||
|
bool owned;
|
||||||
|
|
||||||
|
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
|
||||||
|
int n_buffers;
|
||||||
|
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
||||||
|
};
|
||||||
|
|
||||||
// finds the Metal buffer that contains the tensor data on the GPU device
|
// finds the Metal buffer that contains the tensor data on the GPU device
|
||||||
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
||||||
// Metal buffer based on the host memory pointer
|
// Metal buffer based on the host memory pointer
|
||||||
@ -622,17 +642,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
|||||||
|
|
||||||
const int64_t tsize = ggml_nbytes(t);
|
const int64_t tsize = ggml_nbytes(t);
|
||||||
|
|
||||||
|
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
|
||||||
|
|
||||||
// compatibility with ggml-backend
|
// compatibility with ggml-backend
|
||||||
if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
|
if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
|
||||||
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
|
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
|
||||||
|
|
||||||
const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
|
// find the view that contains the tensor fully
|
||||||
|
for (int i = 0; i < buf_ctx->n_buffers; ++i) {
|
||||||
GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size);
|
const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
|
||||||
|
|
||||||
|
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
|
||||||
|
if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
|
||||||
*offs = (size_t) ioffs;
|
*offs = (size_t) ioffs;
|
||||||
|
|
||||||
return buf_ctx->metal;
|
//GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
|
||||||
|
|
||||||
|
return buf_ctx->buffers[i].metal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
|
||||||
|
|
||||||
|
return nil;
|
||||||
}
|
}
|
||||||
|
|
||||||
// find the view that contains the tensor fully
|
// find the view that contains the tensor fully
|
||||||
@ -2361,6 +2393,7 @@ void ggml_metal_graph_compute(
|
|||||||
|
|
||||||
// backend interface
|
// backend interface
|
||||||
|
|
||||||
|
// default buffer
|
||||||
static id<MTLDevice> g_backend_device = nil;
|
static id<MTLDevice> g_backend_device = nil;
|
||||||
static int g_backend_device_ref_count = 0;
|
static int g_backend_device_ref_count = 0;
|
||||||
|
|
||||||
@ -2388,34 +2421,31 @@ static void ggml_backend_metal_free_device(void) {
|
|||||||
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
return ctx->data;
|
return ctx->all_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
[ctx->metal release];
|
for (int i = 0; i < ctx->n_buffers; i++) {
|
||||||
|
[ctx->buffers[i].metal release];
|
||||||
|
}
|
||||||
ggml_backend_metal_free_device();
|
ggml_backend_metal_free_device();
|
||||||
|
|
||||||
free(ctx->data);
|
if (ctx->owned) {
|
||||||
free(ctx);
|
free(ctx->all_data);
|
||||||
|
}
|
||||||
|
|
||||||
UNUSED(buffer);
|
free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
|
|
||||||
memcpy((char *)tensor->data + offset, data, size);
|
memcpy((char *)tensor->data + offset, data, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
|
|
||||||
memcpy(data, (const char *)tensor->data + offset, size);
|
memcpy(data, (const char *)tensor->data + offset, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
@ -2433,7 +2463,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
|
|||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_backend_buffer_i metal_backend_buffer_i = {
|
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
|
memset(ctx->all_data, value, ctx->all_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
||||||
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
||||||
/* .init_tensor = */ NULL,
|
/* .init_tensor = */ NULL,
|
||||||
@ -2441,8 +2477,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
|
|||||||
/* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
|
||||||
/* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
|
/* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
|
||||||
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
|
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
|
||||||
|
/* .clear = */ ggml_backend_metal_buffer_clear,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// default buffer type
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
||||||
|
|
||||||
@ -2453,13 +2492,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
|
|||||||
size_aligned += (size_page - (size_aligned % size_page));
|
size_aligned += (size_page - (size_aligned % size_page));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->data = ggml_metal_host_malloc(size);
|
id<MTLDevice> device = ggml_backend_metal_get_device();
|
||||||
ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
|
|
||||||
|
ctx->all_data = ggml_metal_host_malloc(size_aligned);
|
||||||
|
ctx->all_size = size_aligned;
|
||||||
|
ctx->owned = true;
|
||||||
|
ctx->n_buffers = 1;
|
||||||
|
|
||||||
|
ctx->buffers[0].data = ctx->all_data;
|
||||||
|
ctx->buffers[0].size = size;
|
||||||
|
ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
|
||||||
length:size_aligned
|
length:size_aligned
|
||||||
options:MTLResourceStorageModeShared
|
options:MTLResourceStorageModeShared
|
||||||
deallocator:nil];
|
deallocator:nil];
|
||||||
|
|
||||||
return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
|
if (ctx->buffers[0].metal == nil) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
||||||
|
free(ctx);
|
||||||
|
ggml_backend_metal_free_device();
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
|
||||||
|
#if TARGET_OS_OSX
|
||||||
|
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
||||||
|
device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||||
|
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
||||||
|
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
||||||
|
} else {
|
||||||
|
GGML_METAL_LOG_INFO("\n");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
@ -2470,7 +2542,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
|
|||||||
static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||||
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
UNUSED(buft);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
return true;
|
||||||
|
|
||||||
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
@ -2480,6 +2558,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
|||||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
|
||||||
|
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
};
|
};
|
||||||
@ -2487,6 +2566,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
|||||||
return &ggml_backend_buffer_type_metal;
|
return &ggml_backend_buffer_type_metal;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// buffer from ptr
|
||||||
|
|
||||||
|
ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
|
||||||
|
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
||||||
|
|
||||||
|
ctx->all_data = data;
|
||||||
|
ctx->all_size = size;
|
||||||
|
ctx->owned = false;
|
||||||
|
ctx->n_buffers = 0;
|
||||||
|
|
||||||
|
const size_t size_page = sysconf(_SC_PAGESIZE);
|
||||||
|
size_t size_aligned = size;
|
||||||
|
if ((size_aligned % size_page) != 0) {
|
||||||
|
size_aligned += (size_page - (size_aligned % size_page));
|
||||||
|
}
|
||||||
|
|
||||||
|
id<MTLDevice> device = ggml_backend_metal_get_device();
|
||||||
|
|
||||||
|
// the buffer fits into the max buffer size allowed by the device
|
||||||
|
if (size_aligned <= device.maxBufferLength) {
|
||||||
|
ctx->buffers[ctx->n_buffers].data = data;
|
||||||
|
ctx->buffers[ctx->n_buffers].size = size;
|
||||||
|
|
||||||
|
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||||
|
|
||||||
|
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
++ctx->n_buffers;
|
||||||
|
} else {
|
||||||
|
// this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
|
||||||
|
// one of the views
|
||||||
|
const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
|
||||||
|
const size_t size_step = device.maxBufferLength - size_ovlp;
|
||||||
|
const size_t size_view = device.maxBufferLength;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < size; i += size_step) {
|
||||||
|
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
|
||||||
|
|
||||||
|
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
|
||||||
|
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
|
||||||
|
|
||||||
|
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||||
|
|
||||||
|
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
|
||||||
|
if (i + size_step < size) {
|
||||||
|
GGML_METAL_LOG_INFO("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
++ctx->n_buffers;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if TARGET_OS_OSX
|
||||||
|
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
||||||
|
device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||||
|
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
||||||
|
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
||||||
|
} else {
|
||||||
|
GGML_METAL_LOG_INFO("\n");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
// backend
|
||||||
|
|
||||||
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
|
||||||
return "Metal";
|
return "Metal";
|
||||||
|
|
||||||
@ -2499,10 +2659,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
|
|||||||
free(backend);
|
free(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
|
|
||||||
UNUSED(backend);
|
|
||||||
}
|
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
return ggml_backend_metal_buffer_type();
|
return ggml_backend_metal_buffer_type();
|
||||||
|
|
||||||
@ -2529,25 +2685,15 @@ static struct ggml_backend_i metal_backend_i = {
|
|||||||
/* .get_tensor_async = */ NULL,
|
/* .get_tensor_async = */ NULL,
|
||||||
/* .cpy_tensor_from_async = */ NULL,
|
/* .cpy_tensor_from_async = */ NULL,
|
||||||
/* .cpy_tensor_to_async = */ NULL,
|
/* .cpy_tensor_to_async = */ NULL,
|
||||||
/* .synchronize = */ ggml_backend_metal_synchronize,
|
/* .synchronize = */ NULL,
|
||||||
/* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
|
/* .graph_plan_create = */ NULL,
|
||||||
/* .graph_plan_free = */ NULL,
|
/* .graph_plan_free = */ NULL,
|
||||||
/* .graph_plan_compute = */ NULL,
|
/* .graph_plan_compute = */ NULL,
|
||||||
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
/* .graph_compute = */ ggml_backend_metal_graph_compute,
|
||||||
/* .supports_op = */ ggml_backend_metal_supports_op,
|
/* .supports_op = */ ggml_backend_metal_supports_op,
|
||||||
};
|
};
|
||||||
|
|
||||||
// TODO: make a common log callback for all backends in ggml-backend
|
|
||||||
static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
|
|
||||||
fprintf(stderr, "%s", msg);
|
|
||||||
|
|
||||||
UNUSED(level);
|
|
||||||
UNUSED(user_data);
|
|
||||||
}
|
|
||||||
|
|
||||||
ggml_backend_t ggml_backend_metal_init(void) {
|
ggml_backend_t ggml_backend_metal_init(void) {
|
||||||
ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
|
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
|
||||||
|
|
||||||
if (ctx == NULL) {
|
if (ctx == NULL) {
|
||||||
|
24
ggml.c
24
ggml.c
@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
|
|||||||
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
|
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
|
||||||
size_t max_size = 0;
|
size_t max_size = 0;
|
||||||
|
|
||||||
struct ggml_object * obj = ctx->objects_begin;
|
for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
|
||||||
|
max_size = MAX(max_size, ggml_nbytes(tensor));
|
||||||
while (obj != NULL) {
|
|
||||||
if (obj->type == GGML_OBJECT_TENSOR) {
|
|
||||||
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
|
|
||||||
|
|
||||||
const size_t size = ggml_nbytes(tensor);
|
|
||||||
|
|
||||||
if (max_size < size) {
|
|
||||||
max_size = size;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
obj = obj->next;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return max_size;
|
return max_size;
|
||||||
@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
|
struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
|
||||||
struct ggml_object * obj = ctx->objects_begin;
|
struct ggml_object * obj = ctx->objects_begin;
|
||||||
|
|
||||||
char * const mem_buffer = ctx->mem_buffer;
|
char * const mem_buffer = ctx->mem_buffer;
|
||||||
@ -3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
|
struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
|
||||||
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
|
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
|
||||||
obj = obj->next;
|
obj = obj->next;
|
||||||
|
|
||||||
@ -19213,6 +19201,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
|
|||||||
return ctx->infos[i].name.data;
|
return ctx->infos[i].name.data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
|
||||||
|
return ctx->infos[i].type;
|
||||||
|
}
|
||||||
|
|
||||||
// returns the index
|
// returns the index
|
||||||
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
|
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
|
||||||
const int idx = gguf_find_key(ctx, key);
|
const int idx = gguf_find_key(ctx, key);
|
||||||
|
5
ggml.h
5
ggml.h
@ -735,8 +735,8 @@ extern "C" {
|
|||||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
||||||
|
|
||||||
// Context tensor enumeration and lookup
|
// Context tensor enumeration and lookup
|
||||||
GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
|
GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
|
||||||
GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
|
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||||
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||||
@ -2139,6 +2139,7 @@ extern "C" {
|
|||||||
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
||||||
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
||||||
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
||||||
|
GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);
|
||||||
|
|
||||||
// overrides existing values or adds a new one
|
// overrides existing values or adds a new one
|
||||||
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
||||||
|
Loading…
Reference in New Issue
Block a user