diff --git a/ggml/include/ggml-amx.h b/ggml/include/ggml-amx.h index 22b3f70f4..042d6d919 100644 --- a/ggml/include/ggml-amx.h +++ b/ggml/include/ggml-amx.h @@ -9,16 +9,16 @@ extern "C" { #endif // buffer_type API -GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); -GGML_API bool ggml_backend_is_amx(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend); // backend API -GGML_API ggml_backend_t ggml_backend_amx_init(void); +GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void); -GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads); +GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads); -GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void); #ifdef __cplusplus } diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 125413d1b..0a65dbfca 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -3,6 +3,20 @@ #include "ggml.h" #include "ggml-alloc.h" +#ifdef GGML_BACKEND_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_BACKEND_BUILD +# define GGML_BACKEND_API __declspec(dllexport) extern +# else +# define GGML_BACKEND_API __declspec(dllimport) extern +# endif +# else +# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern +# endif +#else +# define GGML_BACKEND_API extern +#endif + #ifdef __cplusplus extern "C" { #endif diff --git a/ggml/include/ggml-blas.h b/ggml/include/ggml-blas.h index 25b2e637f..87a81b363 100644 --- a/ggml/include/ggml-blas.h +++ b/ggml/include/ggml-blas.h @@ -9,15 +9,15 @@ extern "C" { #endif // backend API -GGML_API ggml_backend_t ggml_backend_blas_init(void); +GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void); -GGML_API bool ggml_backend_is_blas(ggml_backend_t backend); 
+GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend); // number of threads used for conversion to float // for openblas and blis, this will also set the number of threads used for blas operations -GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); +GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); -GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void); #ifdef __cplusplus diff --git a/ggml/include/ggml-cann.h b/ggml/include/ggml-cann.h index 528975493..b469e228d 100644 --- a/ggml/include/ggml-cann.h +++ b/ggml/include/ggml-cann.h @@ -34,7 +34,7 @@ extern "C" { */ #define GGML_CANN_MAX_DEVICES 16 -GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void); /** * @brief Initializes the CANN backend for a specified device. @@ -46,7 +46,7 @@ GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void); * @param device The index of the device to initialize. * @return A pointer to the initialized backend instance, or nullptr on failure. */ -GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device); +GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device); /** * @brief Checks if a given backend is a CANN backend. @@ -57,7 +57,7 @@ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device); * @param backend The backend instance to check. * @return True if the backend is a CANN backend, false otherwise. */ -GGML_API bool ggml_backend_is_cann(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend); /** * @brief Retrieves the CANN buffer type for a specified device. @@ -69,7 +69,7 @@ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend); * @return A pointer to the buffer type interface for the specified device, or * nullptr if the device index is out of range. 
*/ -GGML_API ggml_backend_buffer_type_t +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device); /** @@ -80,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device); * * @return The number of CANN devices available. */ -GGML_API int32_t ggml_backend_cann_get_device_count(void); +GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void); /** * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. * * @return A pointer to the host buffer type interface. */ -GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); /** * @brief Retrieves the description of a specific CANN device. @@ -99,7 +99,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); * @param description Pointer to a buffer where the description will be written. * @param description_size Size of the description buffer. */ -GGML_API void ggml_backend_cann_get_device_description( +GGML_BACKEND_API void ggml_backend_cann_get_device_description( int32_t device, char* description, size_t description_size); /** @@ -114,7 +114,7 @@ GGML_API void ggml_backend_cann_get_device_description( * @param total Pointer to a variable where the total memory size will be * stored. 
*/ -GGML_API void ggml_backend_cann_get_device_memory(int32_t device, +GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, size_t* total); diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index ac0b141a3..4da62cb2b 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -54,77 +54,77 @@ extern "C" { GGML_NUMA_STRATEGY_COUNT }; - GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems - GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node + GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems + GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node - GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); - GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); + GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); + GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); - GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); - GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); + GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); - GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); - GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); + GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); - GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int 
i2, int i3); - GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); + GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); - GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); - GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); + GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); + GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); - GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); - GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); + GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); - GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); - GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); - GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); - GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); - GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); - GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); - GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_BACKEND_API void 
ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); + GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan( + GGML_BACKEND_API struct ggml_cplan ggml_graph_plan( const struct ggml_cgraph * cgraph, int n_threads, /* = GGML_DEFAULT_N_THREADS */ struct ggml_threadpool * threadpool /* = NULL */ ); - GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data - GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); // // system info // // x86 - GGML_API int ggml_cpu_has_sse3 (void); - GGML_API int ggml_cpu_has_ssse3 (void); - GGML_API int ggml_cpu_has_avx (void); - GGML_API int ggml_cpu_has_avx2 (void); - GGML_API int ggml_cpu_has_f16c (void); - GGML_API int ggml_cpu_has_fma 
(void); - GGML_API int ggml_cpu_has_avx_vnni (void); - GGML_API int ggml_cpu_has_avx512 (void); - GGML_API int ggml_cpu_has_avx512_vbmi(void); - GGML_API int ggml_cpu_has_avx512_vnni(void); - GGML_API int ggml_cpu_has_avx512_bf16(void); - GGML_API int ggml_cpu_has_amx_int8 (void); + GGML_BACKEND_API int ggml_cpu_has_sse3 (void); + GGML_BACKEND_API int ggml_cpu_has_ssse3 (void); + GGML_BACKEND_API int ggml_cpu_has_avx (void); + GGML_BACKEND_API int ggml_cpu_has_avx2 (void); + GGML_BACKEND_API int ggml_cpu_has_f16c (void); + GGML_BACKEND_API int ggml_cpu_has_fma (void); + GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); + GGML_BACKEND_API int ggml_cpu_has_avx512 (void); + GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void); + GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void); + GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void); + GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void); // ARM - GGML_API int ggml_cpu_has_neon (void); - GGML_API int ggml_cpu_has_arm_fma (void); - GGML_API int ggml_cpu_has_fp16_va (void); - GGML_API int ggml_cpu_has_matmul_int8(void); - GGML_API int ggml_cpu_has_sve (void); - GGML_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes + GGML_BACKEND_API int ggml_cpu_has_neon (void); + GGML_BACKEND_API int ggml_cpu_has_arm_fma (void); + GGML_BACKEND_API int ggml_cpu_has_fp16_va (void); + GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void); + GGML_BACKEND_API int ggml_cpu_has_sve (void); + GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes // other - GGML_API int ggml_cpu_has_riscv_v (void); - GGML_API int ggml_cpu_has_vsx (void); - GGML_API int ggml_cpu_has_wasm_simd (void); - GGML_API int ggml_cpu_has_llamafile (void); + GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); + GGML_BACKEND_API int ggml_cpu_has_vsx (void); + GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); + GGML_BACKEND_API int ggml_cpu_has_llamafile (void); // Internal types and functions exposed for tests and benchmarks @@ 
-148,25 +148,25 @@ extern "C" { ggml_gemm_t gemm; }; - GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type); + GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type); - GGML_API void ggml_cpu_init(void); + GGML_BACKEND_API void ggml_cpu_init(void); // // CPU backend // - GGML_API ggml_backend_t ggml_backend_cpu_init(void); + GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void); - GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend); - GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); - GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); - GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); + GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); + GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); - GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); #ifdef GGML_USE_CPU_HBM - GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); + GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void); #endif #ifdef __cplusplus diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index 305d0b636..8a832aace 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -20,27 +20,27 @@ extern "C" { #define GGML_CUDA_MAX_DEVICES 16 // backend API -GGML_API ggml_backend_t ggml_backend_cuda_init(int device); +GGML_BACKEND_API ggml_backend_t 
ggml_backend_cuda_init(int device); -GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend); // device buffer -GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); -GGML_API int ggml_backend_cuda_get_device_count(void); -GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); -GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); +GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void); +GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); -GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); -GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer); +GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); +GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer); -GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void); #ifdef __cplusplus } diff --git a/ggml/include/ggml-kompute.h 
b/ggml/include/ggml-kompute.h index c0c43521b..154aa56a7 100644 --- a/ggml/include/ggml-kompute.h +++ b/ggml/include/ggml-kompute.h @@ -37,13 +37,13 @@ struct ggml_vk_device ggml_vk_current_device(void); // forward declaration typedef struct ggml_backend * ggml_backend_t; -GGML_API ggml_backend_t ggml_backend_kompute_init(int device); +GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device); -GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend); -GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); -GGML_API ggml_backend_reg_t ggml_backend_kompute_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void); #ifdef __cplusplus } diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index b8d3f678b..669c1f84a 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -39,27 +39,27 @@ extern "C" { // user-code should use only these functions // -GGML_API ggml_backend_t ggml_backend_metal_init(void); +GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void); -GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend); GGML_DEPRECATED( - GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size), + GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size), "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713"); -GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); +GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); -GGML_API 
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family // ideally, the user code should be doing these checks // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf -GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family); +GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family); // capture all command buffers committed the next time `ggml_backend_graph_compute` is called -GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend); +GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend); -GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void); #ifdef __cplusplus } diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h index d57967368..ade6c3b0e 100644 --- a/ggml/include/ggml-rpc.h +++ b/ggml/include/ggml-rpc.h @@ -10,18 +10,18 @@ extern "C" { #define GGML_RPC_MAX_SERVERS 16 // backend API -GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); -GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend); +GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); +GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend); -GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); -GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); +GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); -GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); +GGML_BACKEND_API void 
ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); -GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void); -GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint); +GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint); #ifdef __cplusplus } diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h index af521f599..5ce349a88 100644 --- a/ggml/include/ggml-sycl.h +++ b/ggml/include/ggml-sycl.h @@ -17,32 +17,32 @@ extern "C" { #endif // backend API -GGML_API ggml_backend_t ggml_backend_sycl_init(int device); +GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device); -GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend); // devide buffer -GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); // split tensor buffer that splits matrices by rows across multiple devices -GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); -GGML_API void ggml_backend_sycl_print_sycl_devices(void); -GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len); -GGML_API void ggml_backend_sycl_get_device_description(int device, +GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void); +GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len); +GGML_BACKEND_API 
void ggml_backend_sycl_get_device_description(int device, char *description, size_t description_size); -GGML_API int ggml_backend_sycl_get_device_count(); -GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); +GGML_BACKEND_API int ggml_backend_sycl_get_device_count(); +GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); // SYCL doesn't support registering host memory, keep here for reference -// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); -// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); +// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); +// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); -GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void); #ifdef __cplusplus } diff --git a/ggml/include/ggml-vulkan.h b/ggml/include/ggml-vulkan.h index c03bbfe5e..53cdba072 100644 --- a/ggml/include/ggml-vulkan.h +++ b/ggml/include/ggml-vulkan.h @@ -10,21 +10,21 @@ extern "C" { #define GGML_VK_NAME "Vulkan" #define GGML_VK_MAX_DEVICES 16 -GGML_API void ggml_vk_instance_init(void); +GGML_BACKEND_API void ggml_vk_instance_init(void); // backend API -GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); +GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); -GGML_API bool ggml_backend_is_vk(ggml_backend_t backend); -GGML_API int ggml_backend_vk_get_device_count(void); -GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); -GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); +GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend); +GGML_BACKEND_API int ggml_backend_vk_get_device_count(void); +GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char 
* description, size_t description_size); +GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); -GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU -GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); -GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void); #ifdef __cplusplus } diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 7d0ec0af5..3b3f6798a 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1490,7 +1490,7 @@ extern "C" { "use ggml_rope_ext_inplace instead"); // compute correction dims for YaRN RoPE scaling - void ggml_rope_yarn_corr_dims( + GGML_API void ggml_rope_yarn_corr_dims( int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]); // rotary position embedding backward, i.e compute dx from dy diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 04da1c238..fc66be07e 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -755,25 +755,23 @@ add_subdirectory(ggml-cpu) target_link_libraries(ggml PUBLIC ggml-base ggml-cpu) -if (GGML_CUDA) - add_subdirectory(ggml-cuda) - target_link_libraries(ggml PUBLIC ggml-cuda) -endif() +function(ggml_add_backend backend) + string(TOUPPER "GGML_${backend}" backend_id) + string(TOLOWER "ggml-${backend}" backend_target) + if (${backend_id}) + add_subdirectory(${backend_target}) + if (${BUILD_SHARED_LIBS}) + target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD) + target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED) + endif() + target_link_libraries(ggml PUBLIC ${backend_target}) 
+ endif() +endfunction() -if (GGML_METAL) - add_subdirectory(ggml-metal) - target_link_libraries(ggml PUBLIC ggml-metal) -endif() - -if (GGML_BLAS) - add_subdirectory(ggml-blas) - target_link_libraries(ggml PUBLIC ggml-blas) -endif() - -if (GGML_RPC) - add_subdirectory(ggml-rpc) - target_link_libraries(ggml PUBLIC ggml-rpc) -endif() +ggml_add_backend(CUDA) +ggml_add_backend(METAL) +ggml_add_backend(BLAS) +ggml_add_backend(RPC) if (EMSCRIPTEN) set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128") diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index cd5d7e953..bf2bbdef6 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -9164,12 +9164,6 @@ static void rope_yarn( *sin_theta = sinf(theta) * mscale; } -// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get -// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { - return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); -} - static void ggml_rope_cache_init( float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale, float * cache, float sin_sign, float theta_scale) { @@ -9186,16 +9180,6 @@ static void ggml_rope_cache_init( } } -void ggml_rope_yarn_corr_dims( - int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2] -) { - // start and end correction dims - float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); - float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); - dims[0] = MAX(0, start); - dims[1] = MIN(n_dims - 1, end); -} - static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst, diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 
b82eab664..d09173e11 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -11,87 +11,89 @@ extern "C" { #endif +// NOTE: these functions are defined as GGML_API because they are used by the CPU backend + // Quantization -void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k); -void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k); -void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k); -void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k); -void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k); -void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k); -void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k); -void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k); -void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k); -void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k); -void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t 
k); -void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k); -void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k); -void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k); -void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k); -void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k); -void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k); -void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k); -void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_iq4_xs_ref (const 
float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); // Dequantization -void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q6_K(const block_q6_K * 
GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq4_xs (const block_iq4_xs * 
GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); // Quantization utilizing an importance matrix (a.k.a. 
"Activation aWare Quantization") -size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT 
dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_0(const float * GGML_RESTRICT src, void * 
GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t 
quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -void iq2xs_init_impl(enum ggml_type type); -void iq2xs_free_impl(enum ggml_type type); -void iq3xs_init_impl(int grid_size); -void iq3xs_free_impl(int grid_size); +GGML_API void iq2xs_init_impl(enum ggml_type type); +GGML_API void iq2xs_free_impl(enum ggml_type type); +GGML_API void iq3xs_init_impl(int grid_size); +GGML_API void iq3xs_free_impl(int grid_size); #ifdef __cplusplus } diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 8a772f224..47357daab 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -671,7 +671,7 @@ static ggml_backend_i ggml_backend_rpc_interface = { /* .event_wait = */ NULL, }; -GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) { +ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) { static std::mutex mutex; std::lock_guard lock(mutex); // NOTE: buffer types are allocated and never freed; this is by design @@ -718,7 +718,7 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) { return backend; } -GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) { +bool ggml_backend_is_rpc(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid()); } @@ -730,7 +730,7 @@ static void get_device_memory(const std::shared_ptr & sock, size_t * f *total = response.total_mem; } -GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) { +void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) { auto sock = get_socket(endpoint); if (sock == nullptr) { *free = 0; diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index d2ef04c29..11f0803c4 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3617,6 +3617,22 @@ struct ggml_tensor * ggml_rope_custom_inplace( ); } +// 
Solving `n_rot = max_pos_emb / (2pi * base^((2 * x) / n_dims))` for x, we get +// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +} + +void ggml_rope_yarn_corr_dims( + int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2] +) { + // start and end correction dims + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); + dims[0] = MAX(0, start); + dims[1] = MIN(n_dims - 1, end); +} + // ggml_rope_back struct ggml_tensor * ggml_rope_back(