mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-11-15 07:19:53 +00:00
This commit is contained in:
parent
ab26fb9005
commit
efdd713023
@ -9,16 +9,16 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// buffer_type API
|
// buffer_type API
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_amx_init(void);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
|
||||||
|
|
||||||
GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
|
GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,20 @@
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-alloc.h"
|
#include "ggml-alloc.h"
|
||||||
|
|
||||||
|
#ifdef GGML_BACKEND_SHARED
|
||||||
|
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||||
|
# ifdef GGML_BACKEND_BUILD
|
||||||
|
# define GGML_BACKEND_API __declspec(dllexport) extern
|
||||||
|
# else
|
||||||
|
# define GGML_BACKEND_API __declspec(dllimport) extern
|
||||||
|
# endif
|
||||||
|
# else
|
||||||
|
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
|
||||||
|
# endif
|
||||||
|
#else
|
||||||
|
# define GGML_BACKEND_API extern
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
@ -9,15 +9,15 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_blas_init(void);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
|
||||||
|
|
||||||
// number of threads used for conversion to float
|
// number of threads used for conversion to float
|
||||||
// for openblas and blis, this will also set the number of threads used for blas operations
|
// for openblas and blis, this will also set the number of threads used for blas operations
|
||||||
GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
@ -34,7 +34,7 @@ extern "C" {
|
|||||||
*/
|
*/
|
||||||
#define GGML_CANN_MAX_DEVICES 16
|
#define GGML_CANN_MAX_DEVICES 16
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Initializes the CANN backend for a specified device.
|
* @brief Initializes the CANN backend for a specified device.
|
||||||
@ -46,7 +46,7 @@ GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
|
|||||||
* @param device The index of the device to initialize.
|
* @param device The index of the device to initialize.
|
||||||
* @return A pointer to the initialized backend instance, or nullptr on failure.
|
* @return A pointer to the initialized backend instance, or nullptr on failure.
|
||||||
*/
|
*/
|
||||||
GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Checks if a given backend is a CANN backend.
|
* @brief Checks if a given backend is a CANN backend.
|
||||||
@ -57,7 +57,7 @@ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
|||||||
* @param backend The backend instance to check.
|
* @param backend The backend instance to check.
|
||||||
* @return True if the backend is a CANN backend, false otherwise.
|
* @return True if the backend is a CANN backend, false otherwise.
|
||||||
*/
|
*/
|
||||||
GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Retrieves the CANN buffer type for a specified device.
|
* @brief Retrieves the CANN buffer type for a specified device.
|
||||||
@ -69,7 +69,7 @@ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
|||||||
* @return A pointer to the buffer type interface for the specified device, or
|
* @return A pointer to the buffer type interface for the specified device, or
|
||||||
* nullptr if the device index is out of range.
|
* nullptr if the device index is out of range.
|
||||||
*/
|
*/
|
||||||
GGML_API ggml_backend_buffer_type_t
|
GGML_BACKEND_API ggml_backend_buffer_type_t
|
||||||
ggml_backend_cann_buffer_type(int32_t device);
|
ggml_backend_cann_buffer_type(int32_t device);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -80,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device);
|
|||||||
*
|
*
|
||||||
* @return The number of CANN devices available.
|
* @return The number of CANN devices available.
|
||||||
*/
|
*/
|
||||||
GGML_API int32_t ggml_backend_cann_get_device_count(void);
|
GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
|
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
|
||||||
*
|
*
|
||||||
* @return A pointer to the host buffer type interface.
|
* @return A pointer to the host buffer type interface.
|
||||||
*/
|
*/
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Retrieves the description of a specific CANN device.
|
* @brief Retrieves the description of a specific CANN device.
|
||||||
@ -99,7 +99,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
|||||||
* @param description Pointer to a buffer where the description will be written.
|
* @param description Pointer to a buffer where the description will be written.
|
||||||
* @param description_size Size of the description buffer.
|
* @param description_size Size of the description buffer.
|
||||||
*/
|
*/
|
||||||
GGML_API void ggml_backend_cann_get_device_description(
|
GGML_BACKEND_API void ggml_backend_cann_get_device_description(
|
||||||
int32_t device, char* description, size_t description_size);
|
int32_t device, char* description, size_t description_size);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -114,7 +114,7 @@ GGML_API void ggml_backend_cann_get_device_description(
|
|||||||
* @param total Pointer to a variable where the total memory size will be
|
* @param total Pointer to a variable where the total memory size will be
|
||||||
* stored.
|
* stored.
|
||||||
*/
|
*/
|
||||||
GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
|
GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
|
||||||
size_t* free,
|
size_t* free,
|
||||||
size_t* total);
|
size_t* total);
|
||||||
|
|
||||||
|
@ -54,77 +54,77 @@ extern "C" {
|
|||||||
GGML_NUMA_STRATEGY_COUNT
|
GGML_NUMA_STRATEGY_COUNT
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||||
|
|
||||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||||
|
|
||||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||||
|
|
||||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||||
|
|
||||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||||
|
|
||||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||||
|
|
||||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
|
||||||
const struct ggml_cgraph * cgraph,
|
const struct ggml_cgraph * cgraph,
|
||||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||||
|
|
||||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||||
|
|
||||||
//
|
//
|
||||||
// system info
|
// system info
|
||||||
//
|
//
|
||||||
|
|
||||||
// x86
|
// x86
|
||||||
GGML_API int ggml_cpu_has_sse3 (void);
|
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
|
||||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
|
||||||
GGML_API int ggml_cpu_has_avx (void);
|
GGML_BACKEND_API int ggml_cpu_has_avx (void);
|
||||||
GGML_API int ggml_cpu_has_avx2 (void);
|
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
|
||||||
GGML_API int ggml_cpu_has_f16c (void);
|
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
|
||||||
GGML_API int ggml_cpu_has_fma (void);
|
GGML_BACKEND_API int ggml_cpu_has_fma (void);
|
||||||
GGML_API int ggml_cpu_has_avx_vnni (void);
|
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
|
||||||
GGML_API int ggml_cpu_has_avx512 (void);
|
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
|
||||||
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
|
||||||
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
|
||||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
|
||||||
GGML_API int ggml_cpu_has_amx_int8 (void);
|
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
|
||||||
// ARM
|
// ARM
|
||||||
GGML_API int ggml_cpu_has_neon (void);
|
GGML_BACKEND_API int ggml_cpu_has_neon (void);
|
||||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
|
||||||
GGML_API int ggml_cpu_has_fp16_va (void);
|
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
|
||||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
|
||||||
GGML_API int ggml_cpu_has_sve (void);
|
GGML_BACKEND_API int ggml_cpu_has_sve (void);
|
||||||
GGML_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
|
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
|
||||||
// other
|
// other
|
||||||
GGML_API int ggml_cpu_has_riscv_v (void);
|
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
||||||
GGML_API int ggml_cpu_has_vsx (void);
|
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
||||||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
||||||
GGML_API int ggml_cpu_has_llamafile (void);
|
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
|
||||||
|
|
||||||
// Internal types and functions exposed for tests and benchmarks
|
// Internal types and functions exposed for tests and benchmarks
|
||||||
|
|
||||||
@ -148,25 +148,25 @@ extern "C" {
|
|||||||
ggml_gemm_t gemm;
|
ggml_gemm_t gemm;
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
||||||
|
|
||||||
GGML_API void ggml_cpu_init(void);
|
GGML_BACKEND_API void ggml_cpu_init(void);
|
||||||
|
|
||||||
//
|
//
|
||||||
// CPU backend
|
// CPU backend
|
||||||
//
|
//
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
#ifdef GGML_USE_CPU_HBM
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
@ -20,27 +20,27 @@ extern "C" {
|
|||||||
#define GGML_CUDA_MAX_DEVICES 16
|
#define GGML_CUDA_MAX_DEVICES 16
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||||
|
|
||||||
// device buffer
|
// device buffer
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||||
|
|
||||||
// split tensor buffer that splits matrices by rows across multiple devices
|
// split tensor buffer that splits matrices by rows across multiple devices
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
|
||||||
|
|
||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||||
|
|
||||||
GGML_API int ggml_backend_cuda_get_device_count(void);
|
GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void);
|
||||||
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||||
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
||||||
GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -37,13 +37,13 @@ struct ggml_vk_device ggml_vk_current_device(void);
|
|||||||
// forward declaration
|
// forward declaration
|
||||||
typedef struct ggml_backend * ggml_backend_t;
|
typedef struct ggml_backend * ggml_backend_t;
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -39,27 +39,27 @@ extern "C" {
|
|||||||
// user-code should use only these functions
|
// user-code should use only these functions
|
||||||
//
|
//
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_DEPRECATED(
|
GGML_DEPRECATED(
|
||||||
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
|
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
|
||||||
"obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
|
"obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
|
||||||
|
|
||||||
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||||
|
|
||||||
// helper to check if the device supports a specific family
|
// helper to check if the device supports a specific family
|
||||||
// ideally, the user code should be doing these checks
|
// ideally, the user code should be doing these checks
|
||||||
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||||
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
||||||
|
|
||||||
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
|
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
|
||||||
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -10,18 +10,18 @@ extern "C" {
|
|||||||
#define GGML_RPC_MAX_SERVERS 16
|
#define GGML_RPC_MAX_SERVERS 16
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
||||||
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
||||||
|
|
||||||
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||||
|
|
||||||
GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
|
||||||
|
|
||||||
GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
|
GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -17,32 +17,32 @@ extern "C" {
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
|
||||||
|
|
||||||
// device buffer
|
// device buffer
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
||||||
|
|
||||||
// split tensor buffer that splits matrices by rows across multiple devices
|
// split tensor buffer that splits matrices by rows across multiple devices
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
||||||
|
|
||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||||
|
|
||||||
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
|
GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
|
||||||
GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
|
GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
|
||||||
GGML_API void ggml_backend_sycl_get_device_description(int device,
|
GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
|
||||||
char *description,
|
char *description,
|
||||||
size_t description_size);
|
size_t description_size);
|
||||||
GGML_API int ggml_backend_sycl_get_device_count();
|
GGML_BACKEND_API int ggml_backend_sycl_get_device_count(void); // explicit (void): an empty () is not a prototype in C before C23
|
||||||
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
||||||
|
|
||||||
// SYCL doesn't support registering host memory, keep here for reference
|
// SYCL doesn't support registering host memory, keep here for reference
|
||||||
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
|
// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
|
||||||
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
|
// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -10,21 +10,21 @@ extern "C" {
|
|||||||
#define GGML_VK_NAME "Vulkan"
|
#define GGML_VK_NAME "Vulkan"
|
||||||
#define GGML_VK_MAX_DEVICES 16
|
#define GGML_VK_MAX_DEVICES 16
|
||||||
|
|
||||||
GGML_API void ggml_vk_instance_init(void);
|
GGML_BACKEND_API void ggml_vk_instance_init(void);
|
||||||
|
|
||||||
// backend API
|
// backend API
|
||||||
GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
|
GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
|
||||||
GGML_API int ggml_backend_vk_get_device_count(void);
|
GGML_BACKEND_API int ggml_backend_vk_get_device_count(void);
|
||||||
GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
|
GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
|
||||||
GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
|
GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
||||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||||
|
|
||||||
GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
|
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -1490,7 +1490,7 @@ extern "C" {
|
|||||||
"use ggml_rope_ext_inplace instead");
|
"use ggml_rope_ext_inplace instead");
|
||||||
|
|
||||||
// compute correction dims for YaRN RoPE scaling
|
// compute correction dims for YaRN RoPE scaling
|
||||||
void ggml_rope_yarn_corr_dims(
|
GGML_API void ggml_rope_yarn_corr_dims(
|
||||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
||||||
|
|
||||||
// rotary position embedding backward, i.e compute dx from dy
|
// rotary position embedding backward, i.e compute dx from dy
|
||||||
|
@ -755,25 +755,23 @@ add_subdirectory(ggml-cpu)
|
|||||||
|
|
||||||
target_link_libraries(ggml PUBLIC ggml-base ggml-cpu)
|
target_link_libraries(ggml PUBLIC ggml-base ggml-cpu)
|
||||||
|
|
||||||
if (GGML_CUDA)
|
function(ggml_add_backend backend)
|
||||||
add_subdirectory(ggml-cuda)
|
string(TOUPPER "GGML_${backend}" backend_id)
|
||||||
target_link_libraries(ggml PUBLIC ggml-cuda)
|
string(TOLOWER "ggml-${backend}" backend_target)
|
||||||
|
if (${backend_id})
|
||||||
|
add_subdirectory(${backend_target})
|
||||||
|
if (${BUILD_SHARED_LIBS})
|
||||||
|
target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD)
|
||||||
|
target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED)
|
||||||
endif()
|
endif()
|
||||||
|
target_link_libraries(ggml PUBLIC ${backend_target})
|
||||||
|
endif()
|
||||||
|
endfunction()
|
||||||
|
|
||||||
if (GGML_METAL)
|
ggml_add_backend(CUDA)
|
||||||
add_subdirectory(ggml-metal)
|
ggml_add_backend(METAL)
|
||||||
target_link_libraries(ggml PUBLIC ggml-metal)
|
ggml_add_backend(BLAS)
|
||||||
endif()
|
ggml_add_backend(RPC)
|
||||||
|
|
||||||
if (GGML_BLAS)
|
|
||||||
add_subdirectory(ggml-blas)
|
|
||||||
target_link_libraries(ggml PUBLIC ggml-blas)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (GGML_RPC)
|
|
||||||
add_subdirectory(ggml-rpc)
|
|
||||||
target_link_libraries(ggml PUBLIC ggml-rpc)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (EMSCRIPTEN)
|
if (EMSCRIPTEN)
|
||||||
set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128")
|
set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128")
|
||||||
|
@ -9164,12 +9164,6 @@ static void rope_yarn(
|
|||||||
*sin_theta = sinf(theta) * mscale;
|
*sin_theta = sinf(theta) * mscale;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
|
||||||
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
|
||||||
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
|
||||||
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_rope_cache_init(
|
static void ggml_rope_cache_init(
|
||||||
float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
||||||
float * cache, float sin_sign, float theta_scale) {
|
float * cache, float sin_sign, float theta_scale) {
|
||||||
@ -9186,16 +9180,6 @@ static void ggml_rope_cache_init(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_rope_yarn_corr_dims(
|
|
||||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
|
||||||
) {
|
|
||||||
// start and end correction dims
|
|
||||||
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
|
|
||||||
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
|
|
||||||
dims[0] = MAX(0, start);
|
|
||||||
dims[1] = MIN(n_dims - 1, end);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_compute_forward_rope_f32(
|
static void ggml_compute_forward_rope_f32(
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
struct ggml_tensor * dst,
|
struct ggml_tensor * dst,
|
||||||
|
@ -11,87 +11,89 @@
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// NOTE: these functions are defined as GGML_API because they are used by the CPU backend
|
||||||
|
|
||||||
// Quantization
|
// Quantization
|
||||||
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_tq1_0_ref(const float * GGML_RESTRICT x, block_tq1_0 * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_tq2_0_ref(const float * GGML_RESTRICT x, block_tq2_0 * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
||||||
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
// Dequantization
|
// Dequantization
|
||||||
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
//GGML_API void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_tq1_0(const block_tq1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_tq2_0(const block_tq2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
GGML_API void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
||||||
|
|
||||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||||
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_tq1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_tq2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
GGML_API size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||||
|
|
||||||
void iq2xs_init_impl(enum ggml_type type);
|
GGML_API void iq2xs_init_impl(enum ggml_type type);
|
||||||
void iq2xs_free_impl(enum ggml_type type);
|
GGML_API void iq2xs_free_impl(enum ggml_type type);
|
||||||
void iq3xs_init_impl(int grid_size);
|
GGML_API void iq3xs_init_impl(int grid_size);
|
||||||
void iq3xs_free_impl(int grid_size);
|
GGML_API void iq3xs_free_impl(int grid_size);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -671,7 +671,7 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
|||||||
/* .event_wait = */ NULL,
|
/* .event_wait = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||||
static std::mutex mutex;
|
static std::mutex mutex;
|
||||||
std::lock_guard<std::mutex> lock(mutex);
|
std::lock_guard<std::mutex> lock(mutex);
|
||||||
// NOTE: buffer types are allocated and never freed; this is by design
|
// NOTE: buffer types are allocated and never freed; this is by design
|
||||||
@ -718,7 +718,7 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
|||||||
return backend;
|
return backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
||||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
|
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -730,7 +730,7 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
|
|||||||
*total = response.total_mem;
|
*total = response.total_mem;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
||||||
auto sock = get_socket(endpoint);
|
auto sock = get_socket(endpoint);
|
||||||
if (sock == nullptr) {
|
if (sock == nullptr) {
|
||||||
*free = 0;
|
*free = 0;
|
||||||
|
@ -3617,6 +3617,22 @@ struct ggml_tensor * ggml_rope_custom_inplace(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||||
|
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
||||||
|
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
||||||
|
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_rope_yarn_corr_dims(
|
||||||
|
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
||||||
|
) {
|
||||||
|
// start and end correction dims
|
||||||
|
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
|
||||||
|
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
|
||||||
|
dims[0] = MAX(0, start);
|
||||||
|
dims[1] = MIN(n_dims - 1, end);
|
||||||
|
}
|
||||||
|
|
||||||
// ggml_rope_back
|
// ggml_rope_back
|
||||||
|
|
||||||
struct ggml_tensor * ggml_rope_back(
|
struct ggml_tensor * ggml_rope_back(
|
||||||
|
Loading…
Reference in New Issue
Block a user