From ab26fb9005c796f2e5dd53ee6b6f67a2c6ff73cb Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 12 Nov 2024 02:32:22 +0100 Subject: [PATCH] build fixes --- ggml/include/ggml.h | 8 +- ggml/src/CMakeLists.txt | 52 +---- ggml/src/ggml-backend-reg.cpp | 6 +- ggml/src/ggml-cpu/ggml-cpu-quants.c | 23 --- ggml/src/ggml-cpu/ggml-cpu-quants.h | 3 - ggml/src/ggml-cpu/ggml-cpu.c | 7 +- ggml/src/ggml-cpu/ggml-cpu.cpp | 8 + ggml/src/ggml-impl.h | 301 ++++++++++++++-------------- src/CMakeLists.txt | 3 +- 9 files changed, 175 insertions(+), 236 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 3bf90659e..7d0ec0af5 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -176,15 +176,15 @@ #ifdef GGML_SHARED # if defined(_WIN32) && !defined(__MINGW32__) # ifdef GGML_BUILD -# define GGML_API __declspec(dllexport) +# define GGML_API __declspec(dllexport) extern # else -# define GGML_API __declspec(dllimport) +# define GGML_API __declspec(dllimport) extern # endif # else -# define GGML_API __attribute__ ((visibility ("default"))) +# define GGML_API __attribute__ ((visibility ("default"))) extern # endif #else -# define GGML_API +# define GGML_API extern #endif // TODO: support for clang diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 9571e153e..04da1c238 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -29,37 +29,6 @@ endif() unset(GGML_EXTRA_LIBS_PRIVATE) unset(GGML_EXTRA_LIBS_PUBLIC) -# musa, hip: add directory with a CMakeLists.txt file, but no source files (use refer to ggml-cuda files as ../ggml-cuda) -if (GGML_MUSA) - list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/") - find_package(MUSAToolkit) - set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND}) -else() - find_package(CUDAToolkit) -endif() - -# if (GGML_MUSA) -# set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE}) -# else() -# if (GGML_MUSA) -# set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX) -# foreach(SOURCE ${GGML_SOURCES_CUDA}) -# set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22") -# endforeach() -# endif() - - -if (GGML_MUSA) - set(CMAKE_C_COMPILER clang) - set(CMAKE_C_EXTENSIONS OFF) - set(CMAKE_CXX_COMPILER clang++) - set(CMAKE_CXX_EXTENSIONS OFF) - - set(GGML_CUDA ON) - - list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA) -endif() - if (GGML_AMX) if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0) else() @@ -597,12 +566,6 @@ function(get_flags CCID CCVER) elseif (CCID STREQUAL "GNU") set(C_FLAGS -Wdouble-promotion) set(CXX_FLAGS -Wno-array-bounds) - - if (NOT GGML_MUSA) - if (CCVER VERSION_GREATER_EQUAL 7.1.0) - list(APPEND CXX_FLAGS -Wno-format-truncation) - endif() - endif() if (CCVER VERSION_GREATER_EQUAL 8.1.0) list(APPEND CXX_FLAGS -Wextra-semi) endif() @@ -768,7 +731,7 @@ endif() # ggml -add_library(ggml-base STATIC +add_library(ggml-base ../include/ggml.h ../include/ggml-alloc ../include/ggml-alloc.h @@ -779,18 +742,16 @@ add_library(ggml-base STATIC ggml-backend.cpp ggml-threading.cpp ggml-threading.h - ggml-quants.c # for quantize functions TODO: move dot fns to a separate file + ggml-quants.c ggml-quants.h ggml-aarch64.c ggml-aarch64.h - -) - -add_subdirectory(ggml-cpu) + ) add_library(ggml ggml-backend-reg.cpp -) + ) +add_subdirectory(ggml-cpu) target_link_libraries(ggml PUBLIC ggml-base ggml-cpu) @@ -850,5 +811,6 @@ target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTR if (BUILD_SHARED_LIBS) set_target_properties(ggml-base PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD) + target_compile_definitions(ggml-base PRIVATE GGML_BUILD) + target_compile_definitions(ggml-base PUBLIC GGML_SHARED) endif() diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 8c56b3480..6bd537d96 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -1,5 +1,7 @@ -#include "ggml-backend.h" #include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-cpu.h" +#include "ggml-impl.h" #include #include @@ -45,8 +47,6 @@ #include "ggml-kompute.h" #endif -#include "ggml-cpu.h" - struct ggml_backend_registry { std::vector backends; std::vector devices; diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c index 1a3f6fb56..7fa2897c2 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.c +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c @@ -10783,22 +10783,6 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void * #endif } -// -// ============================================= 3-bit using D4 lattice -// - -void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) { - assert(k % QK_K == 0); - block_iq3_xxs * restrict y = vy; - quantize_row_iq3_xxs_ref(x, y, k); -} - -void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) { - assert(k % QK_K == 0); - block_iq3_s * restrict y = vy; - quantize_row_iq3_s_ref(x, y, k); -} - // ============================ 4-bit non-linear quants void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) { @@ -10810,10 +10794,3 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k) assert(k % QK_K == 0); quantize_iq4_xs(x, y, 1, k, NULL); } - -// =============================== 2.5625 bpw - -void quantize_row_iq2_s(const float * restrict x, void * restrict y, int64_t k) { - assert(k % QK_K == 0); - quantize_iq2_s(x, y, 1, k, NULL); -} diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.h b/ggml/src/ggml-cpu/ggml-cpu-quants.h index 8bbe08f47..e33d9d473 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-quants.h +++ b/ggml/src/ggml-cpu/ggml-cpu-quants.h @@ -29,11 +29,8 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1a82b0124..cd5d7e953 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -356,19 +356,20 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .nrows = 1, }, [GGML_TYPE_IQ3_XXS] = { - .from_float = quantize_row_iq3_xxs, + // NOTE: from_float for iq3 and iq2_s was removed because these quants require initialization in ggml_quantize_init + //.from_float = quantize_row_iq3_xxs, .vec_dot = ggml_vec_dot_iq3_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, [GGML_TYPE_IQ3_S] = { - .from_float = quantize_row_iq3_s, + //.from_float = quantize_row_iq3_s, .vec_dot = ggml_vec_dot_iq3_s_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, [GGML_TYPE_IQ2_S] = { - .from_float = quantize_row_iq2_s, + //.from_float = quantize_row_iq2_s, .vec_dot = ggml_vec_dot_iq2_s_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index cc77fe174..c7216117b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -11,6 +11,14 @@ #include #endif +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX + #define NOMINMAX +#endif +#include +#endif + // ggml-backend interface #ifdef GGML_USE_CPU_HBM diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 5357022f2..a040e4b56 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -3,7 +3,6 @@ // GGML internal header #include "ggml.h" - #include #include #include // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/ @@ -11,6 +10,18 @@ #include #include +#ifdef __ARM_FEATURE_SVE +#include +#endif // __ARM_FEATURE_SVE + +#if defined(__ARM_NEON) +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include +#endif + #ifdef __cplusplus extern "C" { #endif @@ -29,13 +40,13 @@ extern "C" { // if C99 - static_assert is noop // ref: https://stackoverflow.com/a/53923785/4039976 #ifndef __cplusplus -#ifndef static_assert -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) -#define static_assert(cond, msg) _Static_assert(cond, msg) -#else -#define static_assert(cond, msg) struct global_scope_noop_trick -#endif -#endif + #ifndef static_assert + #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) + #define static_assert(cond, msg) _Static_assert(cond, msg) + #else + #define static_assert(cond, msg) struct global_scope_noop_trick + #endif + #endif #endif static inline int ggml_up32(int n) { @@ -121,14 +132,12 @@ struct ggml_map_custom1_op_params { void * userdata; }; - struct ggml_map_custom2_op_params { ggml_custom2_op_t fun; int n_tasks; void * userdata; }; - struct ggml_map_custom3_op_params { ggml_custom3_op_t fun; int n_tasks; @@ -291,171 +300,155 @@ void ggml_aligned_free(void * ptr, size_t size); // FP16 to FP32 conversion #if defined(__ARM_NEON) - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#ifdef _MSC_VER -typedef uint16_t ggml_fp16_internal_t; -#else -typedef __fp16 ggml_fp16_internal_t; -#endif + #ifdef _MSC_VER + typedef uint16_t ggml_fp16_internal_t; + #else + typedef __fp16 ggml_fp16_internal_t; + #endif #endif #if defined(__ARM_NEON) && !defined(_MSC_VER) + #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + ggml_fp16_internal_t tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; + } -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - ggml_fp16_internal_t tmp; - memcpy(&tmp, &h, sizeof(ggml_fp16_t)); - return (float)tmp; -} + static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + ggml_fp16_internal_t tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; + } -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - ggml_fp16_internal_t tmp = f; - memcpy(&res, &tmp, sizeof(ggml_fp16_t)); - return res; -} +#elif defined(__F16C__) -#else - -#ifdef __F16C__ - -#ifdef _MSC_VER -#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) -#else -#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) -#endif + #ifdef _MSC_VER + #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) + #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) + #else + #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) + #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) + #endif #elif defined(__POWER9_VECTOR__) -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -/* the inline asm below is about 12% faster than the lookup method */ -#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) + /* the inline asm below is about 12% faster than the lookup method */ + #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) + #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - register float f; - register double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - register double d; - register ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; -} - -#else - -// FP16 <-> FP32 -// ref: https://github.com/Maratyszcza/FP16 - -static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; -} - -static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; -} - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float exp_scale = 0x1.0p-112f; -#else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); -#endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; -#else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); -#endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); + static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; } - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); -} + static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + register double d; + register ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; + } -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +#else -#endif // __F16C__ + // FP16 <-> FP32 + // ref: https://github.com/Maratyszcza/FP16 + + static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; + } + + static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; + } + + static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; + #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float exp_scale = 0x1.0p-112f; + #else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); + #endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); + } + + static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; + #else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); + #endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); + } + + #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) + #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #endif // defined(__ARM_NEON) && (!defined(__MSC_VER) -#ifdef __ARM_FEATURE_SVE -#include -#endif // __ARM_FEATURE_SVE - // precomputed f32 table for f16 (256 KB) // defined in ggml.c, initialized in ggml_init() -extern float ggml_table_f32_f16[1 << 16]; +GGML_API float ggml_table_f32_f16[1 << 16]; // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 46a6ad562..a86624750 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -29,5 +29,6 @@ target_link_libraries(llama PUBLIC ggml) if (BUILD_SHARED_LIBS) set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) - target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD) + target_compile_definitions(llama PRIVATE LLAMA_BUILD) + target_compile_definitions(llama PUBLIC LLAMA_SHARED) endif()