From ab26fb9005c796f2e5dd53ee6b6f67a2c6ff73cb Mon Sep 17 00:00:00 2001
From: slaren <slarengh@gmail.com>
Date: Tue, 12 Nov 2024 02:32:22 +0100
Subject: [PATCH] build fixes

---
 ggml/include/ggml.h                 |   8 +-
 ggml/src/CMakeLists.txt             |  52 +----
 ggml/src/ggml-backend-reg.cpp       |   6 +-
 ggml/src/ggml-cpu/ggml-cpu-quants.c |  23 ---
 ggml/src/ggml-cpu/ggml-cpu-quants.h |   3 -
 ggml/src/ggml-cpu/ggml-cpu.c        |   7 +-
 ggml/src/ggml-cpu/ggml-cpu.cpp      |   8 +
 ggml/src/ggml-impl.h                | 301 ++++++++++++++--------------
 src/CMakeLists.txt                  |   3 +-
 9 files changed, 175 insertions(+), 236 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 3bf90659e..7d0ec0af5 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -176,15 +176,15 @@
 #ifdef GGML_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef GGML_BUILD
-#            define GGML_API __declspec(dllexport)
+#            define GGML_API __declspec(dllexport) extern
 #        else
-#            define GGML_API __declspec(dllimport)
+#            define GGML_API __declspec(dllimport) extern
 #        endif
 #    else
-#        define GGML_API __attribute__ ((visibility ("default")))
+#        define GGML_API __attribute__ ((visibility ("default"))) extern
 #    endif
 #else
-#    define GGML_API
+#    define GGML_API extern // 'extern' in every variant keeps `GGML_API <type> var[...];` a declaration (used for ggml_table_f32_f16 in ggml-impl.h)
 #endif
 
 // TODO: support for clang
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 9571e153e..04da1c238 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -29,37 +29,6 @@ endif()
 unset(GGML_EXTRA_LIBS_PRIVATE)
 unset(GGML_EXTRA_LIBS_PUBLIC)
 
-# musa, hip: add directory with a CMakeLists.txt file, but no source files (use refer to ggml-cuda files as ../ggml-cuda)
-if (GGML_MUSA)
-    list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
-    find_package(MUSAToolkit)
-    set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
-else()
-    find_package(CUDAToolkit)
-endif()
-
-# if (GGML_MUSA)
-# set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
-# else()
-# if (GGML_MUSA)
-# set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
-# foreach(SOURCE ${GGML_SOURCES_CUDA})
-#     set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
-# endforeach()
-# endif()
-
-
-if (GGML_MUSA)
-    set(CMAKE_C_COMPILER clang)
-    set(CMAKE_C_EXTENSIONS OFF)
-    set(CMAKE_CXX_COMPILER clang++)
-    set(CMAKE_CXX_EXTENSIONS OFF)
-
-    set(GGML_CUDA ON)
-
-    list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
-endif()
-
 if (GGML_AMX)
     if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
     else()
@@ -597,12 +566,6 @@ function(get_flags CCID CCVER)
     elseif (CCID STREQUAL "GNU")
         set(C_FLAGS   -Wdouble-promotion)
         set(CXX_FLAGS -Wno-array-bounds)
-
-        if (NOT GGML_MUSA)
-            if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-                list(APPEND CXX_FLAGS -Wno-format-truncation)
-            endif()
-        endif()
         if (CCVER VERSION_GREATER_EQUAL 8.1.0)
             list(APPEND CXX_FLAGS -Wextra-semi)
         endif()
@@ -768,7 +731,7 @@ endif()
 
 # ggml
 
-add_library(ggml-base STATIC
+add_library(ggml-base
             ../include/ggml.h
             ../include/ggml-alloc
             ../include/ggml-alloc.h
@@ -779,18 +742,16 @@ add_library(ggml-base STATIC
             ggml-backend.cpp
             ggml-threading.cpp
             ggml-threading.h
-            ggml-quants.c # for quantize functions TODO: move dot fns to a separate file
+            ggml-quants.c
             ggml-quants.h
             ggml-aarch64.c
             ggml-aarch64.h
-
-)
-
-add_subdirectory(ggml-cpu)
+            )
 
 add_library(ggml
             ggml-backend-reg.cpp
-)
+            )
+add_subdirectory(ggml-cpu)
 
 target_link_libraries(ggml PUBLIC ggml-base ggml-cpu)
 
@@ -850,5 +811,6 @@ target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTR
 if (BUILD_SHARED_LIBS)
     set_target_properties(ggml-base PROPERTIES POSITION_INDEPENDENT_CODE ON)
     set_target_properties(ggml      PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
+    target_compile_definitions(ggml-base PRIVATE GGML_BUILD PUBLIC GGML_SHARED) # PUBLIC part is inherited by ggml through target_link_libraries
+    target_compile_definitions(ggml      PRIVATE GGML_BUILD) # ggml is its own library, so it needs the export side of GGML_API too
 endif()
diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp
index 8c56b3480..6bd537d96 100644
--- a/ggml/src/ggml-backend-reg.cpp
+++ b/ggml/src/ggml-backend-reg.cpp
@@ -1,5 +1,7 @@
-#include "ggml-backend.h"
 #include "ggml-backend-impl.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
 #include <cstring>
 #include <vector>
 
@@ -45,8 +47,6 @@
 #include "ggml-kompute.h"
 #endif
 
-#include "ggml-cpu.h"
-
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_t> backends;
     std::vector<ggml_backend_dev_t> devices;
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
index 1a3f6fb56..7fa2897c2 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -10783,22 +10783,6 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 #endif
 }
 
-//
-// ============================================= 3-bit using D4 lattice
-//
-
-void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_iq3_xxs * restrict y = vy;
-    quantize_row_iq3_xxs_ref(x, y, k);
-}
-
-void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(k % QK_K == 0);
-    block_iq3_s * restrict y = vy;
-    quantize_row_iq3_s_ref(x, y, k);
-}
-
 // ============================ 4-bit non-linear quants
 
 void quantize_row_iq4_nl(const float * restrict x, void * restrict y, int64_t k) {
@@ -10810,10 +10794,3 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict y, int64_t k)
     assert(k % QK_K == 0);
     quantize_iq4_xs(x, y, 1, k, NULL);
 }
-
-// =============================== 2.5625 bpw
-
-void quantize_row_iq2_s(const float * restrict x, void * restrict y, int64_t k) {
-    assert(k % QK_K == 0);
-    quantize_iq2_s(x, y, 1, k, NULL);
-}
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.h b/ggml/src/ggml-cpu/ggml-cpu-quants.h
index 8bbe08f47..e33d9d473 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.h
@@ -29,11 +29,8 @@ void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
 void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
-void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
 // Dot product
 void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index 1a82b0124..cd5d7e953 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -356,19 +356,20 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ3_XXS] = {
-        .from_float               = quantize_row_iq3_xxs,
+        // NOTE: from_float for iq3_xxs, iq3_s and iq2_s was removed because these quants require initialization in ggml_quantize_init
+        //.from_float               = quantize_row_iq3_xxs,
         .vec_dot                  = ggml_vec_dot_iq3_xxs_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ3_S] = {
-        .from_float               = quantize_row_iq3_s,
+        //.from_float               = quantize_row_iq3_s,
         .vec_dot                  = ggml_vec_dot_iq3_s_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
     },
     [GGML_TYPE_IQ2_S] = {
-        .from_float               = quantize_row_iq2_s,
+        //.from_float               = quantize_row_iq2_s,
         .vec_dot                  = ggml_vec_dot_iq2_s_q8_K,
         .vec_dot_type             = GGML_TYPE_Q8_K,
         .nrows                    = 1,
diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp
index cc77fe174..c7216117b 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -11,6 +11,14 @@
 #include <sys/sysctl.h>
 #endif
 
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN // ask <windows.h> to leave out its rarely used sub-headers
+#ifndef NOMINMAX // guard: skip the define when NOMINMAX is already set
+    #define NOMINMAX // keep <windows.h> from defining min()/max() macros
+#endif
+#include <windows.h>
+#endif // defined(_WIN32)
+
 // ggml-backend interface
 
 #ifdef GGML_USE_CPU_HBM
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h
index 5357022f2..a040e4b56 100644
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -3,7 +3,6 @@
 // GGML internal header
 
 #include "ggml.h"
-
 #include <assert.h>
 #include <math.h>
 #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
@@ -11,6 +10,18 @@
 #include <stdint.h>
 #include <string.h>
 
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif // __ARM_FEATURE_SVE
+
+#if defined(__ARM_NEON)
+// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
+//
+//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
+//
+#include <arm_neon.h>
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -29,13 +40,13 @@ extern "C" {
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
 #ifndef __cplusplus
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
+    #ifndef static_assert
+        #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
+            #define static_assert(cond, msg) _Static_assert(cond, msg)
+        #else
+            #define static_assert(cond, msg) struct global_scope_noop_trick
+        #endif
+    #endif
 #endif
 
 static inline int ggml_up32(int n) {
@@ -121,14 +132,12 @@ struct ggml_map_custom1_op_params {
     void             * userdata;
 };
 
-
 struct ggml_map_custom2_op_params {
     ggml_custom2_op_t   fun;
     int                 n_tasks;
     void              * userdata;
 };
 
-
 struct ggml_map_custom3_op_params {
     ggml_custom3_op_t fun;
     int n_tasks;
@@ -291,171 +300,155 @@ void ggml_aligned_free(void * ptr, size_t size);
 // FP16 to FP32 conversion
 
 #if defined(__ARM_NEON)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#ifdef _MSC_VER
-typedef uint16_t ggml_fp16_internal_t;
-#else
-typedef __fp16 ggml_fp16_internal_t;
-#endif
+    #ifdef _MSC_VER
+        typedef uint16_t ggml_fp16_internal_t;
+    #else
+        typedef __fp16 ggml_fp16_internal_t;
+    #endif
 #endif
 
 #if defined(__ARM_NEON) && !defined(_MSC_VER)
+    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+    #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+        ggml_fp16_internal_t tmp;
+        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
+        return (float)tmp;
+    }
 
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    ggml_fp16_internal_t tmp;
-    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
-    return (float)tmp;
-}
+    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+        ggml_fp16_t res;
+        ggml_fp16_internal_t tmp = f;
+        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
+        return res;
+    }
 
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    ggml_fp16_t res;
-    ggml_fp16_internal_t tmp = f;
-    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
-    return res;
-}
+#elif defined(__F16C__)
 
-#else
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
+    #ifdef _MSC_VER
+        #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
+        #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
+    #else
+        #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
+        #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
+    #endif
 
 #elif defined(__POWER9_VECTOR__)
 
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+    /* the inline asm below is about 12% faster than the lookup method */
+    #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+    #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
+    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+        register float f;
+        register double d;
+        __asm__(
+            "mtfprd %0,%2\n"
+            "xscvhpdp %0,%0\n"
+            "frsp %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=f"(f):
+            /* in */   "r"(h));
+        return f;
     }
 
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
+    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+        register double d;
+        register ggml_fp16_t r;
+        __asm__( /* xscvdphp can work on double or single precision */
+            "xscvdphp %0,%2\n"
+            "mffprd %1,%0\n" :
+            /* temp */ "=d"(d),
+            /* out */  "=r"(r):
+            /* in */   "f"(f));
+        return r;
+    }
 
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+#else
 
-#endif // __F16C__
+    // FP16 <-> FP32
+    // ref: https://github.com/Maratyszcza/FP16
+
+    static inline float fp32_from_bits(uint32_t w) {
+        union {
+            uint32_t as_bits;
+            float as_value;
+        } fp32;
+        fp32.as_bits = w;
+        return fp32.as_value;
+    }
+
+    static inline uint32_t fp32_to_bits(float f) {
+        union {
+            float as_value;
+            uint32_t as_bits;
+        } fp32;
+        fp32.as_value = f;
+        return fp32.as_bits;
+    }
+
+    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+        const uint32_t w = (uint32_t) h << 16;
+        const uint32_t sign = w & UINT32_C(0x80000000);
+        const uint32_t two_w = w + w;
+
+        const uint32_t exp_offset = UINT32_C(0xE0) << 23;
+    #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
+        const float exp_scale = 0x1.0p-112f;
+    #else
+        const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
+    #endif
+        const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
+
+        const uint32_t magic_mask = UINT32_C(126) << 23;
+        const float magic_bias = 0.5f;
+        const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
+
+        const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
+        const uint32_t result = sign |
+            (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
+        return fp32_from_bits(result);
+    }
+
+    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+    #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
+        const float scale_to_inf = 0x1.0p+112f;
+        const float scale_to_zero = 0x1.0p-110f;
+    #else
+        const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
+        const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
+    #endif
+        float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
+
+        const uint32_t w = fp32_to_bits(f);
+        const uint32_t shl1_w = w + w;
+        const uint32_t sign = w & UINT32_C(0x80000000);
+        uint32_t bias = shl1_w & UINT32_C(0xFF000000);
+        if (bias < UINT32_C(0x71000000)) {
+            bias = UINT32_C(0x71000000);
+        }
+
+        base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
+        const uint32_t bits = fp32_to_bits(base);
+        const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
+        const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
+        const uint32_t nonsign = exp_bits + mantissa_bits;
+        return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
+    }
+
+    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 
 #endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
 
-#ifdef __ARM_FEATURE_SVE
-#include <arm_sve.h>
-#endif // __ARM_FEATURE_SVE
-
 // precomputed f32 table for f16 (256 KB)
 // defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
+GGML_API float ggml_table_f32_f16[1 << 16];
 
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 46a6ad562..a86624750 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -29,5 +29,6 @@ target_link_libraries(llama PUBLIC ggml)
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    target_compile_definitions(llama PRIVATE LLAMA_BUILD) # PRIVATE: defined only while compiling llama's own sources
+    target_compile_definitions(llama PUBLIC  LLAMA_SHARED) # PUBLIC: also defined for every target that links llama
 endif()