ggml : fix arm build (#10890)

* ggml: GGML_NATIVE uses -mcpu=native on ARM

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* ggml: Show detected features with GGML_NATIVE

Signed-off-by: Adrien Gallouët <angt@huggingface.co>

* remove msvc support, add GGML_CPU_ARM_ARCH option

* disable llamafile in android example

* march -> mcpu, skip adding feature macros

ggml-ci

---------

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
Co-authored-by: Adrien Gallouët <angt@huggingface.co>
2024-12-25 19:04:35 +00:00 · 2024-12-18 23:21:42 +01:00 · 2024-12-18 23:21:42 +01:00 · 9177484f58
commit 9177484f58
parent 0bf2d10c55
5 changed files with 69 additions and 95 deletions
--- a/examples/llama.android/llama/build.gradle.kts
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -19,6 +19,7 @@ android {
        externalNativeBuild {
            cmake {
                arguments += "-DLLAMA_BUILD_COMMON=ON"
                arguments += "-DGGML_LLAMAFILE=OFF"
                arguments += "-DCMAKE_BUILD_TYPE=Release"
                cppFlags += listOf()
                arguments += listOf()
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -74,10 +74,10 @@ if (NOT GGML_CUDA_GRAPHS_DEFAULT)
 endif()
 # general
-option(GGML_STATIC "ggml: static link libraries"         OFF)
+option(GGML_STATIC "ggml: static link libraries"                     OFF)
-option(GGML_NATIVE "ggml: enable -march=native flag"     ${GGML_NATIVE_DEFAULT})
+option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
-option(GGML_LTO    "ggml: enable link time optimization" OFF)
+option(GGML_LTO    "ggml: enable link time optimization"             OFF)
-option(GGML_CCACHE "ggml: use ccache if available"       ON)
+option(GGML_CCACHE "ggml: use ccache if available"                   ON)
 # debug
 option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
@@ -120,8 +120,9 @@ endif()
 option(GGML_LASX             "ggml: enable lasx"             ON)
 option(GGML_LSX              "ggml: enable lsx"              ON)
 option(GGML_RVV              "ggml: enable rvv"              ON)
-option(GGML_SVE              "ggml: enable SVE"              OFF)
+
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
 if (WIN32)
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -74,112 +74,77 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
-        (NOT CMAKE_OSX_ARCHITECTURES      AND
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
        NOT CMAKE_GENERATOR_PLATFORM_LWR AND
            CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
        message(STATUS "ARM detected")
-        if (MSVC)
+        if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
-            list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead
+            message(FATAL_ERROR "MSVC is not supported for ARM, use clang")
            list(APPEND ARCH_DEFINITIONS __ARM_NEON)
            list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA)
            set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
            string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
            if (GGML_COMPILER_SUPPORT_DOTPROD)
                list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
                message(STATUS "ARM feature DOTPROD enabled")
            endif ()
            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
            if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
                list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
                message(STATUS "ARM feature MATMUL_INT8 enabled")
            endif ()
            check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
            if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
                list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
                message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled")
            endif ()
            set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
        elseif (APPLE)
            if (GGML_NATIVE)
                set(USER_PROVIDED_MARCH FALSE)
                foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
                    if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
                        set(USER_PROVIDED_MARCH TRUE)
                        break()
                    endif()
                endforeach()
                if (NOT USER_PROVIDED_MARCH)
                    set(MARCH_FLAGS "-march=armv8.2a")
                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
                    if (GGML_COMPILER_SUPPORT_DOTPROD)
                        set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
                        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)
                        message(STATUS "ARM feature DOTPROD enabled")
                    endif ()
                    set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")
                    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
                    set(CMAKE_REQUIRED_FLAGS     "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")
                    check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
                    if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
                        set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
                        list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)
                        message(STATUS "ARM feature MATMUL_INT8 enabled")
                    endif ()
                    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
                    list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
                endif ()
            endif ()
        else()
            check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
            if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
                list(APPEND ARCH_FLAGS -mfp16-format=ieee)
            endif()
-            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+
-                # Raspberry Pi 1, Zero
+            if (GGML_NATIVE)
-                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
+                list(APPEND ARCH_FLAGS -mcpu=native)
-            endif()
+
-            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+                set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-                if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+
-                    # Android armeabi-v7a
+                # -mcpu=native does not always enable all the features in some compilers,
-                    list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+                # so we check for them manually and enable them if available
-                else()
+
-                    # Raspberry Pi 2
+                include(CheckCXXSourceRuns)
-                    list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+
                set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}+dotprod")
                check_cxx_source_runs(
                    "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }"
                    GGML_COMPILER_SUPPORT_DOTPROD)
                if (GGML_COMPILER_SUPPORT_DOTPROD)
                    set(ARCH_FLAGS "${ARCH_FLAGS}+dotprod")
                endif()
                set(CMAKE_REQUIRED_FLAGS "${ARCH_FLAGS}+i8mm")
                check_cxx_source_runs(
                    "#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }"
                    GGML_COMPILER_SUPPORT_I8MM)
                if (GGML_COMPILER_SUPPORT_I8MM)
                    set(ARCH_FLAGS "${ARCH_FLAGS}+i8mm")
                endif()
                set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
            else()
                if (GGML_CPU_ARM_ARCH)
                    list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH})
                endif()
            endif()
-            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+
-                # Android arm64-v8a
+            # show enabled features
-                # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            execute_process(
-                list(APPEND ARCH_FLAGS -mno-unaligned-access)
+                COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
-            endif()
+                INPUT_FILE "/dev/null"
-            if (GGML_SVE)
+                OUTPUT_VARIABLE ARM_FEATURE
-                list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+                RESULT_VARIABLE ARM_FEATURE_RESULT
            )
            if (ARM_FEATURE_RESULT)
                message(FATAL_ERROR "Failed to get ARM features")
            else()
                foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
                    string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
                    if (NOT ${feature_pos} EQUAL -1)
                        message(STATUS "ARM feature ${feature} enabled")
                    endif()
                endforeach()
            endif()
        endif()
    elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
        message(STATUS "x86 detected")
        if (MSVC)
            # instruction set detection for MSVC only
            if (GGML_NATIVE)
--- a/ggml/src/ggml-cpu/ggml-cpu.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -522,6 +522,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
        if (ggml_cpu_has_sve()) {
            features.push_back({ "SVE", "1" });
        }
        if (ggml_cpu_has_dotprod()) {
            features.push_back({ "DOTPROD", "1" });
        }
        if (ggml_cpu_has_matmul_int8()) {
            features.push_back({ "MATMUL_INT8", "1" });
        }
        if (ggml_cpu_get_sve_cnt() > 0) {
            static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
            features.push_back({ "SVE_CNT", sve_cnt.c_str() });
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -204,6 +204,7 @@ template <> inline float32x4_t load(const float *p) {
    return vld1q_f32(p);
 }
 #if !defined(_MSC_VER)
 // FIXME: this should check for __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <> inline float16x8_t load(const ggml_fp16_t *p) {
    return vld1q_f16((const float16_t *)p);
 }