Merge 904111af8c into d09770cae7

ggml-alloc : fix list of allocated tensors with GGML_ALLOCATOR_DEBUG (#9573)
ggml: Extend feature detection to include non aarch64 Arm arch
2024-09-22 21:16:20 +00:00 · 2024-09-21 21:38:20 +05:30 · 2024-09-21 14:24:23 +02:00 · 2024-09-20 11:40:50 +02:00 · 2024-09-17 10:49:19 +02:00
6 changed files with 110 additions and 34 deletions
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -2503,6 +2503,9 @@ extern "C" {
    GGML_API int ggml_cpu_has_cann       (void);
    GGML_API int ggml_cpu_has_llamafile  (void);
    // get the sve vector length in bytes
    GGML_API int ggml_cpu_get_sve_cnt(void);
    //
    // Internal types and functions exposed for tests and benchmarks
    //
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@ -547,8 +547,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@ -659,8 +659,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@ -777,7 +777,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);
 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
        const void * b_ptr = vx;
        const void * a_ptr = vy;
        float * res_ptr = s;
@ -843,12 +843,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
        return;
    }
    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                    "performance");
    }
    else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                    "quantization format for optimal performance");
    }
@ -998,8 +998,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@ -1519,8 +1519,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                    "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
    }
 #endif
@ -1981,7 +1981,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
    UNUSED(blocklen);
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
        const void * b_ptr = vx;
        const void * a_ptr = vy;
        float * res_ptr = s;
@ -2392,12 +2392,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
        return;
    }
    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                    "performance");
    }
    else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                    "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                    "quantization format for optimal performance");
    }
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@ -294,6 +294,12 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
    alloc->free_blocks[0].offset = 0;
    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
    alloc->max_size = 0;
 #ifdef GGML_ALLOCATOR_DEBUG
    for (int i = 0; i < 1024; i++) {
        alloc->allocated_tensors[i].tensor = NULL;
    }
 #endif
 }
 static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@ -4013,7 +4013,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    svfloat32_t sumv0 = svdup_n_f32(0.0f);
    svfloat32_t sumv1 = svdup_n_f32(0.0f);
-    const int vector_length = ggml_sve_cnt_b*8;
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
    // VLA Implementation using switch case
    switch (vector_length) {
@ -5597,7 +5597,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
    svfloat32_t sumv0 = svdup_n_f32(0.0f);
    svfloat32_t sumv1 = svdup_n_f32(0.0f);
-    const int vector_length = ggml_sve_cnt_b*8;
+    const int vector_length = ggml_cpu_get_sve_cnt()*8;
    //VLA Implementation for SVE
    switch (vector_length) {
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 #if defined(__ARM_FEATURE_SVE)
 extern int ggml_sve_cnt_b;
 #endif
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -39,9 +39,15 @@
 #include <unistd.h>
 #endif
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_ARCH)
-int ggml_sve_cnt_b = 0;
+struct ggml_arm_arch_features_type {
    int has_neon;
    int has_i8mm;
    int has_sve;
    int sve_cnt;
 } ggml_arm_arch_features = {-1, -1, -1, 0};
 #endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@ -3643,6 +3649,66 @@ static inline int ggml_up(int n, int m) {
 ////////////////////////////////////////////////////////////////////////////////
 #if defined(__ARM_ARCH)
 #if defined(__linux__) && defined(__aarch64__)
 #include <sys/auxv.h>
 #elif defined(__APPLE__)
 #include <sys/sysctl.h>
 #endif
 // Fill in ggml_arm_arch_features (has_neon, has_i8mm, has_sve, sve_cnt).
 //
 // - Linux/aarch64: queried at run time from the ELF auxiliary vector and prctl().
 // - Apple:         queried at run time through sysctlbyname(); SVE is reported as absent.
 // - anything else: no run-time detection, the compile-time feature macros are used instead.
 //
 // Takes no arguments and returns nothing; the only effect is writing the four
 // fields of the global ggml_arm_arch_features.
 static void ggml_init_arm_arch_features(void) {
 #if defined(__linux__) && defined(__aarch64__)
    // getauxval() returns unsigned long: keep the full width so that no
    // capability bit is silently dropped by a narrowing conversion
    const unsigned long hwcap  = getauxval(AT_HWCAP);
    const unsigned long hwcap2 = getauxval(AT_HWCAP2);
    ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
    ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
    ggml_arm_arch_features.has_sve  = !!(hwcap & HWCAP_SVE);
 #if defined(__ARM_FEATURE_SVE)
    // prctl(PR_SVE_GET_VL) returns -1 on failure (e.g. SVE not available on the
    // running system); masking -1 would produce a bogus non-zero vector length,
    // so report 0 in that case
    const int sve_vl = prctl(PR_SVE_GET_VL);
    ggml_arm_arch_features.sve_cnt = sve_vl < 0 ? 0 : (PR_SVE_VL_LEN_MASK & sve_vl);
 #endif
 #elif defined(__APPLE__)
    int oldp = 0;
    size_t size = sizeof(oldp);
    // a failed query is treated as "feature not present"
    if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
        oldp = 0;
    }
    ggml_arm_arch_features.has_neon = oldp;
    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
        oldp = 0;
    }
    ggml_arm_arch_features.has_i8mm = oldp;
    ggml_arm_arch_features.has_sve = 0;
    ggml_arm_arch_features.sve_cnt = 0;
 #else
 // Run-time CPU feature detection not implemented for this platform, fallback to compile time
 #if defined(__ARM_NEON)
    ggml_arm_arch_features.has_neon = 1;
 #else
    ggml_arm_arch_features.has_neon = 0;
 #endif
 #if defined(__ARM_FEATURE_MATMUL_INT8)
    ggml_arm_arch_features.has_i8mm = 1;
 #else
    ggml_arm_arch_features.has_i8mm = 0;
 #endif
 #if defined(__ARM_FEATURE_SVE)
    ggml_arm_arch_features.has_sve = 1;
    // NOTE(review): 16 bytes is presumably the architectural minimum SVE vector
    // length (128 bits); the real length cannot be queried here - confirm
    ggml_arm_arch_features.sve_cnt = 16;
 #else
    ggml_arm_arch_features.has_sve = 0;
    ggml_arm_arch_features.sve_cnt = 0;
 #endif
 #endif
 }
 #endif
 struct ggml_context * ggml_init(struct ggml_init_params params) {
    // make this function thread safe
    ggml_critical_section_start();
@ -3693,6 +3759,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
            GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
        }
 #if defined(__ARM_ARCH)
        ggml_init_arm_arch_features();
 #endif
        is_first_call = false;
    }
@ -3741,12 +3811,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
    GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 #if defined(__ARM_FEATURE_SVE)
    if (!ggml_sve_cnt_b) {
        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
    }
 #endif
    GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
    ggml_critical_section_end();
@ -23545,16 +23609,16 @@ int ggml_cpu_has_fma(void) {
 }
 int ggml_cpu_has_neon(void) {
-#if defined(__ARM_NEON)
+#if defined(__ARM_ARCH)
-    return 1;
+    return ggml_arm_arch_features.has_neon;
 #else
    return 0;
 #endif
 }
 int ggml_cpu_has_sve(void) {
-#if defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_ARCH)
-    return 1;
+    return ggml_arm_arch_features.has_sve;
 #else
    return 0;
 #endif
@ -23701,11 +23765,18 @@ int ggml_cpu_has_vsx(void) {
 }
 int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_FEATURE_MATMUL_INT8)
+#if defined(__ARM_ARCH)
-    return 1;
+    return ggml_arm_arch_features.has_i8mm;
 #else
    return 0;
 #endif
 }
 // Returns the SVE vector length, in bytes, recorded in ggml_arm_arch_features.
 // Builds that do not target an Arm architecture always report 0.
 int ggml_cpu_get_sve_cnt(void) {
    int sve_bytes = 0;
 #if defined(__ARM_ARCH)
    sve_bytes = ggml_arm_arch_features.sve_cnt;
 #endif
    return sve_bytes;
 }
 ////////////////////////////////////////////////////////////////////////////////
Author	SHA1	Message	Date
Dan Johansson	dc4476ac73	Merge `904111af8c` into `d09770cae7`	2024-09-21 21:38:20 +05:30
slaren	d09770cae7	ggml-alloc : fix list of allocated tensors with GGML_ALLOCATOR_DEBUG (#9573 ) Some checks are pending Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run Details Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run Details Nix CI / nix-eval (macos-latest) (push) Waiting to run Details Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run Details Nix CI / nix-build (macos-latest) (push) Waiting to run Details Nix CI / nix-build (ubuntu-latest) (push) Waiting to run Details flake8 Lint / Lint (push) Waiting to run Details	2024-09-21 14:24:23 +02:00
Dan Johansson	904111af8c	ggml: Extend feature detection to include non aarch64 Arm arch	2024-09-20 11:40:50 +02:00
Dan Johansson	aab436c58d	ggml: Added run-time detection of neon, i8mm and sve Adds run-time detection of the Arm instructions set features neon, i8mm and sve for Linux and Apple build targets.	2024-09-17 10:49:19 +02:00