mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
Improvements for Windows with Snapdragon X
This commit is contained in:
parent
5e116e8dd5
commit
bf21397ae5
@ -9,7 +9,8 @@ set( CMAKE_CXX_COMPILER clang++ )
|
|||||||
set( CMAKE_C_COMPILER_TARGET ${target} )
|
set( CMAKE_C_COMPILER_TARGET ${target} )
|
||||||
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
set( CMAKE_CXX_COMPILER_TARGET ${target} )
|
||||||
|
|
||||||
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
|
# march for Snapdragon X should be 8.7-a, but this currently breaks Q_4_0_4_4 acceleration, 8.5 works
|
||||||
|
set( arch_c_flags "-march=armv8.5-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
|
||||||
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
|
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
|
||||||
|
|
||||||
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
|
||||||
|
@ -16,7 +16,7 @@ In order to build llama.cpp you have four different options.
|
|||||||
make
|
make
|
||||||
```
|
```
|
||||||
|
|
||||||
- On Windows:
|
- On Windows (x86/x64 only, arm64 requires cmake):
|
||||||
|
|
||||||
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
|
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
|
||||||
2. Extract `w64devkit` on your pc.
|
2. Extract `w64devkit` on your pc.
|
||||||
@ -45,6 +45,13 @@ In order to build llama.cpp you have four different options.
|
|||||||
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
|
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
|
||||||
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
|
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
|
||||||
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
- For faster repeated compilation, install [ccache](https://ccache.dev/).
|
||||||
|
- For Windows:
|
||||||
|
- Install cmake e.g. via `winget install cmake`:
|
||||||
|
- As alternative to the w64devkit mentioned in "using make" above, install MSVC (e.g. via Visual Studio 2022 Community Edition).
|
||||||
|
- For Windows on ARM you need MSVC installed and _additonally_:
|
||||||
|
- Install [clang via LLVM for woa64](https://releases.llvm.org) to enable better ARM optimizations (clang needs the MSVC backend).
|
||||||
|
- For using clang, the first build step needs to be `cmake --preset arm64-windows-llvm-release` (instead of the `cmake -B ...` which defaults to MSVC).
|
||||||
|
- Note: Building for ARM can also just be done with MSVC (without installing clang or using the preset), but this e.g. does not support Q_4_0_4_4 acceleration, because the MSVC frontend cannot inline ARM assembly-code.
|
||||||
- For debug builds, there are two cases:
|
- For debug builds, there are two cases:
|
||||||
|
|
||||||
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
|
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
|
||||||
|
@ -392,7 +392,7 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
|
GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
|
||||||
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
|
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
|
||||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
#elif defined(__ARM_NEON) && defined(__aarch64__) && ! defined(_MSC_VER)
|
||||||
const void * b_ptr = vx;
|
const void * b_ptr = vx;
|
||||||
const void * a_ptr = vy;
|
const void * a_ptr = vy;
|
||||||
float * res_ptr = s;
|
float * res_ptr = s;
|
||||||
@ -501,7 +501,7 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||||||
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! defined(_MSC_VER)
|
||||||
const void * b_ptr = vx;
|
const void * b_ptr = vx;
|
||||||
const void * a_ptr = vy;
|
const void * a_ptr = vy;
|
||||||
float * res_ptr = s;
|
float * res_ptr = s;
|
||||||
@ -613,7 +613,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||||||
UNUSED(ncols_interleaved);
|
UNUSED(ncols_interleaved);
|
||||||
UNUSED(blocklen);
|
UNUSED(blocklen);
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_SVE)
|
#if defined(__ARM_FEATURE_SVE) && ! defined(_MSC_VER)
|
||||||
if (svcntw() == 8) {
|
if (svcntw() == 8) {
|
||||||
const void * b_ptr = vx;
|
const void * b_ptr = vx;
|
||||||
const void * a_ptr = vy;
|
const void * a_ptr = vy;
|
||||||
@ -753,7 +753,7 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||||
GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
|
GGML_ASSERT(!(ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) &&
|
||||||
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
|
"__ARM_NEON and __ARM_FEATURE_MATMUL_INT8 defined, use the Q4_0_4_8 quantization format for optimal performance");
|
||||||
#elif defined(__ARM_NEON) && defined(__aarch64__)
|
#elif defined(__ARM_NEON) && defined(__aarch64__) && ! defined(_MSC_VER)
|
||||||
const void * b_ptr = vx;
|
const void * b_ptr = vx;
|
||||||
const void * a_ptr = vy;
|
const void * a_ptr = vy;
|
||||||
float * res_ptr = s;
|
float * res_ptr = s;
|
||||||
@ -1271,7 +1271,7 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||||||
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
"__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) && ! defined(_MSC_VER)
|
||||||
const void * b_ptr = vx;
|
const void * b_ptr = vx;
|
||||||
const void * a_ptr = vy;
|
const void * a_ptr = vy;
|
||||||
float * res_ptr = s;
|
float * res_ptr = s;
|
||||||
@ -1727,7 +1727,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||||||
UNUSED(ncols_interleaved);
|
UNUSED(ncols_interleaved);
|
||||||
UNUSED(blocklen);
|
UNUSED(blocklen);
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! defined(_MSC_VER)
|
||||||
if (svcntw() == 8) {
|
if (svcntw() == 8) {
|
||||||
const void * b_ptr = vx;
|
const void * b_ptr = vx;
|
||||||
const void * a_ptr = vy;
|
const void * a_ptr = vy;
|
||||||
|
Loading…
Reference in New Issue
Block a user