diff --git a/.clang-tidy b/.clang-tidy index 952c0cca8..310c3d182 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -17,8 +17,10 @@ Checks: > -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, performance-*, portability-*, + -portability-simd-intrinsics, misc-*, -misc-const-correctness, -misc-non-private-member-variables-in-classes, -misc-no-recursion, + -misc-use-anonymous-namespace, FormatStyle: none diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 48953dafa..e2291bd34 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1121,6 +1121,11 @@ jobs: run: | & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + - name: Install ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ github.job }} + - name: Build id: cmake_build run: | diff --git a/Makefile b/Makefile index 25214ec05..83adcef28 100644 --- a/Makefile +++ b/Makefile @@ -254,8 +254,8 @@ endif # keep standard at C11 and C++11 MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++11 -fPIC -MK_NVCCFLAGS = -std=c++11 +MK_CXXFLAGS = -std=c++17 -fPIC +MK_NVCCFLAGS = -std=c++17 ifdef LLAMA_NO_CCACHE GGML_NO_CCACHE := 1 @@ -575,9 +575,12 @@ endif ifndef GGML_NO_AMX MK_CPPFLAGS += -DGGML_USE_AMX - OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o + OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o endif +# only necessary for the CPU backend files +MK_CPPFLAGS += -Iggml/src/ggml-cpu + ifdef GGML_RPC MK_CPPFLAGS += -DGGML_USE_RPC OBJ_GGML_EXT += ggml/src/ggml-rpc.o diff --git a/Package.swift b/Package.swift index d9e8a4e2d..1e75aa7e2 100644 --- a/Package.swift +++ b/Package.swift @@ -28,13 +28,16 @@ var cSettings: [CSetting] = [ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), .unsafeFlags(["-fno-objc-arc"]), .headerSearchPath("ggml/src"), + .headerSearchPath("ggml/src/ggml-cpu"), // NOTE: NEW_LAPACK will 
required iOS version 16.4+ // We should consider add this in the future when we drop support for iOS 14 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) // .define("ACCELERATE_NEW_LAPACK"), // .define("ACCELERATE_LAPACK_ILP64") + .define("GGML_USE_CPU"), ] + #if canImport(Darwin) sources.append("ggml/src/ggml-common.h") sources.append("ggml/src/ggml-metal/ggml-metal.m") @@ -44,7 +47,6 @@ cSettings.append( contentsOf: [ .define("GGML_USE_ACCELERATE"), .define("GGML_USE_METAL"), - .define("GGML_USE_CPU") ] ) #endif diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 223174884..89862fe11 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -88,5 +88,5 @@ if (LLAMA_CURL) endif () target_include_directories(${TARGET} PUBLIC .) -target_compile_features (${TARGET} PUBLIC cxx_std_11) +target_compile_features (${TARGET} PUBLIC cxx_std_17) target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) diff --git a/common/common.cpp b/common/common.cpp index 2b2f00098..6143516d2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) { std::u32string filename_utf32; try { +#if defined(__clang__) + // disable C++17 deprecation warning for std::codecvt_utf8 +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter; + +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + filename_utf32 = converter.from_bytes(filename); // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used, diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt index 959acaeee..68ad707f3 100644 --- a/examples/batched-bench/CMakeLists.txt +++ b/examples/batched-bench/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-batched-bench) add_executable(${TARGET}
batched-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt index 77e33343b..0d439f498 100644 --- a/examples/batched/CMakeLists.txt +++ b/examples/batched/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-batched) add_executable(${TARGET} batched.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt index a6790e617..44e5f722a 100644 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml) add_executable(${TARGET} convert-llama2c-to-ggml.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/cvector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt index 0a559d60c..49ad9561c 100644 --- a/examples/cvector-generator/CMakeLists.txt +++ b/examples/cvector-generator/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 8256e789a..809040307 
100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index 5d1048aad..95915ed91 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -2,7 +2,7 @@ set(TARGET llama-eval-callback) add_executable(${TARGET} eval-callback.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TEST_TARGET test-eval-callback) add_test(NAME ${TEST_TARGET} diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt index 1cef6e716..310455787 100644 --- a/examples/export-lora/CMakeLists.txt +++ b/examples/export-lora/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt index 4edd6ec73..d2cb524c0 100644 --- a/examples/gbnf-validator/CMakeLists.txt +++ b/examples/gbnf-validator/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator) add_executable(${TARGET} gbnf-validator.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) 
+target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gen-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt index c94cda776..25de0af35 100644 --- a/examples/gen-docs/CMakeLists.txt +++ b/examples/gen-docs/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gen-docs) add_executable(${TARGET} gen-docs.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt index 7a494ce32..15c5c68c6 100644 --- a/examples/gguf-hash/CMakeLists.txt +++ b/examples/gguf-hash/CMakeLists.txt @@ -19,4 +19,4 @@ add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h) target_link_libraries(${TARGET} PRIVATE sha256) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt index f63887da7..c407e2f0a 100644 --- a/examples/gguf-split/CMakeLists.txt +++ b/examples/gguf-split/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index a9569b411..fb04eb83f 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) 
+target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt index 86dfddca3..fa1b4dc70 100644 --- a/examples/gritlm/CMakeLists.txt +++ b/examples/gritlm/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gritlm) add_executable(${TARGET} gritlm.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt index d4c8265bd..412696c47 100644 --- a/examples/imatrix/CMakeLists.txt +++ b/examples/imatrix/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index 9b1aa3b63..fb26628d8 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-infill) add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt index 5bdbea4e2..17e3b9b87 100644 --- a/examples/llama-bench/CMakeLists.txt +++ b/examples/llama-bench/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-bench) add_executable(${TARGET} llama-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) 
+target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index bbf5fec58..5d32f377f 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .) target_include_directories(llava PUBLIC ../..) target_include_directories(llava PUBLIC ../../common) -target_compile_features(llava PRIVATE cxx_std_11) +target_compile_features(llava PRIVATE cxx_std_17) add_library(llava_static STATIC $<TARGET_OBJECTS:llava>) if (BUILD_SHARED_LIBS) @@ -35,11 +35,11 @@ add_executable(${TARGET} llava-cli.cpp) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-minicpmv-cli) add_executable(${TARGET} minicpmv-cli.cpp) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt index f0ae5cd89..346861314 100644 --- a/examples/lookahead/CMakeLists.txt +++ b/examples/lookahead/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-lookahead) add_executable(${TARGET} lookahead.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index ef19fe25e..fba78ceda 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -2,22 +2,22 @@
set(TARGET llama-lookup) add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-create) add_executable(${TARGET} lookup-create.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-merge) add_executable(${TARGET} lookup-merge.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-stats) add_executable(${TARGET} lookup-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index 3b38db292..5563f4de0 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp) target_include_directories(${TARGET} PRIVATE ${_common_path}) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index 5f6efaa9a..af3d9150f 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-cli) 
add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt index c13557bac..847e916de 100644 --- a/examples/parallel/CMakeLists.txt +++ b/examples/parallel/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-parallel) add_executable(${TARGET} parallel.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/passkey/CMakeLists.txt b/examples/passkey/CMakeLists.txt index dc467a5d3..9bc5110c2 100644 --- a/examples/passkey/CMakeLists.txt +++ b/examples/passkey/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-passkey) add_executable(${TARGET} passkey.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index be0f2fd02..3e6864093 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-perplexity) add_executable(${TARGET} perplexity.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index bb986a716..9a3a0d3cd 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -3,4 +3,4 @@ 
add_executable(${TARGET} quantize-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 62680cda4..47e5cbe30 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/retrieval/CMakeLists.txt b/examples/retrieval/CMakeLists.txt index 66610f311..512a602ec 100644 --- a/examples/retrieval/CMakeLists.txt +++ b/examples/retrieval/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-retrieval) add_executable(${TARGET} retrieval.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt index 084f1e92d..52add51ef 100644 --- a/examples/run/CMakeLists.txt +++ b/examples/run/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-run) add_executable(${TARGET} run.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index 0fb5e359b..0f50e50de 100644 --- 
a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-save-load-state) add_executable(${TARGET} save-load-state.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 93e876f5a..e82f91533 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -50,4 +50,4 @@ if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple-chat/CMakeLists.txt b/examples/simple-chat/CMakeLists.txt index 87723533b..567f7fbbb 100644 --- a/examples/simple-chat/CMakeLists.txt +++ b/examples/simple-chat/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-simple-chat) add_executable(${TARGET} simple-chat.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt index b63afbb8b..104ecabfd 100644 --- a/examples/simple/CMakeLists.txt +++ b/examples/simple/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-simple) add_executable(${TARGET} simple.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/speculative-simple/CMakeLists.txt b/examples/speculative-simple/CMakeLists.txt index 7a3a141c2..aeaea74fc 100644 --- a/examples/speculative-simple/CMakeLists.txt +++ 
b/examples/speculative-simple/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-speculative-simple) add_executable(${TARGET} speculative-simple.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/speculative/CMakeLists.txt b/examples/speculative/CMakeLists.txt index aa208e7aa..c84196bd9 100644 --- a/examples/speculative/CMakeLists.txt +++ b/examples/speculative/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-speculative) add_executable(${TARGET} speculative.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/tokenize/CMakeLists.txt b/examples/tokenize/CMakeLists.txt index b704dcae1..1690b53e5 100644 --- a/examples/tokenize/CMakeLists.txt +++ b/examples/tokenize/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-tokenize) add_executable(${TARGET} tokenize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 70b5cfdf7..789fa3b0c 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -161,7 +161,6 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") option(GGML_OPENMP "ggml: use OpenMP" ON) option(GGML_RPC "ggml: use RPC" OFF) -option(GGML_AMX "ggml: use AMX" OFF) option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) set (GGML_SYCL_TARGET "INTEL" CACHE STRING diff --git a/ggml/include/ggml-amx.h 
b/ggml/include/ggml-amx.h deleted file mode 100644 index 042d6d919..000000000 --- a/ggml/include/ggml-amx.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - - -#ifdef __cplusplus -extern "C" { -#endif - -// buffer_type API -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); - -GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend); - -// backend API -GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void); - -GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 9022aa3ae..19289f32b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -261,21 +261,15 @@ function(ggml_add_backend backend) if (${backend_id}) string(TOLOWER "ggml-${backend}" backend_target) add_subdirectory(${backend_target}) - # check again in case the backend disabled itself - # note that this should NOT be the normal behavior, in case of errors the backend should fail the build - # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp - if (${backend_id}) - message(STATUS "Including ${backend} backend") - if (NOT GGML_BACKEND_DL) - string(TOUPPER "GGML_USE_${backend}" backend_use) - target_compile_definitions(ggml PUBLIC ${backend_use}) - endif() + message(STATUS "Including ${backend} backend") + if (NOT GGML_BACKEND_DL) + string(TOUPPER "GGML_USE_${backend}" backend_use) + target_compile_definitions(ggml PUBLIC ${backend_use}) endif() endif() endfunction() ggml_add_backend(CPU) -ggml_add_backend(AMX) ggml_add_backend(BLAS) ggml_add_backend(CANN) ggml_add_backend(CUDA) @@ -289,7 +283,7 @@ ggml_add_backend(Vulkan) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) - target_compile_features (${target} PRIVATE c_std_11) #
don't bump + target_compile_features (${target} PRIVATE c_std_11 cxx_std_17) # don't bump endforeach() target_link_libraries(ggml-base PRIVATE Threads::Threads) diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt deleted file mode 100644 index cf3ade6f0..000000000 --- a/ggml/src/ggml-amx/CMakeLists.txt +++ /dev/null @@ -1,105 +0,0 @@ -if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR - (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND - CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0) - message(STATUS "Using AMX") - - file(GLOB GGML_HEADERS_AMX "*.h") - list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h") - - file(GLOB GGML_SOURCES_AMX "*.cpp") - - ggml_add_backend_library(ggml-amx - ${GGML_HEADERS_AMX} - ${GGML_SOURCES_AMX} - ) - - # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags - # TODO: integrate AMX backend into the CPU backend - if (MSVC) - # instruction set detection for MSVC only - if (GGML_NATIVE) - # TODO: improve, should not reference files from the parent folder - include(../ggml-cpu/cmake/FindSIMD.cmake) - endif () - if (GGML_AVX512) - list(APPEND ARCH_FLAGS /arch:AVX512) - # MSVC has no compile-time flags enabling specific - # AVX512 extensions, neither it defines the - # macros corresponding to the extensions. - # Do it manually. 
- if (GGML_AVX512_VBMI) - add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>) - add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>) - endif() - if (GGML_AVX512_VNNI) - add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>) - add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) - endif() - if (GGML_AVX512_BF16) - add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>) - add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>) - endif() - if (GGML_AMX_TILE) - add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>) - add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>) - endif() - if (GGML_AMX_INT8) - add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>) - add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>) - endif() - if (GGML_AMX_BF16) - add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>) - add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>) - endif() - elseif (GGML_AVX2) - list(APPEND ARCH_FLAGS /arch:AVX2) - elseif (GGML_AVX) - list(APPEND ARCH_FLAGS /arch:AVX) - endif() - else() - if (GGML_NATIVE) - list(APPEND ARCH_FLAGS -march=native) - endif() - if (GGML_F16C) - list(APPEND ARCH_FLAGS -mf16c) - endif() - if (GGML_FMA) - list(APPEND ARCH_FLAGS -mfma) - endif() - if (GGML_AVX) - list(APPEND ARCH_FLAGS -mavx) - endif() - if (GGML_AVX2) - list(APPEND ARCH_FLAGS -mavx2) - endif() - if (GGML_AVX512) - list(APPEND ARCH_FLAGS -mavx512f) - list(APPEND ARCH_FLAGS -mavx512dq) - list(APPEND ARCH_FLAGS -mavx512bw) - endif() - if (GGML_AVX512_VBMI) - list(APPEND ARCH_FLAGS -mavx512vbmi) - endif() - if (GGML_AVX512_VNNI) - list(APPEND ARCH_FLAGS -mavx512vnni) - endif() - if (GGML_AVX512_BF16) - list(APPEND ARCH_FLAGS -mavx512bf16) - endif() - if (GGML_AMX_TILE) - list(APPEND ARCH_FLAGS -mamx-tile) - endif() - if (GGML_AMX_INT8) - list(APPEND ARCH_FLAGS -mamx-int8) - endif() - if (GGML_AMX_BF16) - list(APPEND ARCH_FLAGS -mamx-bf16) - endif() - endif() - - target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS}) -else() - set(GGML_AMX OFF PARENT_SCOPE) - message(WARNING "AMX requires x86 and gcc version > 11.0.
Turning off GGML_AMX.") -endif() diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp deleted file mode 100644 index 6bfb3da27..000000000 --- a/ggml/src/ggml-amx/ggml-amx.cpp +++ /dev/null @@ -1,449 +0,0 @@ -#include "ggml-amx.h" -#include "ggml-amx/common.h" -#include "ggml-amx/mmq.h" -#include "ggml-backend-impl.h" -#include "ggml-impl.h" - -#if defined(__gnu_linux__) -#include <sys/syscall.h> -#include <unistd.h> -#endif - -#include <cstdlib> -#include <cstring> -#include <memory> - -#if defined(__AMX_INT8__) - -// AMX buffer interface -static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { - free(buffer->context); -} - -static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *)(buffer->context); -} - -static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { - memset((char *)tensor->data + offset, value, size); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - if (qtype_has_amx_kernels(tensor->type)) { - ggml_backend_amx_convert_weight(tensor, data, offset, size); - } else { - memcpy((char *)tensor->data + offset, data, size); - } - - GGML_UNUSED(buffer); -} - -static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); - memcpy(data, (const char *)tensor->data + offset, size); - - GGML_UNUSED(buffer); -} - -static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { - if (ggml_backend_buffer_is_host(src->buffer)) { - if (qtype_has_amx_kernels(src->type)) { - ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst)); - } else { - memcpy(dst->data,
src->data, ggml_nbytes(src)); - } - return true; - } - return false; - - GGML_UNUSED(buffer); -} - -static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - memset(buffer->context, value, buffer->size); -} - -static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { - /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, - /* .get_base = */ ggml_backend_amx_buffer_get_base, - /* .init_tensor = */ NULL, // no initialization required - /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor, - /* .clear = */ ggml_backend_amx_buffer_clear, - /* .reset = */ NULL, -}; - -static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - return "AMX"; - - GGML_UNUSED(buft); -} - -static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * data = aligned_alloc(TENSOR_ALIGNMENT, size); - if (data == NULL) { - fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); - return NULL; - } - - return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size); -} - -static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return TENSOR_ALIGNMENT; - - GGML_UNUSED(buft); -} - -static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { - return ggml_backend_amx_get_alloc_size(tensor); - - GGML_UNUSED(buft); -} - -static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return false; - - GGML_UNUSED(buft); -} - -ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { - /* .iface = */ { - /* .get_name = */ 
ggml_backend_amx_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, - /* .is_host = */ ggml_backend_amx_buffer_type_is_host, - }, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0), - /* .context = */ NULL, - }; - - return &ggml_backend_buffer_type_amx; -} - -// backend interface - -static const char * ggml_backend_amx_name(ggml_backend_t backend) { - return "AMX"; - - GGML_UNUSED(backend); -} - -static void ggml_backend_amx_free(ggml_backend_t backend) { - ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; - delete ctx; - delete backend; -} - -static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context; - - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_MUL_MAT: - ggml_backend_amx_mul_mat(ctx, node); - break; - - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; - - default: - fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); - GGML_ASSERT(false); - } - } - - return GGML_STATUS_SUCCESS; - - GGML_UNUSED(backend); -} - -static struct ggml_backend_i ggml_backend_amx_i = { - /* .get_name = */ ggml_backend_amx_name, - /* .free = */ ggml_backend_amx_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_amx_graph_compute, - /* 
.event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static ggml_guid_t ggml_backend_amx_guid() { - static ggml_guid guid = { 0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67, 0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e }; - return &guid; -} - -#define ARCH_GET_XCOMP_PERM 0x1022 -#define ARCH_REQ_XCOMP_PERM 0x1023 -#define XFEATURE_XTILECFG 17 -#define XFEATURE_XTILEDATA 18 - -static bool ggml_amx_init() { -#if defined(__gnu_linux__) - if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) { - fprintf(stderr, "AMX is not ready to be used!\n"); - return false; - } - return true; -#elif defined(_WIN32) - return true; -#endif -} - -ggml_backend_t ggml_backend_amx_init() { - - // invoke a Linux system call to request access to AMX features - ggml_amx_init(); - - // backend context - ggml_backend_amx_context * ctx = new ggml_backend_amx_context; - - // ggml amx backend - ggml_backend_t backend = new ggml_backend { - /* .guid = */ ggml_backend_amx_guid(), - /* .interface = */ ggml_backend_amx_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0), - /* .context = */ ctx, - }; - - return backend; -} - -bool ggml_backend_is_amx(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_amx_guid()); -} - -void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { - GGML_ASSERT(ggml_backend_is_amx(backend_amx)); - - ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend_amx->context; - ctx->n_threads = n_threads; -} - -// device interface - -static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) { - return "AMX"; - - GGML_UNUSED(dev); -} - -static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) { - return "Intel Advanced Matrix Extensions"; - - GGML_UNUSED(dev); -} - -static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - // TODO - *free = 0; - *total = 0; - 
- GGML_UNUSED(dev); -} - -static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) { - return GGML_BACKEND_DEVICE_TYPE_ACCEL; - - GGML_UNUSED(dev); -} - -static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_amx_device_get_name(dev); - props->description = ggml_backend_amx_device_get_description(dev); - props->type = ggml_backend_amx_device_get_type(dev); - ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total); - - // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged - props->caps = { - /* .async = */ false, - /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ false, - /* .events = */ false, - }; -} - -static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) { - return ggml_backend_amx_init(); - - GGML_UNUSED(dev); - GGML_UNUSED(params); -} - -static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) { - return ggml_backend_amx_buffer_type(); - - GGML_UNUSED(dev); -} - -static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - - // handle only 2d gemm for now - auto is_contiguous_2d = [](const struct ggml_tensor * t) { - return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; - }; - - switch (op->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - return true; - - case GGML_OP_MUL_MAT: { - const struct ggml_tensor * src0 = op->src[0]; - const struct ggml_tensor * src1 = op->src[1]; - - const enum ggml_type type = src0->type; - const int64_t ne0 = op->ne[0]; - - // amx kernels enables for Q4_0, Q4_1, Q8_0, F16 - // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256 - bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16); - - bool can_use_amx = - 
is_contiguous_2d(src0) && // src0 must be contiguous - is_contiguous_2d(src1) && // src1 must be contiguous - src1->type == GGML_TYPE_F32 && // src1 must be float32 - has_amx_kernels && // with amx kernel impls - ne0 % (TILE_N * 2) == 0; // out_features is 32x - - return can_use_amx; - } - default: - return false; - } - - GGML_UNUSED(dev); -} - -static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name; - - GGML_UNUSED(dev); -} - -static const struct ggml_backend_device_i ggml_backend_amx_device_i = { - /* .get_name = */ ggml_backend_amx_device_get_name, - /* .get_description = */ ggml_backend_amx_device_get_description, - /* .get_memory = */ ggml_backend_amx_device_get_memory, - /* .get_type = */ ggml_backend_amx_device_get_type, - /* .get_props = */ ggml_backend_amx_device_get_props, - /* .init_backend = */ ggml_backend_amx_device_init, - /* .get_buffer_type = */ ggml_backend_amx_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ ggml_backend_amx_device_supports_op, - /* .supports_buft = */ ggml_backend_amx_device_supports_buft, - /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, -}; - -// backend reg interface - -static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) { - return "AMX"; - - GGML_UNUSED(reg); -} - -static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) { - return 1; - - GGML_UNUSED(reg); -} - -static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) { - GGML_ASSERT(index == 0); - - static ggml_backend_device ggml_backend_amx_device = { - /* .iface = */ ggml_backend_amx_device_i, - /* .reg = */ reg, - /* .context = */ nullptr, - }; - - return &ggml_backend_amx_device; - - GGML_UNUSED(reg); - GGML_UNUSED(index); -} - 
-static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) { - if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { - return (void *)ggml_backend_amx_set_n_threads; - } - return NULL; - - GGML_UNUSED(reg); - GGML_UNUSED(name); -} - -static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = { - /* .get_name = */ ggml_backend_amx_reg_get_name, - /* .get_device_count = */ ggml_backend_amx_reg_get_device_count, - /* .get_device = */ ggml_backend_amx_reg_get_device, - /* .get_proc_address = */ ggml_backend_amx_get_proc_address, -}; - -ggml_backend_reg_t ggml_backend_amx_reg(void) { - static struct ggml_backend_reg ggml_backend_amx_reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_amx_reg_i, - /* .context = */ NULL, - }; - - return &ggml_backend_amx_reg; -} - -#else // if defined(__AMX_INT8__) - -ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) { - return nullptr; -} - -bool ggml_backend_is_amx(ggml_backend_t backend) { - GGML_UNUSED(backend); - return false; -} - -ggml_backend_t ggml_backend_amx_init(void) { - fprintf(stderr, "GGML is not compiled with AMX support!\n"); - return nullptr; -} - -void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) { - fprintf(stderr, "GGML is not compiled with AMX support!\n"); - - GGML_UNUSED(backend_amx); - GGML_UNUSED(n_threads); -} - -ggml_backend_reg_t ggml_backend_amx_reg(void) { - return nullptr; -} - -#endif - -GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index a0e0e2c58..3182b84f5 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -49,10 +49,6 @@ #include "ggml-rpc.h" #endif -#ifdef GGML_USE_AMX -# include "ggml-amx.h" -#endif - #ifdef GGML_USE_CANN #include "ggml-cann.h" #endif @@ -92,9 +88,6 @@ struct ggml_backend_registry { #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif -#ifdef 
GGML_USE_AMX - register_backend(ggml_backend_amx_reg()); -#endif #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 45da0c27d..fdb4b986f 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -742,7 +742,8 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) { // since the tensor is pre-allocated, it cannot be moved to another backend - GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name); + ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + GGML_ABORT("pre-allocated tensor (%s) in a buffer (%s) that cannot run the operation (%s)", tensor->name, ggml_backend_buffer_name(buffer), ggml_op_name(tensor->op)); } // graph input diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 4dbc1f75b..fe2222084 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -1,12 +1,20 @@ -ggml_add_backend_library(ggml-cpu - ggml-cpu.c - ggml-cpu.cpp - ggml-cpu-aarch64.c - ggml-cpu-aarch64.h - ggml-cpu-quants.c - ggml-cpu-quants.h - ) +ggml_add_backend_library(ggml-cpu) +list (APPEND GGML_CPU_SOURCES + ggml-cpu.c + ggml-cpu.cpp + ggml-cpu-aarch64.c + ggml-cpu-aarch64.h + ggml-cpu-quants.c + ggml-cpu-quants.h + amx/amx.cpp + amx/amx.h + amx/mmq.cpp + amx/mmq.h + ggml-cpu-impl.h + ) + +target_compile_features(ggml-cpu PRIVATE c_std_11 cxx_std_17) target_include_directories(ggml-cpu PRIVATE .) 
if (APPLE AND GGML_ACCELERATE) @@ -14,9 +22,9 @@ if (APPLE AND GGML_ACCELERATE) if (ACCELERATE_FRAMEWORK) message(STATUS "Accelerate framework found") - add_compile_definitions(GGML_USE_ACCELERATE) - add_compile_definitions(ACCELERATE_NEW_LAPACK) - add_compile_definitions(ACCELERATE_LAPACK_ILP64) + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_ACCELERATE) + target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_NEW_LAPACK) + target_compile_definitions(ggml-cpu PRIVATE ACCELERATE_LAPACK_ILP64) target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK}) else() @@ -29,15 +37,9 @@ if (GGML_OPENMP) if (OpenMP_FOUND) message(STATUS "OpenMP found") - add_compile_definitions(GGML_USE_OPENMP) + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_OPENMP) target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) - - # FIXME: should be replaced with a compiler id check - #if (GGML_MUSA) - # list(APPEND GGML_CPU_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include") - # list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so") - #endif() else() message(WARNING "OpenMP not found") endif() @@ -46,11 +48,11 @@ endif() if (GGML_LLAMAFILE) message(STATUS "Using llamafile") - add_compile_definitions(GGML_USE_LLAMAFILE) + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_LLAMAFILE) - target_sources(ggml-cpu PRIVATE - llamafile/sgemm.cpp - llamafile/sgemm.h) + list(APPEND GGML_CPU_SOURCES + llamafile/sgemm.cpp + llamafile/sgemm.h) endif() if (GGML_CPU_HBM) @@ -58,7 +60,7 @@ if (GGML_CPU_HBM) message(STATUS "Using memkind for CPU HBM") - add_compile_definitions(GGML_USE_CPU_HBM) + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_HBM) target_link_libraries(ggml-cpu PUBLIC memkind) endif() @@ -72,16 +74,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR message(STATUS "ARM detected") if (MSVC) - add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead - add_compile_definitions(__ARM_NEON) - 
add_compile_definitions(__ARM_FEATURE_FMA) + list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead + list(APPEND ARCH_DEFINITIONS __ARM_NEON) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD) - add_compile_definitions(__ARM_FEATURE_DOTPROD) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) message(STATUS "ARM feature DOTPROD enabled") endif () @@ -89,14 +91,14 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) message(STATUS "ARM feature MATMUL_INT8 enabled") endif () check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") endif () @@ -118,7 +120,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD) set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") - add_compile_definitions(__ARM_FEATURE_DOTPROD) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) message(STATUS "ARM feature DOTPROD enabled") endif () @@ -131,7 
+133,7 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) if (GGML_COMPILER_SUPPORT_MATMUL_INT8) set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") - add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) message(STATUS "ARM feature MATMUL_INT8 enabled") endif () @@ -175,7 +177,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW if (MSVC) # instruction set detection for MSVC only if (GGML_NATIVE) - # TODO: improve, should not reference files from the parent folder include(cmake/FindSIMD.cmake) endif () if (GGML_AVX512) @@ -185,37 +186,31 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW # macros corresponding to the extensions. # Do it manually. if (GGML_AVX512_VBMI) - add_compile_definitions($<$:__AVX512VBMI__>) - add_compile_definitions($<$:__AVX512VBMI__>) + list(APPEND ARCH_DEFINITIONS __AVX512VBMI__) if (CMAKE_C_COMPILER_ID STREQUAL "Clang") list(APPEND ARCH_FLAGS -mavx512vbmi) endif() endif() if (GGML_AVX512_VNNI) - add_compile_definitions($<$:__AVX512VNNI__>) - add_compile_definitions($<$:__AVX512VNNI__>) + list(APPEND ARCH_DEFINITIONS __AVX512VNNI__) if (CMAKE_C_COMPILER_ID STREQUAL "Clang") list(APPEND ARCH_FLAGS -mavx512vnni) endif() endif() if (GGML_AVX512_BF16) - add_compile_definitions($<$:__AVX512BF16__>) - add_compile_definitions($<$:__AVX512BF16__>) + list(APPEND ARCH_DEFINITIONS __AVX512BF16__) if (CMAKE_C_COMPILER_ID STREQUAL "Clang") list(APPEND ARCH_FLAGS -mavx512bf16) endif() endif() if (GGML_AMX_TILE) - add_compile_definitions($<$:__AMX_TILE__>) - add_compile_definitions($<$:__AMX_TILE__>) + list(APPEND ARCH_DEFINITIONS __AMX_TILE__) endif() if (GGML_AMX_INT8) - add_compile_definitions($<$:__AMX_INT8__>) - add_compile_definitions($<$:__AMX_INT8__>) + list(APPEND ARCH_DEFINITIONS 
__AMX_INT8__) endif() if (GGML_AMX_BF16) - add_compile_definitions($<$:__AMX_BF16__>) - add_compile_definitions($<$:__AMX_BF16__>) + list(APPEND ARCH_DEFINITIONS __AMX_BF16__) endif() elseif (GGML_AVX2) list(APPEND ARCH_FLAGS /arch:AVX2) @@ -276,7 +271,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") list(APPEND ARCH_FLAGS -mcpu=powerpc64le) else() list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) - #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) + # TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") message(STATUS "loongarch64 detected") @@ -299,11 +294,12 @@ endif() if (GGML_CPU_AARCH64) message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels") - add_compile_definitions(GGML_USE_CPU_AARCH64) + target_compile_definitions(ggml-cpu PRIVATE GGML_USE_CPU_AARCH64) endif() -target_compile_options(ggml-cpu PRIVATE "$<$:${ARCH_FLAGS}>") -target_compile_options(ggml-cpu PRIVATE "$<$:${ARCH_FLAGS}>") +target_sources(ggml-cpu PRIVATE ${GGML_CPU_SOURCES}) +set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_OPTIONS "${ARCH_FLAGS}") +set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "${ARCH_DEFINITIONS}") if (EMSCRIPTEN) set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128") diff --git a/ggml/src/ggml-cpu/amx/amx.cpp b/ggml/src/ggml-cpu/amx/amx.cpp new file mode 100644 index 000000000..09c0df0f5 --- /dev/null +++ b/ggml/src/ggml-cpu/amx/amx.cpp @@ -0,0 +1,196 @@ +#include "amx.h" +#include "common.h" +#include "mmq.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" +#include "ggml-impl.h" +#include "ggml-cpu.h" + +#if defined(__gnu_linux__) +#include +#include +#endif + +#include +#include +#include + +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + +// 
AMX buffer interface +static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); +} + +static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)(buffer->context); +} + +static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) { + memset((char *)tensor->data + offset, value, size); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + if (qtype_has_amx_kernels(tensor->type)) { + ggml_backend_amx_convert_weight(tensor, data, offset, size); + } else { + memcpy((char *)tensor->data + offset, data, size); + } + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(!qtype_has_amx_kernels(tensor->type)); + memcpy(data, (const char *)tensor->data + offset, size); + + GGML_UNUSED(buffer); +} + +static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + if (ggml_backend_buffer_is_host(src->buffer)) { + if (qtype_has_amx_kernels(src->type)) { + ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst)); + } else { + memcpy(dst->data, src->data, ggml_nbytes(src)); + } + return true; + } + return false; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + memset(buffer->context, value, buffer->size); +} + +static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = { + /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer, + /* .get_base = */ ggml_backend_amx_buffer_get_base, + /* .init_tensor = */ NULL, // no initialization required + /* .memset_tensor = */ 
ggml_backend_amx_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_amx_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_amx_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor, + /* .clear = */ ggml_backend_amx_buffer_clear, + /* .reset = */ NULL, +}; + +static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "AMX"; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * data = aligned_alloc(TENSOR_ALIGNMENT, size); + if (data == NULL) { + fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + return NULL; + } + + return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size); +} + +static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return TENSOR_ALIGNMENT; + + GGML_UNUSED(buft); +} + +static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { + return ggml_backend_amx_get_alloc_size(tensor); + + GGML_UNUSED(buft); +} + +static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + GGML_UNUSED(buft); +} + +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 + +static bool ggml_amx_init() { +#if defined(__gnu_linux__) + if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) { + fprintf(stderr, "AMX is not ready to be used!\n"); + return false; + } + return true; +#elif defined(_WIN32) + return true; +#endif +} +ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = { + /* .iface = */ { + /* .get_name = */ ggml_backend_amx_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer, + /* 
.get_alignment = */ ggml_backend_amx_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size, + /* .is_host = */ ggml_backend_amx_buffer_type_is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0), + /* .context = */ NULL, + }; + + if (!ggml_amx_init()) { + return NULL; + } + + return &ggml_backend_buffer_type_amx; +} + +bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name; +} + +bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) { + // handle only 2d gemm for now + auto is_contiguous_2d = [](const struct ggml_tensor * t) { + return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; + }; + + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return true; + + case GGML_OP_MUL_MAT: { + const struct ggml_tensor * src0 = op->src[0]; + const struct ggml_tensor * src1 = op->src[1]; + + const enum ggml_type type = src0->type; + const int64_t ne0 = op->ne[0]; + + // amx kernels enables for Q4_0, Q4_1, Q8_0, F16 + // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256 + bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16); + + bool can_use_amx = + is_contiguous_2d(src0) && // src0 must be contiguous + is_contiguous_2d(src1) && // src1 must be contiguous + src1->type == GGML_TYPE_F32 && // src1 must be float32 + has_amx_kernels && // with amx kernel impls + ne0 % (TILE_N * 2) == 0; // out_features is 32x + + return can_use_amx; + } + default: + return false; + } +} + +#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__) diff --git a/ggml/src/ggml-cpu/amx/amx.h b/ggml/src/ggml-cpu/amx/amx.h new file mode 100644 index 000000000..c43546273 --- /dev/null +++ b/ggml/src/ggml-cpu/amx/amx.h @@ -0,0 +1,20 @@ +#include "ggml-backend.h" 
+#include "ggml-cpu-impl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + +ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void); +bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft); +bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op); +void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst); +size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst); + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-amx/common.h b/ggml/src/ggml-cpu/amx/common.h similarity index 77% rename from ggml/src/ggml-amx/common.h rename to ggml/src/ggml-cpu/amx/common.h index 5db8ce30d..0b0657289 100644 --- a/ggml/src/ggml-amx/common.h +++ b/ggml/src/ggml-cpu/amx/common.h @@ -1,8 +1,7 @@ #pragma once #include "ggml.h" -// hack until AMX is moved into the CPU backend -#include "../ggml-cpu/ggml-cpu-impl.h" // +#include "ggml-cpu-impl.h" #include #include @@ -74,16 +73,24 @@ inline void parallel_for(int nth, int n, const func_t& f) { #endif } +template +inline void parallel_for_ggml(const ggml_compute_params * params, int n, const func_t & f) { + int tbegin, tend; + balance211(n, params->nth, params->ith, tbegin, tend); + f(tbegin, tend); + ggml_barrier(params->threadpool); // TODO: might not always be needed +} + // quantized types that have AMX support inline bool qtype_has_amx_kernels(const enum ggml_type type) { // TODO: fix padding for vnni format return (type == GGML_TYPE_Q4_0) || - (type == GGML_TYPE_Q4_1); - //(type == GGML_TYPE_Q8_0) || - //(type == GGML_TYPE_Q4_K) || - //(type == GGML_TYPE_Q5_K) || - //(type == GGML_TYPE_Q6_K) || - //(type == GGML_TYPE_IQ4_XS); + (type == GGML_TYPE_Q4_1) || + (type == GGML_TYPE_Q8_0) || + (type == GGML_TYPE_Q4_K) || + (type == GGML_TYPE_Q5_K) || + (type == GGML_TYPE_Q6_K) || + (type == GGML_TYPE_IQ4_XS); } // ggml backend context diff --git a/ggml/src/ggml-amx/mmq.cpp 
b/ggml/src/ggml-cpu/amx/mmq.cpp similarity index 98% rename from ggml/src/ggml-amx/mmq.cpp rename to ggml/src/ggml-cpu/amx/mmq.cpp index 529bee25b..6447e73d0 100644 --- a/ggml/src/ggml-amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -4,8 +4,11 @@ #pragma GCC diagnostic ignored "-Wunused-local-typedefs" #endif +#include "amx.h" #include "mmq.h" #include "ggml-impl.h" +#include "ggml-cpu-impl.h" +#include "ggml-cpu-quants.h" #include "ggml-quants.h" #include #include @@ -33,7 +36,7 @@ #define ALWAYS_INLINE inline #endif -#if defined(__AMX_INT8__) +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) namespace { @@ -496,13 +499,12 @@ inline void from_float(const float * x, char * vy, int64_t k); template <> inline void from_float(const float * x, char * vy, int64_t k) { - // FIXME: using unoptimized reference impl until moved to CPU backend - quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k); + quantize_row_q8_0(x, (block_q8_0 *)vy, k); } template <> inline void from_float(const float * x, char * vy, int64_t k) { - quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k); + quantize_row_q8_1(x, (block_q8_1 *)vy, k); } template <> @@ -950,7 +952,7 @@ template> void unpack_B(packed_B_t * RESTRICT tile, const void * RESTRICT packed_B) { GGML_UNUSED(tile); GGML_UNUSED(packed_B); -}; +} template <> void unpack_B(int8_t * RESTRICT tile, const void * RESTRICT packed_B) { @@ -2327,9 +2329,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor) { // pack weight to vnni format void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - - size_t alloc_size = ggml_backend_amx_get_alloc_size(tensor); - GGML_ASSERT(alloc_size == size); + GGML_ASSERT(offset == 0 && size == ggml_nbytes(tensor)); // only full tensor conversion is supported for now const enum ggml_type TYPE = tensor->type; @@ -2348,6 +2348,29 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d }); } +size_t 
ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst) { + struct ggml_tensor * src0 = dst->src[0]; + + const enum ggml_type TYPE = src0->type; + + const bool is_floating_type = TYPE == GGML_TYPE_F16; + if (is_floating_type) { + return 0; + } + + const int M = dst->ne[1]; + const int K = src0->ne[0]; + + size_t desired_wsize = 0; + + GGML_DISPATCH_QTYPES(TYPE, [&] { + const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); + desired_wsize = M * row_size_A; + }); + + return desired_wsize; +} + // NB: mixed dtype gemm with Advanced Matrix Extensions (Intel AMX) // // src0: weight in shape of {N, K}, quantized @@ -2356,14 +2379,12 @@ void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * d // // the function performs: dst = src1 @ src0.T // -void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { +void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_tensor * dst) { struct ggml_tensor * src0 = dst->src[0]; struct ggml_tensor * src1 = dst->src[1]; const enum ggml_type TYPE = src0->type; - const int n_threads = ctx->n_threads; - // f16 only has avx512 kernels for now, // amx kernels will be added once 6th gen xeon is released. 
const bool is_floating_type = TYPE == GGML_TYPE_F16; @@ -2379,7 +2400,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor const int MB = div_up(M, BLOCK_M); const int NB = div_up(N, BLOCK_N); - parallel_for(n_threads, MB * NB, [&](int begin, int end) { + parallel_for_ggml(params, MB * NB, [&](int begin, int end) { GGML_DISPATCH_FLOATING_TYPES(TYPE, [&] { for (int i = begin; i < end; ++i) { int mb = i / NB; @@ -2412,27 +2433,29 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor } // pointer to work space, used convert A from float to quantized type - void * wdata = nullptr; + void * wdata = params->wdata; //TODO: performance improvement: merge quant A - GGML_DISPATCH_QTYPES(TYPE, [&] { - const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); - const size_t desired_wsize = M * row_size_A; - if (ctx->work_size < desired_wsize) { - ctx->work_data.reset(new char[desired_wsize]); - ctx->work_size = desired_wsize; - } - wdata = ctx->work_data.get(); + if (params->ith == 0) { + GGML_DISPATCH_QTYPES(TYPE, [&] { + const size_t row_size_A = K / blck_size * sizeof(vec_dot_type); + const size_t desired_wsize = M * row_size_A; + if (params->wsize < desired_wsize) { + GGML_ABORT("insufficient work space size"); + } - // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size - // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size - GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size); + // Q4_0, Q4_1, Q8_0 handles 1 TILE_K per blck_size + // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size + GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size); - const float * A_data = static_cast(src1->data); - for (int m = 0; m < M; ++m) { - from_float(A_data + m * K, (char *)wdata + m * row_size_A, K); - } - }); + const float * A_data = static_cast(src1->data); + for (int m = 0; m < M; ++m) { + from_float(A_data + m * K, (char *)wdata + m * row_size_A, K); + } + }); + } + + 
ggml_barrier(params->threadpool); if (M == 1) { // MB = 1 and handle 8 tiles in each block @@ -2440,7 +2463,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor constexpr int BLOCK_N = TILE_N * kTilesN; const int NB = div_up(N, BLOCK_N); - parallel_for(n_threads, NB, [&](int begin, int end) { + parallel_for_ggml(params, NB, [&](int begin, int end) { GGML_DISPATCH_QTYPES(TYPE, [&] { const int KB = K / blck_size; const int TILE_SIZE = get_tile_size(); @@ -2470,7 +2493,7 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor const int MB = div_up(M, BLOCK_M); const int NB = div_up(N, BLOCK_N); - parallel_for(n_threads, MB * NB, [&](int begin, int end) { + parallel_for_ggml(params, MB * NB, [&](int begin, int end) { // init tile config for each thread ggml_tile_config_init(); @@ -2498,13 +2521,4 @@ void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor }); } -#else // if defined(__AMX_INT8__) - -void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst) { - fprintf(stderr, "GGML is not compiled with AMX support!\n"); - - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -#endif // if defined(__AMX_INT8__) +#endif // if defined(__AMX_INT8__) && defined(__AVX512VNNI__) diff --git a/ggml/src/ggml-amx/mmq.h b/ggml/src/ggml-cpu/amx/mmq.h similarity index 72% rename from ggml/src/ggml-amx/mmq.h rename to ggml/src/ggml-cpu/amx/mmq.h index cf0920620..f37366093 100644 --- a/ggml/src/ggml-amx/mmq.h +++ b/ggml/src/ggml-cpu/amx/mmq.h @@ -1,6 +1,5 @@ #pragma once #include "common.h" -#include #ifdef __cplusplus extern "C" { @@ -10,7 +9,7 @@ size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor); void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); -void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst); +void ggml_backend_amx_mul_mat(const struct 
ggml_compute_params * params, struct ggml_tensor * dst); #ifdef __cplusplus } diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h index 27a530b22..d71076ad1 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-impl.h +++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h @@ -15,6 +15,18 @@ extern "C" { #endif +struct ggml_compute_params { + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; + + struct ggml_threadpool * threadpool; +}; + + #if defined(_MSC_VER) #define m512bh(p) p @@ -366,6 +378,9 @@ static __m256 __lasx_xvreplfr2vr_s(float val) { } #endif +// TODO: move to ggml-threading +void ggml_barrier(struct ggml_threadpool * tp); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index e0cefc20b..23ae2e10c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -10,6 +10,7 @@ #include "ggml-quants.h" #include "ggml-cpu-quants.h" #include "ggml-threading.h" +#include "amx/amx.h" #include "ggml.h" #if defined(_MSC_VER) || defined(__MINGW32__) @@ -624,7 +625,7 @@ do { \ for (int i = 0; i < offset; ++i) { \ x[i] = _mm512_add_ps(x[i], x[offset+i]); \ } \ - res = _mm512_reduce_add_ps(x[0]); \ + res = (ggml_float) _mm512_reduce_add_ps(x[0]); \ } while (0) // TODO: is this optimal ? 
@@ -674,7 +675,7 @@ do { \ for (int i = 0; i < offset; ++i) { \ x[i] = _mm512_add_ps(x[i], x[offset+i]); \ } \ - res = _mm512_reduce_add_ps(x[0]); \ + res = (ggml_float) _mm512_reduce_add_ps(x[0]); \ } while (0) #define GGML_F16_VEC GGML_F32Cx16 @@ -685,8 +686,8 @@ do { \ #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE +#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE #elif defined(__AVX__) #define GGML_SIMD @@ -1178,28 +1179,28 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) { #define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a) #define GGML_F32x4_ADD __lsx_vfadd_s #define GGML_F32x4_MUL __lsx_vfmul_s -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - int offset = GGML_F32_ARR >> 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ - } \ - offset >>= 1; \ - for (int i = 0; i < offset; ++i) { \ - x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \ - } \ - __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \ - tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \ - tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ - const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \ - tmp = __lsx_vsrli_d((__m128i)t0, 32); \ - tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \ - tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ - res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \ +#define GGML_F32x4_REDUCE(res, x) \ +{ \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \ + } \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \ + } \ + __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 
32); \ + tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \ + tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ + const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \ + tmp = __lsx_vsrli_d((__m128i) t0, 32); \ + tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \ + tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \ + res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \ } #define GGML_F32_VEC GGML_F32x4 @@ -1367,31 +1368,15 @@ struct ggml_compute_state { int ith; }; -struct ggml_compute_params { - // ith = thread index, nth = number of threads - int ith, nth; - - // work buffer for all threads - size_t wsize; - void * wdata; - - struct ggml_threadpool * threadpool; -}; - // // fundamental operations // inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } @@ -2286,7 +2271,7 @@ struct ggml_state { static struct ggml_state g_state = {0}; -static void ggml_barrier(struct ggml_threadpool * tp) { +void ggml_barrier(struct ggml_threadpool * tp) { int n_threads = atomic_load_explicit(&tp->n_threads_cur, 
memory_order_relaxed); if (n_threads == 1) { return; @@ -7455,6 +7440,13 @@ static void ggml_compute_forward_mul_mat( type = (enum ggml_type)(intptr_t)src0->extra; } +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + if (src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) { + ggml_backend_amx_mul_mat(params, dst); + return; + } +#endif + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat; @@ -13294,10 +13286,16 @@ struct ggml_cplan ggml_graph_plan( } break; case GGML_OP_MUL_MAT: { +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + if (node->src[0]->buffer && ggml_backend_amx_buft_is_amx(node->src[0]->buffer->buft)) { + cur = ggml_backend_amx_desired_wsize(node); + } +#endif const enum ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type; if (node->src[1]->type != vec_dot_type) { - cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); + size_t cur2 = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1])); + cur = MAX(cur, cur2); } } break; case GGML_OP_MUL_MAT_ID: diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 44d99089a..77e5d87a8 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -3,6 +3,7 @@ #include "ggml-cpu.h" #include "ggml-cpu-aarch64.h" #include "ggml-impl.h" +#include "amx/amx.h" #include #include #include @@ -134,12 +135,16 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backen static std::vector bufts = []() { std::vector bufts; -#ifdef GGML_USE_CPU_HBM - bufts.push_back(ggml_backend_cpu_hbm_buffer_type()); +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + if (ggml_backend_amx_buffer_type()) { + bufts.push_back(ggml_backend_amx_buffer_type()); + } #endif #ifdef GGML_USE_CPU_AARCH64 - 
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); + if (ggml_backend_cpu_aarch64_buffer_type()) { + bufts.push_back(ggml_backend_cpu_aarch64_buffer_type()); + } #endif bufts.push_back(NULL); @@ -456,12 +461,27 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st const struct ggml_tensor * src0 = op->src[0]; const struct ggml_tensor * src1 = op->src[1]; + if (op->op == GGML_OP_NONE || op->op == GGML_OP_RESHAPE || op->op == GGML_OP_VIEW || op->op == GGML_OP_PERMUTE || op->op == GGML_OP_TRANSPOSE) { + return true; + } + if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) { if (op->op != GGML_OP_MUL_MAT || src0->type == ggml_aarch64_get_optimal_repack_type(src0)) { return false; } } +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + if (src0 && src0->buffer && ggml_backend_amx_buft_is_amx(src0->buffer->buft)) { + return ggml_backend_amx_device_supports_op(op); + } + for (int i = 1; i < GGML_MAX_SRC; i++) { + if (op->src[i] && op->src[i]->buffer && ggml_backend_amx_buft_is_amx(op->src[i]->buffer->buft)) { + return false; + } + } +#endif + for (int i = 1; i < GGML_MAX_SRC; i++) { if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) { return false; @@ -491,7 +511,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st } static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); + bool supported = ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft); + +#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) + supported = supported || ggml_backend_amx_buft_is_amx(buft); +#endif + + return supported; GGML_UNUSED(dev); } diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index b2ce2e664..da4146ec4 100644 --- 
a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -50,8 +50,7 @@ #include "sgemm.h" #include "ggml-impl.h" -// hack until moved into the CPU backend -#include "../ggml-cpu-impl.h" +#include "ggml-cpu-impl.h" #include "ggml-quants.h" #ifdef _MSC_VER diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index f39b7a88c..78e3af8f2 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -30,11 +30,13 @@ extern "C" { #endif -#undef MIN -#undef MAX +#ifndef MIN +# define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#ifndef MAX +# define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif // required for mmap as gguf only guarantees 32-byte alignment #define TENSOR_ALIGNMENT 32 diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt index 10075db33..51c78b7d2 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt @@ -3,5 +3,5 @@ find_package (Threads REQUIRED) set(TARGET vulkan-shaders-gen) add_executable(${TARGET} vulkan-shaders-gen.cpp) install(TARGETS ${TARGET} RUNTIME) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads) diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt index d5405ad29..6235aec1f 100644 --- a/pocs/vdot/CMakeLists.txt +++ b/pocs/vdot/CMakeLists.txt @@ -1,9 +1,9 @@ set(TARGET llama-vdot) add_executable(${TARGET} vdot.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-q8dot) add_executable(${TARGET} q8dot.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) 
-target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2f581b921..f3b3908b1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -25,7 +25,7 @@ add_library(llama ) target_include_directories(llama PUBLIC . ../include) -target_compile_features (llama PUBLIC cxx_std_11) # don't bump +target_compile_features (llama PUBLIC cxx_std_17) # don't bump target_link_libraries(llama PUBLIC ggml) diff --git a/src/unicode.cpp b/src/unicode.cpp index 50b35bbbc..3d4592635 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -201,7 +201,18 @@ static std::unordered_map unicode_utf8_to_byte_map() { } static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { +#if defined(__clang__) + // disable C++17 deprecation warning for std::codecvt_utf8 +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif + std::wstring_convert> conv; + +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + return conv.from_bytes(s); } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index be370044d..e5c9e75e4 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -284,7 +284,7 @@ static void test_perf() { data.reserve(n_vocab); for (int i = 0; i < n_vocab; i++) { - const float logit = 2.0f*((float)(rand())/RAND_MAX - 0.5f); + const float logit = 2.0f*((double)(rand())/RAND_MAX - 0.5); data.emplace_back(llama_token_data{i, logit, 0.0f}); }