This commit is contained in:
Ma Mingfei 2024-09-12 10:32:53 +08:00 committed by GitHub
commit 108d912e9b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 2680 additions and 6 deletions

View File

@ -85,6 +85,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE ON)
endif()
# Default GGML_AMX to ON; a value that is already defined is left untouched.
if (NOT DEFINED GGML_AMX)
set(GGML_AMX ON)
endif()
# Same default-unless-already-defined treatment for GGML_CUDA_USE_GRAPHS.
if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
set(GGML_CUDA_USE_GRAPHS ON)
endif()

View File

@ -93,11 +93,6 @@ GGML_METAL := 1
DEPRECATE_WARNING := 1
endif
ifdef LLAMA_OPENMP
GGML_OPENMP := 1
DEPRECATE_WARNING := 1
endif
ifdef LLAMA_RPC
GGML_RPC := 1
DEPRECATE_WARNING := 1
@ -579,6 +574,11 @@ ifndef GGML_NO_LLAMAFILE
OBJ_GGML += ggml/src/llamafile/sgemm.o
endif
# Unless GGML_NO_AMX is set, define GGML_USE_AMX for the preprocessor and add
# the AMX object file (ggml-amx/mmq.o) to the ggml object list.
ifndef GGML_NO_AMX
MK_CPPFLAGS += -DGGML_USE_AMX
OBJ_GGML += ggml/src/ggml-amx/mmq.o
endif
ifdef GGML_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJ_GGML += ggml/src/ggml-rpc.o
@ -1072,6 +1072,14 @@ ggml/src/llamafile/sgemm.o: \
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_NO_LLAMAFILE
# Build rule for the AMX object; only present when GGML_NO_AMX is not set.
# mmq.o is rebuilt when mmq.cpp, mmq.h or the public ggml.h header changes;
# the recipe compiles the first prerequisite ($<, i.e. mmq.cpp) with $(CXXFLAGS).
ifndef GGML_NO_AMX
ggml/src/ggml-amx/mmq.o: \
ggml/src/ggml-amx/mmq.cpp \
ggml/src/ggml-amx/mmq.h \
ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_NO_AMX
ifdef GGML_RPC
ggml/src/ggml-rpc.o: \
ggml/src/ggml-rpc.cpp \
@ -1218,6 +1226,7 @@ clean:
rm -vrf ggml/src/ggml-metal-embed.metal
rm -vrf ggml/src/ggml-cuda/*.o
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -vrf ggml/src/ggml-amx/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp

View File

@ -28,7 +28,7 @@ variety of hardware - locally and in the cloud.
- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures
- AVX, AVX2, AVX512 and AMX support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- Vulkan and SYCL backend support

View File

@ -90,6 +90,9 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
# AMX feature switches (TILE / INT8 / BF16); each one defaults to OFF.
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
if (NOT MSVC)
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@ -149,6 +152,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF)
# Opt-in switch for AMX usage; this option defaults to OFF.
option(GGML_AMX "ggml: use AMX" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING

View File

@ -2494,6 +2494,7 @@ extern "C" {
GGML_API int ggml_cpu_has_avx512_vbmi(void);
GGML_API int ggml_cpu_has_avx512_vnni(void);
GGML_API int ggml_cpu_has_avx512_bf16(void);
GGML_API int ggml_cpu_has_amx_int8 (void);
GGML_API int ggml_cpu_has_fma (void);
GGML_API int ggml_cpu_has_neon (void);
GGML_API int ggml_cpu_has_sve (void);

View File

@ -262,6 +262,14 @@ if (GGML_LLAMAFILE)
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
endif()
# AMX support: when GGML_AMX is enabled, record the AMX source/header pair for
# the ggml library, define GGML_USE_AMX for compilation, and report the choice.
if (GGML_AMX)
    set(GGML_SOURCES_AMX ggml-amx/mmq.cpp)
    set(GGML_HEADERS_AMX ggml-amx/mmq.h)

    add_compile_definitions(GGML_USE_AMX)

    message(STATUS "Using AMX")
endif()
if (GGML_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
@ -1155,6 +1163,18 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
# For every enabled GGML_AMX_<FEATURE> option, define the matching
# __AMX_<FEATURE>__ macro for both C and C++ sources (TILE, INT8, BF16 in
# that order).
foreach (amx_feature TILE INT8 BF16)
    if (GGML_AMX_${amx_feature})
        add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_${amx_feature}__>)
        add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_${amx_feature}__>)
    endif()
endforeach()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
@ -1189,6 +1209,15 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
# Append -mamx-<feature> to ARCH_FLAGS for every enabled GGML_AMX_<FEATURE>
# option, in the order tile, int8, bf16.
foreach (amx_feature TILE INT8 BF16)
    if (GGML_AMX_${amx_feature})
        string(TOLOWER "${amx_feature}" amx_flag_suffix)
        list(APPEND ARCH_FLAGS -mamx-${amx_flag_suffix})
    endif()
endforeach()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
@ -1314,6 +1343,7 @@ add_library(ggml
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
${GGML_SOURCES_AMX} ${GGML_HEADERS_AMX}
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
ggml-aarch64.c ggml-aarch64.h
)

2584
ggml/src/ggml-amx/mmq.cpp Normal file

File diff suppressed because it is too large Load Diff

17
ggml/src/ggml-amx/mmq.h Normal file
View File

@ -0,0 +1,17 @@
#pragma once

// C-linkage interface to the AMX mul_mat code (see mmq.cpp).
//
// The header is self-contained: it includes <stdbool.h> for `bool` in C
// translation units and forward-declares `struct ggml_tensor`, so it no
// longer relies on ggml.h having been included first.

#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Opaque here; the full definition comes from ggml.h.
struct ggml_tensor;

// NOTE(review): presumably performs AMX setup and reports whether it
// succeeded - confirm against mmq.cpp.
bool ggml_amx_init(void);

// Predicate queried before dispatching: callers invoke ggml_mul_mat_amx()
// for `dst` only when this returns true.
bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor * dst);

// Runs the AMX mul_mat for `dst`.
// NOTE(review): nth/ith look like the thread count and this thread's index,
// and wdata/wsize like the caller's work buffer and its size - confirm.
void ggml_mul_mat_amx(struct ggml_tensor * dst, int nth, int ith, void * wdata, int wsize);

#ifdef __cplusplus
}
#endif

View File

@ -44,10 +44,19 @@ int ggml_sve_cnt_b = 0;
#undef GGML_USE_LLAMAFILE
#endif
// Keep GGML_USE_AMX only when both __AMX_INT8__ and GGML_USE_OPENMP are
// defined; otherwise undefine it so the AMX include and the AMX dispatch
// guarded by GGML_USE_AMX are compiled out.
#if !defined(__AMX_INT8__) || !defined(GGML_USE_OPENMP)
#undef GGML_USE_AMX
#endif
#ifdef GGML_USE_LLAMAFILE
#include <llamafile/sgemm.h>
#endif
// Prototypes for the AMX mul_mat entry points.
#ifdef GGML_USE_AMX
#include <ggml-amx/mmq.h>
#endif
#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)
@ -12801,6 +12810,13 @@ static void ggml_compute_forward_mul_mat(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
#if GGML_USE_AMX
// AMX path: if the predicate returns true for dst, hand the whole mul_mat to
// ggml_mul_mat_amx() (with this thread's nth/ith and the params work buffer)
// and return, skipping the code paths that follow.
if (ggml_compute_forward_mul_mat_use_amx(dst)) {
ggml_mul_mat_amx(dst, nth, ith, params->wdata, params->wsize);
return;
}
#endif
#if GGML_USE_LLAMAFILE
// broadcast factors
const int64_t r2 = ne12 / ne02;
@ -23210,6 +23226,14 @@ int ggml_cpu_has_avx512_bf16(void) {
#endif
}
// Returns 1 when this file was compiled with __AMX_INT8__ defined, 0 otherwise.
// This is a compile-time answer, not a runtime CPU probe.
int ggml_cpu_has_amx_int8(void) {
#ifdef __AMX_INT8__
    const int built_with_amx_int8 = 1;
#else
    const int built_with_amx_int8 = 0;
#endif
    return built_with_amx_int8;
}
int ggml_cpu_has_fma(void) {
#if defined(__FMA__)
return 1;

View File

@ -20666,6 +20666,7 @@ const char * llama_print_system_info(void) {
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | ";
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";