From c8da7d0f705d7174b30ca0dd3ec9e1f853026252 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 12 Nov 2024 20:52:46 +0100 Subject: [PATCH] add hip --- .devops/nix/package.nix | 2 +- .github/workflows/build.yml | 8 +- Makefile | 2 +- cmake/llama-config.cmake.in | 2 +- docs/build.md | 6 +- ggml/CMakeLists.txt | 3 +- ggml/include/ggml-cuda.h | 2 +- ggml/src/CMakeLists.txt | 111 +------------------------ ggml/src/ggml-amx/CMakeLists.txt | 9 +- ggml/src/ggml-cuda/common.cuh | 50 ++++++------ ggml/src/ggml-cuda/fattn-common.cuh | 4 +- ggml/src/ggml-cuda/fattn-tile-f16.cu | 4 +- ggml/src/ggml-cuda/fattn-tile-f32.cu | 4 +- ggml/src/ggml-cuda/fattn-vec-f16.cuh | 4 +- ggml/src/ggml-cuda/fattn-vec-f32.cuh | 4 +- ggml/src/ggml-cuda/fattn-wmma-f16.cuh | 4 +- ggml/src/ggml-cuda/ggml-cuda.cu | 26 +++--- ggml/src/ggml-cuda/mmq.cuh | 22 ++--- ggml/src/ggml-cuda/mmvq.cu | 8 +- ggml/src/ggml-cuda/sum.cu | 4 +- ggml/src/ggml-hip/CMakeLists.txt | 113 ++++++++++++++++++++++++++ 21 files changed, 197 insertions(+), 195 deletions(-) create mode 100644 ggml/src/ggml-hip/CMakeLists.txt diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index 5d7d7ea5a..cd6146421 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -173,7 +173,7 @@ effectiveStdenv.mkDerivation (finalAttrs: { (cmakeBool "GGML_NATIVE" false) (cmakeBool "GGML_BLAS" useBlas) (cmakeBool "GGML_CUDA" useCuda) - (cmakeBool "GGML_HIPBLAS" useRocm) + (cmakeBool "GGML_HIP" useRocm) (cmakeBool "GGML_METAL" useMetalKit) (cmakeBool "GGML_VULKAN" useVulkan) (cmakeBool "GGML_STATIC" enableStatic) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6e6b7ed3a..d6a7b66a5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -405,13 +405,13 @@ jobs: - name: Build with native CMake HIP support id: cmake_build run: | - cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON + cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON cmake --build build --config Release -j $(nproc) - name: Build with legacy HIP support id: cmake_build_legacy_hip run: | - cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON + cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON cmake --build build2 --config Release -j $(nproc) ubuntu-22-cmake-sycl: @@ -1014,7 +1014,7 @@ jobs: run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON + cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} windows-latest-cmake-hip-release: @@ -1050,7 +1050,7 @@ jobs: run: | $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON + cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON cmake --build build -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" diff --git a/Makefile b/Makefile index dfa32d516..fc6ef46cd 100644 --- a/Makefile +++ b/Makefile @@ -819,7 +819,7 @@ ifdef GGML_HIPBLAS GGML_CUDA_MMV_Y ?= 1 GGML_CUDA_KQUANTS_ITER ?= 2 - MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA + MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA ifdef GGML_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA diff --git a/cmake/llama-config.cmake.in b/cmake/llama-config.cmake.in index f072b76a3..7aaa605e1 100644 --- a/cmake/llama-config.cmake.in +++ b/cmake/llama-config.cmake.in @@ -6,7 +6,7 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) set(GGML_BLAS @GGML_BLAS@) set(GGML_CUDA @GGML_CUDA@) set(GGML_METAL @GGML_METAL@) -set(GGML_HIPBLAS @GGML_HIPBLAS@) +set(GGML_HIP @GGML_HIPS@) set(GGML_ACCELERATE @GGML_ACCELERATE@) set(GGML_VULKAN @GGML_VULKAN@) set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@) diff --git a/docs/build.md b/docs/build.md index 4e362ebc7..95512415a 100644 --- a/docs/build.md +++ b/docs/build.md @@ -230,7 +230,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ - cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build --config Release -- -j 16 ``` On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`. @@ -247,7 +247,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm ```bash HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \ HIP_DEVICE_LIB_PATH= \ - cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ && cmake --build build -- -j 16 ``` @@ -259,7 +259,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): ```bash set PATH=%HIP_PATH%\bin;%PATH% - cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release + cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release cmake --build build ``` Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 81b7a02f5..384b9e680 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -116,6 +116,7 @@ endif() # ggml core set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism") +option(GGML_CPU "ggml: enable CPU backend" ON) # 3rd party libs / backends option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON) @@ -141,7 +142,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT}) -option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) +option(GGML_HIP "ggml: use HIP" OFF) option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF) option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) diff --git a/ggml/include/ggml-cuda.h b/ggml/include/ggml-cuda.h index 8a832aace..22ad2c009 100644 --- a/ggml/include/ggml-cuda.h +++ b/ggml/include/ggml-cuda.h @@ -7,7 +7,7 @@ extern "C" { #endif -#ifdef GGML_USE_HIPBLAS +#ifdef GGML_USE_HIP #define GGML_CUDA_NAME "ROCm" #define GGML_CUBLAS_NAME "hipBLAS" #elif defined(GGML_USE_MUSA) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 93e1631e3..de174eba9 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -29,115 +29,6 @@ endif() unset(GGML_EXTRA_LIBS_PRIVATE) unset(GGML_EXTRA_LIBS_PUBLIC) -if (GGML_HIPBLAS) - if (NOT EXISTS $ENV{ROCM_PATH}) - if (NOT EXISTS /opt/rocm) - set(ROCM_PATH /usr) - else() - set(ROCM_PATH /opt/rocm) - endif() - else() - set(ROCM_PATH $ENV{ROCM_PATH}) - endif() - - list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) - list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake") - - # CMake on Windows doesn't support the HIP language yet - if (WIN32) - set(CXX_IS_HIPCC TRUE) - else() - string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") - endif() - - if (CXX_IS_HIPCC) - if (LINUX) - if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") - message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") - endif() - - message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." - " Prefer setting the HIP compiler directly. See README for details.") - endif() - else() - # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. - if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) - endif() - cmake_minimum_required(VERSION 3.21) - enable_language(HIP) - endif() - - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - find_package(rocblas REQUIRED) - - message(STATUS "HIP and hipBLAS found") - - file(GLOB GGML_HEADERS_ROCM "ggml-cuda/*.cuh") - list(APPEND GGML_HEADERS_ROCM "../include/ggml-cuda.h") - - file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") - list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") - file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - - if (GGML_CUDA_FA_ALL_QUANTS) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) - else() - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu") - list(APPEND GGML_SOURCES_ROCM ${SRCS}) - endif() - - list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA) - - add_compile_definitions(GGML_USE_HIPBLAS) - add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) - add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) - add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) - - if (GGML_HIP_UMA) - add_compile_definitions(GGML_HIP_UMA) - endif() - - if (GGML_CUDA_FORCE_DMMV) - add_compile_definitions(GGML_CUDA_FORCE_DMMV) - endif() - - if (GGML_CUDA_FORCE_MMQ) - add_compile_definitions(GGML_CUDA_FORCE_MMQ) - endif() - - if (GGML_CUDA_FORCE_CUBLAS) - add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) - endif() - - if (GGML_CUDA_NO_PEER_COPY) - add_compile_definitions(GGML_CUDA_NO_PEER_COPY) - endif() - - if (CXX_IS_HIPCC) - set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) - list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device) - else() - set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) - endif() - - if (GGML_STATIC) - message(FATAL_ERROR "Static linking not supported for HIP/ROCm") - endif() - - list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas) -endif() - function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") @@ -354,12 +245,12 @@ function(ggml_add_backend backend) endif() endfunction() -set(GGML_CPU ON) ggml_add_backend(CPU) ggml_add_backend(AMX) ggml_add_backend(BLAS) ggml_add_backend(CANN) ggml_add_backend(CUDA) +ggml_add_backend(HIP) ggml_add_backend(Kompute) ggml_add_backend(METAL) ggml_add_backend(RPC) diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt index c995142b3..538220294 100644 --- a/ggml/src/ggml-amx/CMakeLists.txt +++ b/ggml/src/ggml-amx/CMakeLists.txt @@ -1,10 +1,4 @@ if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0) -else() - set(GGML_AMX OFF PARENT_SCOPE) - message(WARNING "AMX requires gcc version > 11.0. Turning off GGML_AMX.") -endif() - -if (GGML_AMX) message(STATUS "Using AMX") file(GLOB GGML_HEADERS_AMX "*.h") @@ -104,4 +98,7 @@ if (GGML_AMX) endif() target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS}) +else() + set(GGML_AMX OFF PARENT_SCOPE) + message(WARNING "AMX requires gcc version > 11.0. Turning off GGML_AMX.") endif() diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index dd203fcde..e146c691c 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -6,7 +6,7 @@ #include #include -#if defined(GGML_USE_HIPBLAS) +#if defined(GGML_USE_HIP) #define GGML_COMMON_DECL_HIP #define GGML_COMMON_IMPL_HIP #else @@ -26,13 +26,13 @@ #include #include -#if defined(GGML_USE_HIPBLAS) +#if defined(GGML_USE_HIP) #include "vendors/hip.h" #elif defined(GGML_USE_MUSA) #include "vendors/musa.h" #else #include "vendors/cuda.h" -#endif // defined(GGML_USE_HIPBLAS) +#endif // defined(GGML_USE_HIP) #define STRINGIZE_IMPL(...) #__VA_ARGS__ #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) @@ -97,7 +97,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in #define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str) -#if !defined(GGML_USE_HIPBLAS) +#if !defined(GGML_USE_HIP) static const char * cu_get_error_str(CUresult err) { const char * err_str; cuGetErrorString(err, &err_str); @@ -120,21 +120,21 @@ typedef float dfloat; // dequantize float typedef float2 dfloat2; #endif // GGML_CUDA_F16 -#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL +#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL #define FP16_AVAILABLE -#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL +#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL #if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 #define FAST_FP16_AVAILABLE #endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610 -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA #define FP16_MMA_AVAILABLE -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING #define INT8_MMA_AVAILABLE -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING #if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1) #define FLASH_ATTN_AVAILABLE @@ -156,14 +156,14 @@ static constexpr bool int8_mma_available(const int cc) { static __device__ void no_device_code( const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n", file_name, line, function_name, arch); GGML_UNUSED(arch_list); #else printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n", file_name, line, function_name, arch, arch_list); -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) __trap(); GGML_UNUSED(no_device_code); // suppress unused function warning @@ -176,7 +176,7 @@ static __device__ void no_device_code( #endif // __CUDA_ARCH__ static __device__ __forceinline__ int warp_reduce_sum(int x) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE return __reduce_add_sync(0xffffffff, x); #else #pragma unroll @@ -184,7 +184,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) { x += __shfl_xor_sync(0xffffffff, x, mask, 32); } return x; -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE } static __device__ __forceinline__ float warp_reduce_sum(float x) { @@ -207,7 +207,7 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { #ifdef FP16_AVAILABLE -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32); @@ -221,7 +221,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); } return a; -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #else NO_DEVICE_CODE; @@ -240,11 +240,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) { static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) { #ifdef FP16_AVAILABLE -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX return __float2half(fmaxf(__half2float(a), __half2float(b))); #else return __hmax(a, b); -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX #else NO_DEVICE_CODE; @@ -254,7 +254,7 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b } static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) #if CUDART_VERSION >= CUDART_HMAX return __hmax2(a, b); @@ -269,11 +269,11 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal GGML_UNUSED(a); GGML_UNUSED(b); NO_DEVICE_CODE; -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) } static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); @@ -282,7 +282,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { #else GGML_UNUSED(x); NO_DEVICE_CODE; -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL } #if CUDART_VERSION < CUDART_HMASK @@ -294,7 +294,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half #endif // CUDART_VERSION < CUDART_HMASK static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2) c = __builtin_amdgcn_sdot4(a, b, c, false); #elif defined(RDNA3) @@ -320,7 +320,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i #endif return c; -#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if __CUDA_ARCH__ >= MIN_CC_DP4A return __dp4a(a, b, c); @@ -330,7 +330,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3]; #endif // __CUDA_ARCH__ >= MIN_CC_DP4A -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } // TODO: move to ggml-common.h diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 1fb5c09c3..ee9752da6 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -517,9 +517,9 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) { } template // D == head size -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) __launch_bounds__(D, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) static __global__ void flash_attn_combine_results( const float * __restrict__ VKQ_parts, const float2 * __restrict__ VKQ_meta, diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index 5af02c7ec..4d314dacb 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -5,9 +5,9 @@ #define FATTN_KQ_STRIDE_TILE_F16 64 template // D == head size -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) __launch_bounds__(nwarps*WARP_SIZE, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) static __global__ void flash_attn_tile_ext_f16( const char * __restrict__ Q, const char * __restrict__ K, diff --git a/ggml/src/ggml-cuda/fattn-tile-f32.cu b/ggml/src/ggml-cuda/fattn-tile-f32.cu index f402195ce..bb3360447 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f32.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu @@ -5,9 +5,9 @@ #define FATTN_KQ_STRIDE_TILE_F32 32 template // D == head size -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) __launch_bounds__(nwarps*WARP_SIZE, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) static __global__ void flash_attn_tile_ext_f32( const char * __restrict__ Q, const char * __restrict__ K, diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh index 2ed6509ac..5ec3b91ae 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh @@ -2,9 +2,9 @@ #include "fattn-common.cuh" template // D == head size -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) __launch_bounds__(D, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) static __global__ void flash_attn_vec_ext_f16( const char * __restrict__ Q, const char * __restrict__ K, diff --git a/ggml/src/ggml-cuda/fattn-vec-f32.cuh b/ggml/src/ggml-cuda/fattn-vec-f32.cuh index bf5125902..3d93f4a8a 100644 --- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh +++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh @@ -2,9 +2,9 @@ #include "fattn-common.cuh" template // D == head size -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) __launch_bounds__(D, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) static __global__ void flash_attn_vec_ext_f32( const char * __restrict__ Q, const char * __restrict__ K, diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh index b10d19d93..860d0e6dc 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh @@ -7,9 +7,9 @@ // D == head size, VKQ_stride == num VKQ rows calculated in parallel: template -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) __launch_bounds__(nwarps*WARP_SIZE, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) static __global__ void flash_attn_ext_f16( const char * __restrict__ Q, const char * __restrict__ K, diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 357cee660..b54d660b8 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -91,7 +91,7 @@ int ggml_cuda_get_device() { static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) { ggml_cuda_set_device(device); -#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA) +#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA) auto res = hipMallocManaged(ptr, size); if (res == hipSuccess) { // if error we "need" to know why... @@ -100,7 +100,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) return res; #else -#if !defined(GGML_USE_HIPBLAS) +#if !defined(GGML_USE_HIP) cudaError_t err; if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) { @@ -113,7 +113,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) return err; #else return cudaMalloc(ptr, size); -#endif // !defined(GGML_USE_HIPBLAS) +#endif // !defined(GGML_USE_HIP) #endif } @@ -151,7 +151,7 @@ static ggml_cuda_device_info ggml_cuda_init() { for (int id = 0; id < info.device_count; ++id) { int device_vmm = 0; -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) CUdevice device; CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); @@ -163,7 +163,7 @@ static ggml_cuda_device_info ggml_cuda_init() { alloc_prop.location.id = id; CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); } -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) info.devices[id].vmm = !!device_vmm; cudaDeviceProp prop; @@ -175,13 +175,13 @@ static ggml_cuda_device_info ggml_cuda_init() { info.devices[id].nsm = prop.multiProcessorCount; info.devices[id].smpb = prop.sharedMemPerBlock; -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) info.devices[id].smpbo = prop.sharedMemPerBlock; info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; #else info.devices[id].smpbo = prop.sharedMemPerBlockOptin; info.devices[id].cc = 100*prop.major + 10*prop.minor; -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } for (int id = 0; id < info.device_count; ++id) { @@ -299,7 +299,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { }; // pool with virtual memory -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) struct ggml_cuda_pool_vmm : public ggml_cuda_pool { static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB @@ -393,14 +393,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { GGML_ASSERT(ptr == (void *) (pool_addr + pool_used)); } }; -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) std::unique_ptr ggml_backend_cuda_context::new_pool_for_device(int device) { -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) if (ggml_cuda_info().devices[device].vmm) { return std::unique_ptr(new ggml_cuda_pool_vmm(device)); } -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) +#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM) return std::unique_ptr(new ggml_cuda_pool_leg(device)); } @@ -1325,7 +1325,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { static cudaError_t ggml_cuda_Memcpy2DPeerAsync( void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) { -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices cudaMemcpy3DPeerParms p = {}; p.dstDevice = dstDevice; @@ -1339,7 +1339,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync( GGML_UNUSED(dstDevice); GGML_UNUSED(srcDevice); return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream); -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) +#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) } static void ggml_cuda_op_mul_mat( diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 021a25682..425acb20d 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -100,9 +100,9 @@ static constexpr __device__ int get_mmq_x_max_device() { return 128; #else // INT8_MMA_AVAILABLE -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) return 128; -#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if __CUDA_ARCH__ >= CC_VOLTA #ifdef GGML_CUDA_FORCE_MMQ @@ -115,7 +115,7 @@ static constexpr __device__ int get_mmq_x_max_device() { return 64; #endif // __CUDA_ARCH__ >= CC_VOLTA -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #endif // INT8_MMA_AVAILABLE } @@ -124,7 +124,7 @@ static constexpr int get_mmq_y_host(const int cc) { } static constexpr __device__ int get_mmq_y_device() { -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA1) return 64; #else @@ -136,7 +136,7 @@ static constexpr __device__ int get_mmq_y_device() { #else return 64; #endif // __CUDA_ARCH__ >= CC_VOLTA -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) } #define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0} @@ -2569,7 +2569,7 @@ static __device__ void mul_mat_q_process_tile( // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598 template -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) __launch_bounds__(WARP_SIZE*nwarps, 2) #endif // defined(RDNA3) || defined(RDNA2) @@ -2579,7 +2579,7 @@ template #else __launch_bounds__(WARP_SIZE*nwarps, 2) #endif // __CUDA_ARCH__ >= CC_VOLTA -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) static __global__ void mul_mat_q( const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup, const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { @@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q( constexpr int mmq_y = get_mmq_y_device(); // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead: -#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA +#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA { constexpr bool fixup = false; mul_mat_q_process_tile @@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q( blockIdx.x, blockIdx.y, 0, ne00/qk); return; } -#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA +#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA const int64_t blocks_per_ne00 = ne00 / qk; constexpr int blocks_per_iter = MMQ_ITER_K / qk; @@ -2765,14 +2765,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const int shmem = mmq_get_shmem(mmq_x, mmq_y, cc); -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; if (!shmem_limit_raised[id]) { CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); shmem_limit_raised[id] = true; } -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) const int nty = (args.ne01 + mmq_y - 1) / mmq_y; const int ntx = (args.ne11 + mmq_x - 1) / mmq_x; diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 7dbbc9939..735975c16 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -48,10 +48,10 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) { } template -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) // tell the compiler to use as many registers as it wants, see nwarps definition below __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) static __global__ void mul_mat_vec_q( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { @@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q( constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) +#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) constexpr int nwarps = 1; constexpr int rows_per_cuda_block = 1; #else constexpr int nwarps = ncols_y <= 4 ? 4 : 2; constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2; -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) +#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; const int row0 = rows_per_cuda_block*blockIdx.x; diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu index 0583e4fe0..31cfe5394 100644 --- a/ggml/src/ggml-cuda/sum.cu +++ b/ggml/src/ggml-cuda/sum.cu @@ -1,6 +1,6 @@ -#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 +#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 #define USE_CUB -#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 +#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 #ifdef USE_CUB // On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh. diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt new file mode 100644 index 000000000..5ed186ded --- /dev/null +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -0,0 +1,113 @@ +if (NOT EXISTS $ENV{ROCM_PATH}) + if (NOT EXISTS /opt/rocm) + set(ROCM_PATH /usr) + else() + set(ROCM_PATH /opt/rocm) + endif() +else() + set(ROCM_PATH $ENV{ROCM_PATH}) +endif() + +list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) +list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake") + +# CMake on Windows doesn't support the HIP language yet +if (WIN32) + set(CXX_IS_HIPCC TRUE) +else() + string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}") +endif() + +if (CXX_IS_HIPCC) + if (LINUX) + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + + message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." + " Prefer setting the HIP compiler directly. See README for details.") + endif() +else() + # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. + if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) + endif() + cmake_minimum_required(VERSION 3.21) + enable_language(HIP) +endif() + +find_package(hip REQUIRED) +find_package(hipblas REQUIRED) +find_package(rocblas REQUIRED) + +message(STATUS "HIP and hipBLAS found") + +file(GLOB GGML_HEADERS_ROCM "../ggml-cuda/*.cuh") +list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h") + +file(GLOB GGML_SOURCES_ROCM "../ggml-cuda/*.cu") +file(GLOB SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu") +list(APPEND GGML_SOURCES_ROCM ${SRCS}) +file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu") +list(APPEND GGML_SOURCES_ROCM ${SRCS}) + +if (GGML_CUDA_FA_ALL_QUANTS) + file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS) +else() + file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) +endif() + +add_library(ggml-hip + ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_ROCM}) + +target_link_libraries(ggml-hip PRIVATE ggml-base) +target_include_directories(ggml-hip PRIVATE . ..) + +# TODO: do not use CUDA definitions for HIP +target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) + +add_compile_definitions(GGML_USE_HIP) +add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) +add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) +add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) + +if (GGML_HIP_UMA) + add_compile_definitions(GGML_HIP_UMA) +endif() + +if (GGML_CUDA_FORCE_DMMV) + add_compile_definitions(GGML_CUDA_FORCE_DMMV) +endif() + +if (GGML_CUDA_FORCE_MMQ) + add_compile_definitions(GGML_CUDA_FORCE_MMQ) +endif() + +if (GGML_CUDA_FORCE_CUBLAS) + add_compile_definitions(GGML_CUDA_FORCE_CUBLAS) +endif() + +if (GGML_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) +endif() + +if (CXX_IS_HIPCC) + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) + target_link_libraries(ggml-hip PRIVATE hip::device) +else() + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) +endif() + +if (GGML_STATIC) + message(FATAL_ERROR "Static linking not supported for HIP/ROCm") +endif() + +target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)